Handle column and grid layouts in Amazon EU stores

This commit is contained in:
Charles Haley 2013-04-01 13:54:44 +02:00
parent de0c864159
commit 89a5375f57
5 changed files with 265 additions and 290 deletions

View File

@ -7,7 +7,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin):
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
MAX_SEARCH_ATTEMPTS = 5
SLEEP_BETWEEN_ATTEMPTS = 3
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
store_link = self.store_link % self.aff_id store_link = self.store_link % self.aff_id
@ -42,27 +39,25 @@ class AmazonEUBase(StorePlugin):
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results counter = max_results
loops = 0
while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS:
br = browser()
if loops > 0:
print ("Retry getbooks search", self.__class__.__name__, counter,
max_results, loops)
time.sleep(self.SLEEP_BETWEEN_ATTEMPTS)
loops += 1
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' # Results can be in a grid (table) or a column
format_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name' asin_xpath = '@name'
cover_xpath = './/img[@class="productImage"]/@src' cover_xpath = './/img[@class="productImage"]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()' title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' # Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
for data in doc.xpath(data_xpath): for data in doc.xpath(data_xpath):
if counter <= 0: if counter <= 0:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin):
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
MAX_SEARCH_ATTEMPTS = 5
SLEEP_BETWEEN_ATTEMPTS = 3
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
store_link = self.store_link % self.aff_id store_link = self.store_link % self.aff_id
@ -41,27 +38,25 @@ class AmazonEUBase(StorePlugin):
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results counter = max_results
loops = 0
while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS:
br = browser()
if loops > 0:
print ("Retry getbooks search", self.__class__.__name__, counter,
max_results, loops)
time.sleep(self.SLEEP_BETWEEN_ATTEMPTS)
loops += 1
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' # Results can be in a grid (table) or a column
format_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name' asin_xpath = '@name'
cover_xpath = './/img[@class="productImage"]/@src' cover_xpath = './/img[@class="productImage"]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()' title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' # Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
for data in doc.xpath(data_xpath): for data in doc.xpath(data_xpath):
if counter <= 0: if counter <= 0:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin):
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
MAX_SEARCH_ATTEMPTS = 5
SLEEP_BETWEEN_ATTEMPTS = 3
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
store_link = self.store_link % self.aff_id store_link = self.store_link % self.aff_id
@ -42,27 +39,25 @@ class AmazonEUBase(StorePlugin):
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results counter = max_results
loops = 0
while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS:
br = browser()
if loops > 0:
print ("Retry getbooks search", self.__class__.__name__, counter,
max_results, loops)
time.sleep(self.SLEEP_BETWEEN_ATTEMPTS)
loops += 1
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' # Results can be in a grid (table) or a column
format_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name' asin_xpath = '@name'
cover_xpath = './/img[@class="productImage"]/@src' cover_xpath = './/img[@class="productImage"]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()' title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' # Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
for data in doc.xpath(data_xpath): for data in doc.xpath(data_xpath):
if counter <= 0: if counter <= 0:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin):
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
MAX_SEARCH_ATTEMPTS = 5
SLEEP_BETWEEN_ATTEMPTS = 3
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
store_link = self.store_link % self.aff_id store_link = self.store_link % self.aff_id
@ -41,27 +38,25 @@ class AmazonEUBase(StorePlugin):
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results counter = max_results
loops = 0
while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS:
br = browser()
if loops > 0:
print ("Retry getbooks search", self.__class__.__name__, counter,
max_results, loops)
time.sleep(self.SLEEP_BETWEEN_ATTEMPTS)
loops += 1
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' # Results can be in a grid (table) or a column
format_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name' asin_xpath = '@name'
cover_xpath = './/img[@class="productImage"]/@src' cover_xpath = './/img[@class="productImage"]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()' title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' # Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
for data in doc.xpath(data_xpath): for data in doc.xpath(data_xpath):
if counter <= 0: if counter <= 0:

View File

@ -7,7 +7,7 @@ __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re, time import re
from contextlib import closing from contextlib import closing
from lxml import html from lxml import html
@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin):
For comments on the implementation, please see amazon_plugin.py For comments on the implementation, please see amazon_plugin.py
''' '''
MAX_SEARCH_ATTEMPTS = 5
SLEEP_BETWEEN_ATTEMPTS = 3
def open(self, parent=None, detail_item=None, external=False): def open(self, parent=None, detail_item=None, external=False):
store_link = self.store_link % self.aff_id store_link = self.store_link % self.aff_id
@ -41,27 +38,25 @@ class AmazonEUBase(StorePlugin):
def search(self, query, max_results=10, timeout=60): def search(self, query, max_results=10, timeout=60):
url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
br = browser()
counter = max_results counter = max_results
loops = 0
while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS:
br = browser()
if loops > 0:
print ("Retry getbooks search", self.__class__.__name__, counter,
max_results, loops)
time.sleep(self.SLEEP_BETWEEN_ATTEMPTS)
loops += 1
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))
data_xpath = '//div[contains(@class, "prod")]' data_xpath = '//div[contains(@class, "prod")]'
format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' # Results can be in a grid (table) or a column
format_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
asin_xpath = '@name' asin_xpath = '@name'
cover_xpath = './/img[@class="productImage"]/@src' cover_xpath = './/img[@class="productImage"]/@src'
title_xpath = './/h3[@class="newaps"]/a//text()' title_xpath = './/h3[@class="newaps"]/a//text()'
author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' # Results can be in a grid (table) or a column
price_xpath = (
'.//ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
'//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
for data in doc.xpath(data_xpath): for data in doc.xpath(data_xpath):
if counter <= 0: if counter <= 0: