From 89a5375f57a0b34792ea927565ef65d87b33fde7 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 1 Apr 2013 13:54:44 +0200 Subject: [PATCH] Handle column and grid layouts in amazon EU stores --- .../gui2/store/stores/amazon_de_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_es_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_fr_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_it_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_uk_plugin.py | 111 +++++++++--------- 5 files changed, 265 insertions(+), 290 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 5f5b103d69..7b4027794a 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,73 +39,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index fdf9ef7502..68387ffe11 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index 52e943a535..9b425a2fc9 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,73 +39,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index f8617a7e61..2493f78ea3 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 2dba752a67..054072824b 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass