From 71a18a47aac8920bb6609e1d77d56bba9150dc88 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Mar 2013 22:25:46 +0530 Subject: [PATCH 1/7] PDF Output: Fix 1 pixel wide left and top margins on the cover page for some PDF conversions due to incorrect rounding. Fixes #1162054 (ePub to PDF conversion regression) --- src/calibre/ebooks/pdf/render/engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/render/engine.py b/src/calibre/ebooks/pdf/render/engine.py index 6e6347cab2..76fa0f5c41 100644 --- a/src/calibre/ebooks/pdf/render/engine.py +++ b/src/calibre/ebooks/pdf/render/engine.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, traceback +import sys, traceback, math from collections import namedtuple from functools import wraps, partial from future_builtins import map @@ -356,9 +356,9 @@ class PdfDevice(QPaintDevice): # {{{ @property def full_page_rect(self): page_width = self.page_width * self.xdpi / 72.0 - lm = self.left_margin * self.xdpi / 72.0 + lm = int(math.ceil(self.left_margin * self.xdpi / 72.0)) page_height = self.page_height * self.ydpi / 72.0 - tm = self.top_margin * self.ydpi / 72.0 + tm = int(math.ceil(self.top_margin * self.ydpi / 72.0)) return (-lm, -tm, page_width, page_height) @property From c10d539a68ba1cd2b81be5d205aad007a7856c8c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 31 Mar 2013 22:29:23 +0530 Subject: [PATCH 2/7] ... --- src/calibre/ebooks/pdf/render/engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/render/engine.py b/src/calibre/ebooks/pdf/render/engine.py index 76fa0f5c41..4cdebdce8c 100644 --- a/src/calibre/ebooks/pdf/render/engine.py +++ b/src/calibre/ebooks/pdf/render/engine.py @@ -355,11 +355,11 @@ class PdfDevice(QPaintDevice): # {{{ @property def full_page_rect(self): - page_width = self.page_width * self.xdpi / 72.0 + page_width = int(math.ceil(self.page_width * self.xdpi / 72.0)) lm = int(math.ceil(self.left_margin * self.xdpi / 72.0)) - page_height = self.page_height * self.ydpi / 72.0 + page_height = int(math.ceil(self.page_height * self.ydpi / 72.0)) tm = int(math.ceil(self.top_margin * self.ydpi / 72.0)) - return (-lm, -tm, page_width, page_height) + return (-lm, -tm, page_width+1, page_height+1) @property def current_page_num(self): From 069761e03e4f0e1bb963bfdde5aff029c0111f3b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 1 Apr 2013 09:55:07 +0530 Subject: [PATCH 3/7] PDF Output: Fix generating page numbers causing links to not work. Fixes #1162573 (Hyperlinks disappear on mobi/epub to pdf conversion) --- src/calibre/ebooks/pdf/render/from_html.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/pdf/render/from_html.py b/src/calibre/ebooks/pdf/render/from_html.py index 2f08e843b0..525bed16a3 100644 --- a/src/calibre/ebooks/pdf/render/from_html.py +++ b/src/calibre/ebooks/pdf/render/from_html.py @@ -302,6 +302,10 @@ class PDFWriter(QObject): py_bridge.value = book_indexing.all_links_and_anchors(); '''%(self.margin_top, 0, self.margin_bottom)) + amap = self.bridge_value + if not isinstance(amap, dict): + amap = {'links':[], 'anchors':{}} # Some javascript error occurred + if self.header: self.bridge_value = self.header evaljs('paged_display.header_template = py_bridge.value') @@ -311,9 +315,6 @@ class PDFWriter(QObject): if self.header or self.footer: evaljs('paged_display.create_header_footer();') - amap = self.bridge_value - if not isinstance(amap, dict): - amap = {'links':[], 'anchors':{}} # Some javascript error occurred start_page = self.current_page_num mf = self.view.page().mainFrame() From 7169ffed0a02c3f407a5b25f341cf2017118af2c Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 1 Apr 2013 10:44:35 +0200 Subject: [PATCH 4/7] Make amazon EU store plugins more robust against amazon reporting zero books found. --- .../gui2/store/stores/amazon_de_plugin.py | 104 ++++++++++-------- .../gui2/store/stores/amazon_es_plugin.py | 104 ++++++++++-------- .../gui2/store/stores/amazon_fr_plugin.py | 104 ++++++++++-------- .../gui2/store/stores/amazon_it_plugin.py | 104 ++++++++++-------- .../gui2/store/stores/amazon_uk_plugin.py | 104 ++++++++++-------- 5 files changed, 290 insertions(+), 230 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 06bc571494..4af5e8186d 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, time from contextlib import closing from lxml import html @@ -29,6 +29,9 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' + MAX_SEARCH_ATTEMPTS = 5 + SLEEP_BETWEEN_ATTEMPTS = 3 + def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,62 +45,71 @@ class AmazonEUBase(StorePlugin): br = browser() counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + loops = 0 + while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: + br = browser() + if loops > 0: + print ("Retry getbooks search", self.__class__.__name__, counter, + max_results, loops) + time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) + loops += 1 - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - for data in doc.xpath(data_xpath): - if counter <= 0: - break + data_xpath = '//div[contains(@class, "prod")]' + format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + for data in doc.xpath(data_xpath): + if counter <= 0: + break - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - cover_url = ''.join(data.xpath(cover_xpath)) + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - title = ''.join(data.xpath(title_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + title = ''.join(data.xpath(title_xpath)) - price = ''.join(data.xpath(price_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - counter -= 1 + price = ''.join(data.xpath(price_xpath)) - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' + counter -= 1 - yield s + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' + + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index 0254b953c4..d654c0ea8f 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, time from contextlib import closing from lxml import html @@ -28,6 +28,9 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' + MAX_SEARCH_ATTEMPTS = 5 + SLEEP_BETWEEN_ATTEMPTS = 3 + def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,62 +44,71 @@ class AmazonEUBase(StorePlugin): br = browser() counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + loops = 0 + while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: + br = browser() + if loops > 0: + print ("Retry getbooks search", self.__class__.__name__, counter, + max_results, loops) + time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) + loops += 1 - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - for data in doc.xpath(data_xpath): - if counter <= 0: - break + data_xpath = '//div[contains(@class, "prod")]' + format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + for data in doc.xpath(data_xpath): + if counter <= 0: + break - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - cover_url = ''.join(data.xpath(cover_xpath)) + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - title = ''.join(data.xpath(title_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + title = ''.join(data.xpath(title_xpath)) - price = ''.join(data.xpath(price_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - counter -= 1 + price = ''.join(data.xpath(price_xpath)) - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' + counter -= 1 - yield s + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' + + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index 30f6b6f51e..ee4a3ba30f 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, time from contextlib import closing from lxml import html @@ -29,6 +29,9 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' + MAX_SEARCH_ATTEMPTS = 5 + SLEEP_BETWEEN_ATTEMPTS = 3 + def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,62 +45,71 @@ class AmazonEUBase(StorePlugin): br = browser() counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + loops = 0 + while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: + br = browser() + if loops > 0: + print ("Retry getbooks search", self.__class__.__name__, counter, + max_results, loops) + time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) + loops += 1 - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - for data in doc.xpath(data_xpath): - if counter <= 0: - break + data_xpath = '//div[contains(@class, "prod")]' + format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + for data in doc.xpath(data_xpath): + if counter <= 0: + break - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - cover_url = ''.join(data.xpath(cover_xpath)) + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - title = ''.join(data.xpath(title_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + title = ''.join(data.xpath(title_xpath)) - price = ''.join(data.xpath(price_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - counter -= 1 + price = ''.join(data.xpath(price_xpath)) - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' + counter -= 1 - yield s + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' + + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index 53028cf192..6b697e7b77 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, time from contextlib import closing from lxml import html @@ -28,6 +28,9 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' + MAX_SEARCH_ATTEMPTS = 5 + SLEEP_BETWEEN_ATTEMPTS = 3 + def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,62 +44,71 @@ class AmazonEUBase(StorePlugin): br = browser() counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + loops = 0 + while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: + br = browser() + if loops > 0: + print ("Retry getbooks search", self.__class__.__name__, counter, + max_results, loops) + time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) + loops += 1 - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - for data in doc.xpath(data_xpath): - if counter <= 0: - break + data_xpath = '//div[contains(@class, "prod")]' + format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + for data in doc.xpath(data_xpath): + if counter <= 0: + break - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - cover_url = ''.join(data.xpath(cover_xpath)) + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - title = ''.join(data.xpath(title_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + title = ''.join(data.xpath(title_xpath)) - price = ''.join(data.xpath(price_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - counter -= 1 + price = ''.join(data.xpath(price_xpath)) - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' + counter -= 1 - yield s + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' + + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index b5951a533f..72baa8a44c 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re +import re, time from contextlib import closing from lxml import html @@ -28,6 +28,9 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' + MAX_SEARCH_ATTEMPTS = 5 + SLEEP_BETWEEN_ATTEMPTS = 3 + def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,62 +44,71 @@ class AmazonEUBase(StorePlugin): br = browser() counter = max_results - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + loops = 0 + while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: + br = browser() + if loops > 0: + print ("Retry getbooks search", self.__class__.__name__, counter, + max_results, loops) + time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) + loops += 1 - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - for data in doc.xpath(data_xpath): - if counter <= 0: - break + data_xpath = '//div[contains(@class, "prod")]' + format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + for data in doc.xpath(data_xpath): + if counter <= 0: + break - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - cover_url = ''.join(data.xpath(cover_xpath)) + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - title = ''.join(data.xpath(title_xpath)) + cover_url = ''.join(data.xpath(cover_xpath)) - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + title = ''.join(data.xpath(title_xpath)) - price = ''.join(data.xpath(price_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - counter -= 1 + price = ''.join(data.xpath(price_xpath)) - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' + counter -= 1 - yield s + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' + + yield s def get_details(self, search_result, timeout): pass From 11f2d0f61a3e19ff8dfbd25e999ca4dde2eecff1 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 1 Apr 2013 11:54:28 +0200 Subject: [PATCH 5/7] Remove redundant browser instance creation. --- src/calibre/gui2/store/stores/amazon_de_plugin.py | 1 - src/calibre/gui2/store/stores/amazon_es_plugin.py | 1 - src/calibre/gui2/store/stores/amazon_fr_plugin.py | 1 - src/calibre/gui2/store/stores/amazon_it_plugin.py | 1 - src/calibre/gui2/store/stores/amazon_uk_plugin.py | 1 - 5 files changed, 5 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 4af5e8186d..4352c17056 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -42,7 +42,6 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') - br = browser() counter = max_results loops = 0 diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index d654c0ea8f..553b70deae 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -41,7 +41,6 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') - br = browser() counter = max_results loops = 0 diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index ee4a3ba30f..cae8fa3ba1 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -42,7 +42,6 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') - br = browser() counter = max_results loops = 0 diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index 6b697e7b77..eb60b29770 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -41,7 +41,6 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') - br = browser() counter = max_results loops = 0 diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 72baa8a44c..30cbb223fc 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -41,7 +41,6 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') - br = browser() counter = max_results loops = 0 From de0c86415943a86eae74bafb0130427ef2cfb5cc Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 1 Apr 2013 12:41:52 +0200 Subject: [PATCH 6/7] Forgot to update the version number. --- src/calibre/gui2/store/stores/amazon_de_plugin.py | 2 +- src/calibre/gui2/store/stores/amazon_es_plugin.py | 2 +- src/calibre/gui2/store/stores/amazon_fr_plugin.py | 2 +- src/calibre/gui2/store/stores/amazon_it_plugin.py | 2 +- src/calibre/gui2/store/stores/amazon_uk_plugin.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 4352c17056..5f5b103d69 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index 553b70deae..fdf9ef7502 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index cae8fa3ba1..52e943a535 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index eb60b29770..f8617a7e61 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 30cbb223fc..2dba752a67 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import (unicode_literals, division, absolute_import, print_function) -store_version = 1 # Needed for dynamic plugin loading +store_version = 2 # Needed for dynamic plugin loading __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' From 89a5375f57a0b34792ea927565ef65d87b33fde7 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 1 Apr 2013 13:54:44 +0200 Subject: [PATCH 7/7] Handle column and grid layouts in amazon EU stores --- .../gui2/store/stores/amazon_de_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_es_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_fr_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_it_plugin.py | 111 +++++++++--------- .../gui2/store/stores/amazon_uk_plugin.py | 111 +++++++++--------- 5 files changed, 265 insertions(+), 290 deletions(-) diff --git a/src/calibre/gui2/store/stores/amazon_de_plugin.py b/src/calibre/gui2/store/stores/amazon_de_plugin.py index 5f5b103d69..7b4027794a 100644 --- a/src/calibre/gui2/store/stores/amazon_de_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_de_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,73 +39,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_es_plugin.py b/src/calibre/gui2/store/stores/amazon_es_plugin.py index fdf9ef7502..68387ffe11 100644 --- a/src/calibre/gui2/store/stores/amazon_es_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_es_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_fr_plugin.py b/src/calibre/gui2/store/stores/amazon_fr_plugin.py index 52e943a535..9b425a2fc9 100644 --- a/src/calibre/gui2/store/stores/amazon_fr_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_fr_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -29,9 +29,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -42,73 +39,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_it_plugin.py b/src/calibre/gui2/store/stores/amazon_it_plugin.py index f8617a7e61..2493f78ea3 100644 --- a/src/calibre/gui2/store/stores/amazon_it_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_it_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass diff --git a/src/calibre/gui2/store/stores/amazon_uk_plugin.py b/src/calibre/gui2/store/stores/amazon_uk_plugin.py index 2dba752a67..054072824b 100644 --- a/src/calibre/gui2/store/stores/amazon_uk_plugin.py +++ b/src/calibre/gui2/store/stores/amazon_uk_plugin.py @@ -7,7 +7,7 @@ __license__ = 'GPL 3' __copyright__ = '2011, John Schember ' __docformat__ = 'restructuredtext en' -import re, time +import re from contextlib import closing from lxml import html @@ -28,9 +28,6 @@ class AmazonEUBase(StorePlugin): For comments on the implementation, please see amazon_plugin.py ''' - MAX_SEARCH_ATTEMPTS = 5 - SLEEP_BETWEEN_ATTEMPTS = 3 - def open(self, parent=None, detail_item=None, external=False): store_link = self.store_link % self.aff_id @@ -41,73 +38,71 @@ class AmazonEUBase(StorePlugin): def search(self, query, max_results=10, timeout=60): url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+') + br = browser() counter = max_results - loops = 0 - while counter == max_results and loops < self.MAX_SEARCH_ATTEMPTS: - br = browser() - if loops > 0: - print ("Retry getbooks search", self.__class__.__name__, counter, - max_results, loops) - time.sleep(self.SLEEP_BETWEEN_ATTEMPTS) - loops += 1 + with closing(br.open(url, timeout=timeout)) as f: + doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) - with closing(br.open(url, timeout=timeout)) as f: - doc = html.fromstring(f.read())#.decode('latin-1', 'replace')) + data_xpath = '//div[contains(@class, "prod")]' + # Results can be in a grid (table) or a column + format_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()') + asin_xpath = '@name' + cover_xpath = './/img[@class="productImage"]/@src' + title_xpath = './/h3[@class="newaps"]/a//text()' + author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' + # Results can be in a grid (table) or a column + price_xpath = ( + './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]' + '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()') - data_xpath = '//div[contains(@class, "prod")]' - format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()' - asin_xpath = '@name' - cover_xpath = './/img[@class="productImage"]/@src' - title_xpath = './/h3[@class="newaps"]/a//text()' - author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()' - price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()' + for data in doc.xpath(data_xpath): + if counter <= 0: + break - for data in doc.xpath(data_xpath): - if counter <= 0: - break + # Even though we are searching digital-text only Amazon will still + # put in results for non Kindle books (authors pages). Se we need + # to explicitly check if the item is a Kindle book and ignore it + # if it isn't. + format_ = ''.join(data.xpath(format_xpath)) + if 'kindle' not in format_.lower(): + continue - # Even though we are searching digital-text only Amazon will still - # put in results for non Kindle books (authors pages). Se we need - # to explicitly check if the item is a Kindle book and ignore it - # if it isn't. - format_ = ''.join(data.xpath(format_xpath)) - if 'kindle' not in format_.lower(): - continue + # We must have an asin otherwise we can't easily reference the + # book later. + asin = data.xpath(asin_xpath) + if asin: + asin = asin[0] + else: + continue - # We must have an asin otherwise we can't easily reference the - # book later. - asin = data.xpath(asin_xpath) - if asin: - asin = asin[0] - else: - continue + cover_url = ''.join(data.xpath(cover_xpath)) - cover_url = ''.join(data.xpath(cover_xpath)) + title = ''.join(data.xpath(title_xpath)) - title = ''.join(data.xpath(title_xpath)) + authors = ''.join(data.xpath(author_xpath)) + authors = re.sub('^' + self.author_article, '', authors) + authors = re.sub(self.and_word, ' & ', authors) + mo = re.match(r'(.*)(\(\d.*)$', authors) + if mo: + authors = mo.group(1).strip() - authors = ''.join(data.xpath(author_xpath)) - authors = re.sub('^' + self.author_article, '', authors) - authors = re.sub(self.and_word, ' & ', authors) - mo = re.match(r'(.*)(\(\d.*)$', authors) - if mo: - authors = mo.group(1).strip() + price = ''.join(data.xpath(price_xpath)) - price = ''.join(data.xpath(price_xpath)) + counter -= 1 - counter -= 1 + s = SearchResult() + s.cover_url = cover_url.strip() + s.title = title.strip() + s.author = authors.strip() + s.price = price.strip() + s.detail_item = asin.strip() + s.drm = SearchResult.DRM_UNKNOWN + s.formats = 'Kindle' - s = SearchResult() - s.cover_url = cover_url.strip() - s.title = title.strip() - s.author = authors.strip() - s.price = price.strip() - s.detail_item = asin.strip() - s.drm = SearchResult.DRM_UNKNOWN - s.formats = 'Kindle' - - yield s + yield s def get_details(self, search_result, timeout): pass