Get Books: Update the Amazon (US) plugin for website changes. Fixes #1406040 [Get Books does not return results from Amazon US](https://bugs.launchpad.net/calibre/+bug/1406040)

2025-08-30 23:00:21 -04:00 · 2014-12-28 10:32:34 +05:30 · 2014-12-28 10:32:34 +05:30 · 00f0b2373a
commit 00f0b2373a
parent 1b7bb1f472
1 changed files with 127 additions and 92 deletions
--- a/src/calibre/gui2/store/stores/amazon_plugin.py
+++ b/src/calibre/gui2/store/stores/amazon_plugin.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 from __future__ import (unicode_literals, division, absolute_import, print_function)
-store_version = 5 # Needed for dynamic plugin loading
+store_version = 6  # Needed for dynamic plugin loading

 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -19,9 +19,123 @@ from calibre.gui2 import open_url
 from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult

+def CSSSelect(expr):
+    ''' Return the XPath expression equivalent to the CSS selector expr. '''
+    # Function-scope import: cssselect is only loaded once a selector is
+    # actually translated.
+    from cssselect import HTMLTranslator
+    translator = HTMLTranslator()
+    return translator.css_to_xpath(expr)
+
+def search_amazon(query, max_results=10, timeout=60,
+                  write_html_to=None,
+                  search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='):
+    '''
+    Search Amazon for Kindle books, yielding one SearchResult per book found
+    on the (single) results page that is downloaded.
+
+    :param query: The search terms.
+    :param max_results: Maximum number of results to yield.
+    :param timeout: Timeout, passed through to the browser's open() call.
+    :param write_html_to: If not None, path of a file to which the raw
+        downloaded page is written before it is parsed. Useful for debugging
+        the XPath expressions below when the website changes.
+    :param search_url: URL prefix to which the encoded query is appended.
+
+    Nothing is yielded when the page has no ``atfResults`` container or when
+    the container's class matches none of the known page layouts.
+    '''
+    # Build a percent-encoded query: non-ASCII characters become \xNN escapes,
+    # literal % signs are escaped as %25, then the \x prefixes are turned
+    # into % and spaces into +.
+    # NOTE(review): backslashreplace emits \uNNNN for code points above
+    # U+00FF and those escapes are sent unconverted -- confirm whether such
+    # queries need to be supported.
+    url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
+    br = browser()
+
+    # Read the whole page and release the connection before parsing, so that
+    # it is not held open for as long as the caller keeps this generator alive.
+    with closing(br.open(url, timeout=timeout)) as f:
+        raw = f.read()
+    if write_html_to is not None:
+        with open(write_html_to, 'wb') as dump:
+            dump.write(raw)
+
+    doc = html.fromstring(raw)
+    try:
+        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
+    except IndexError:
+        return
+    results_class = results.get('class', '')
+
+    # Several different result page layouts are handled. The layout is
+    # identified by the class of the results container and determines the
+    # XPath expressions used to extract each field. The first test must stay
+    # first: its class name also contains the substring "list".
+    if 's-result-list-parent-container' in results_class:
+        data_xpath = CSSSelect('li.s-result-item')
+        format_xpath = './/a[@title="Kindle Edition"]//h3/text()'
+        asin_xpath = '@data-asin'
+        cover_xpath = CSSSelect('img.s-access-image') + '/@src'
+        title_xpath = CSSSelect('h2.s-access-title') + '//text()'
+        author_xpath = ('.//a[not(@title) and starts-with(@href, "/") and'
+                        ' contains(@class, "a-link-normal") and'
+                        ' contains(@href, "ref=sr_ntt_srch_lnk")]//text()')
+        # NOTE(review): " s-price " only matches when the class name has a
+        # space on both sides inside the attribute value -- confirm that is
+        # always the case in the served markup.
+        price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
+    elif 'grid' in results_class:
+        data_xpath = '//div[contains(@class, "prod")]'
+        format_xpath = (
+            './/ul[contains(@class, "rsltGridList")]'
+            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+        asin_xpath = '@name'
+        cover_xpath = './/img[contains(@class, "productImage")]/@src'
+        title_xpath = './/h3[@class="newaps"]/a//text()'
+        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+        price_xpath = (
+            './/ul[contains(@class, "rsltGridList")]'
+            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+    elif 'ilresults' in results_class:
+        data_xpath = '//li[(@class="ilo")]'
+        format_xpath = (
+            './/ul[contains(@class, "rsltGridList")]'
+            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+        asin_xpath = '@name'
+        cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
+        title_xpath = './/h3[@class="newaps"]/a//text()'
+        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+        # Results can be in a grid (table) or a column
+        price_xpath = (
+            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
+            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+    elif 'list' in results_class:
+        data_xpath = '//div[contains(@class, "prod")]'
+        format_xpath = (
+            './/ul[contains(@class, "rsltL")]'
+            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
+        asin_xpath = '@name'
+        cover_xpath = './/img[contains(@class, "productImage")]/@src'
+        title_xpath = './/h3[@class="newaps"]/a//text()'
+        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
+        price_xpath = (
+            './/ul[contains(@class, "rsltL")]'
+            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
+    else:
+        return
+
+    counter = max_results
+    for data in doc.xpath(data_xpath):
+        if counter <= 0:
+            break
+
+        # Even though we are searching digital-text only Amazon will still
+        # put in results for non Kindle books (author pages). So we need
+        # to explicitly check if the item is a Kindle book and ignore it
+        # if it isn't.
+        fmt = ''.join(data.xpath(format_xpath))
+        if 'kindle' not in fmt.lower():
+            continue
+
+        # We must have an asin otherwise we can't easily reference the
+        # book later.
+        asin = data.xpath(asin_xpath)
+        if not asin:
+            continue
+        asin = asin[0]
+
+        cover_url = ''.join(data.xpath(cover_xpath))
+
+        title = ''.join(data.xpath(title_xpath))
+        author = ''.join(data.xpath(author_xpath))
+        try:
+            # Keep only what follows "by " and precedes any " (...)" suffix
+            author = author.split('by ', 1)[1].split(" (")[0]
+        except IndexError:
+            # No "by " prefix in the text: use it unchanged
+            pass
+
+        price = ''.join(data.xpath(price_xpath))
+
+        counter -= 1
+
+        s = SearchResult()
+        s.cover_url = cover_url.strip()
+        s.title = title.strip()
+        s.author = author.strip()
+        s.price = price.strip()
+        s.detail_item = asin.strip()
+        s.formats = 'Kindle'
+
+        yield s
+
 class AmazonKindleStore(StorePlugin):

-    search_url = 'http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='
    details_url = 'http://amazon.com/dp/'
    drm_search_text = u'Simultaneous Device Usage'
    drm_free_text = u'Unlimited'
@ -114,102 +228,15 @@ class AmazonKindleStore(StorePlugin):
        # Use Kovid's affiliate id 30% of the time.
        if random.randint(1, 10) in (1, 2, 3):
            aff_id['tag'] = 'calibrebs-20'
-        store_link = 'http://www.amazon.com/Kindle-eBooks/b/?ie=UTF&node=1286228011&ref_=%(tag)s&ref=%(tag)s&tag=%(tag)s&linkCode=ur2&camp=1789&creative=390957' % aff_id
+        store_link = 'http://www.amazon.com/Kindle-eBooks/b/?ie=UTF&node=1286228011&ref_=%(tag)s&ref=%(tag)s&tag=%(tag)s&linkCode=ur2&camp=1789&creative=390957' % aff_id  # noqa
        if detail_item:
            aff_id['asin'] = detail_item
            store_link = 'http://www.amazon.com/dp/%(asin)s/?tag=%(tag)s' % aff_id
        open_url(QUrl(store_link))

    def search(self, query, max_results=10, timeout=60):
+        '''
+        Search the store by delegating to search_amazon() with its default
+        search URL, yielding each result it produces.
+        '''
-        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
-        br = browser()
-
-        counter = max_results
-        with closing(br.open(url, timeout=timeout)) as f:
-            doc = html.fromstring(f.read())
-
-            if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
-                data_xpath = '//div[contains(@class, "prod")]'
-                format_xpath = (
-                        './/ul[contains(@class, "rsltGridList")]'
-                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
-                asin_xpath = '@name'
-                cover_xpath = './/img[contains(@class, "productImage")]/@src'
-                title_xpath = './/h3[@class="newaps"]/a//text()'
-                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
-                price_xpath = (
-                        './/ul[contains(@class, "rsltGridList")]'
-                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
-            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
-                data_xpath = '//li[(@class="ilo")]'
-                format_xpath = (
-                        './/ul[contains(@class, "rsltGridList")]'
-                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
-                asin_xpath = '@name'
-                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
-                title_xpath = './/h3[@class="newaps"]/a//text()'
-                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
-                # Results can be in a grid (table) or a column
-                price_xpath = (
-                        './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
-                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
-            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
-                data_xpath = '//div[contains(@class, "prod")]'
-                format_xpath = (
-                        './/ul[contains(@class, "rsltL")]'
-                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
-                asin_xpath = '@name'
-                cover_xpath = './/img[contains(@class, "productImage")]/@src'
-                title_xpath = './/h3[@class="newaps"]/a//text()'
-                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
-                price_xpath = (
-                        './/ul[contains(@class, "rsltL")]'
-                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
-            else:
-                return
-
-            for data in doc.xpath(data_xpath):
-                if counter <= 0:
-                    break
-
-                # Even though we are searching digital-text only Amazon will still
-                # put in results for non Kindle books (author pages). Se we need
-                # to explicitly check if the item is a Kindle book and ignore it
-                # if it isn't.
-                format = ''.join(data.xpath(format_xpath))
-                if 'kindle' not in format.lower():
-                    continue
-
-                # We must have an asin otherwise we can't easily reference the
-                # book later.
-                asin = data.xpath(asin_xpath)
-                if asin:
-                    asin = asin[0]
-                else:
-                    continue
-
-                cover_url = ''.join(data.xpath(cover_xpath))
-
-                title = ''.join(data.xpath(title_xpath))
-                author = ''.join(data.xpath(author_xpath))
-                try:
-                    author = author.split('by ', 1)[1].split(" (")[0]
-                except:
-                    pass
-
-                price = ''.join(data.xpath(price_xpath))
-
-                counter -= 1
-
-                s = SearchResult()
-                s.cover_url = cover_url.strip()
-                s.title = title.strip()
-                s.author = author.strip()
-                s.price = price.strip()
-                s.detail_item = asin.strip()
-                s.formats = 'Kindle'
-
-                yield s
+        for result in search_amazon(query, max_results=max_results, timeout=timeout):
+            yield result

    def get_details(self, search_result, timeout):
        url = self.details_url
@ -228,3 +255,11 @@ class AmazonKindleStore(StorePlugin):
            else:
                search_result.drm = SearchResult.DRM_LOCKED
        return True
+
+if __name__ == '__main__':
+    # Developer smoke test: run a fixed search, print every result and keep
+    # the downloaded page in the temporary directory for inspection.
+    import os
+    import tempfile
+    dump_path = os.path.join(tempfile.gettempdir(), 'raw.html')
+    for book in search_amazon('heroes abercrombie', write_html_to=dump_path):
+        print(book)
+    print('HTML written to: %s' % dump_path)