Get Books: Update the Amazon (US) plugin for website changes. Fixes #1406040 [Get Books does not return results from Amazon US](https://bugs.launchpad.net/calibre/+bug/1406040)

2025-08-30 23:00:21 -04:00 · 2014-12-28 10:32:34 +05:30 · 2014-12-28 10:32:34 +05:30 · 00f0b2373a
commit 00f0b2373a
parent 1b7bb1f472
1 changed files with 127 additions and 92 deletions
--- a/src/calibre/gui2/store/stores/amazon_plugin.py
+++ b/src/calibre/gui2/store/stores/amazon_plugin.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import (unicode_literals, division, absolute_import, print_function)
-store_version = 5 # Needed for dynamic plugin loading
+store_version = 6  # Needed for dynamic plugin loading
 __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -19,9 +19,123 @@ from calibre.gui2 import open_url
 from calibre.gui2.store import StorePlugin
 from calibre.gui2.store.search_result import SearchResult
 def CSSSelect(expr):
    '''Return the XPath expression produced by cssselect for the CSS selector ``expr``.'''
    # cssselect is imported inside the function, so it is only loaded when a
    # selector is actually translated.
    from cssselect import HTMLTranslator
    translator = HTMLTranslator()
    return translator.css_to_xpath(expr)
 def search_amazon(query, max_results=10, timeout=60,
                   write_html_to=None,
                   search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='):
    '''
    Search Amazon for Kindle books and yield one SearchResult per hit.

    This is a generator. It yields nothing when the results page has no
    ``div#atfResults`` element carrying a class attribute, or when that
    class matches none of the page layouts handled below.

    :param query: Search text. It is escaped and appended to ``search_url``.
    :param max_results: Maximum number of results to yield.
    :param timeout: Timeout passed to the browser when opening the URL.
    :param write_html_to: If not None, path of a file to which the raw
        bytes of the results page are written (useful for debugging).
    :param search_url: URL prefix the escaped query is appended to.

    Each yielded SearchResult has ``cover_url``, ``title``, ``author``,
    ``price``, ``detail_item`` (the ASIN) and ``formats`` ('Kindle') set.
    '''
    url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    # The whole response is read up front, so the connection can be released
    # before parsing instead of staying open while the caller consumes the
    # generator.
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        # Use a distinct name so the response handle is not rebound.
        with open(write_html_to, 'wb') as dump:
            dump.write(raw)
    doc = html.fromstring(raw)
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        return
    # The class of the results container identifies the page layout and
    # therefore which set of XPath expressions to use.
    results_class = results.get('class', '')

    if 's-result-list-parent-container' in results_class:
        data_xpath = CSSSelect('li.s-result-item')
        format_xpath = './/a[@title="Kindle Edition"]//h3/text()'
        asin_xpath = '@data-asin'
        cover_xpath = CSSSelect('img.s-access-image') + '/@src'
        title_xpath = CSSSelect('h2.s-access-title') + '//text()'
        author_xpath = ('.//a[not(@title) and starts-with(@href, "/") and'
                        ' contains(@class, "a-link-normal") and'
                        ' contains(@href, "ref=sr_ntt_srch_lnk")]//text()')
        price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
    elif 'grid' in results_class:
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
    elif 'ilresults' in results_class:
        data_xpath = '//li[(@class="ilo")]'
        format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
        asin_xpath = '@name'
        cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        # Results can be in a grid (table) or a column
        price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
    elif 'list' in results_class:
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
    else:
        return

    for data in doc.xpath(data_xpath):
        if counter <= 0:
            break

        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        fmt = ''.join(data.xpath(format_xpath))
        if 'kindle' not in fmt.lower():
            continue

        # We must have an asin otherwise we can't easily reference the
        # book later.
        asin = data.xpath(asin_xpath)
        if not asin:
            continue
        asin = asin[0]

        cover_url = ''.join(data.xpath(cover_xpath))
        title = ''.join(data.xpath(title_xpath))
        author = ''.join(data.xpath(author_xpath))
        try:
            # Keep only the name(s) following "by " and drop any trailing
            # parenthesised text.
            author = author.split('by ', 1)[1].split(" (")[0]
        except IndexError:
            # No "by " marker: keep the author text unchanged.
            pass
        price = ''.join(data.xpath(price_xpath))

        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.price = price.strip()
        s.detail_item = asin.strip()
        s.formats = 'Kindle'

        yield s
 class AmazonKindleStore(StorePlugin):
    search_url = 'http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='
    details_url = 'http://amazon.com/dp/'
    drm_search_text = u'Simultaneous Device Usage'
    drm_free_text = u'Unlimited'
@ -114,102 +228,15 @@ class AmazonKindleStore(StorePlugin):
        # Use Kovid's affiliate id 30% of the time.
        if random.randint(1, 10) in (1, 2, 3):
            aff_id['tag'] = 'calibrebs-20'
-        store_link = 'http://www.amazon.com/Kindle-eBooks/b/?ie=UTF&node=1286228011&ref_=%(tag)s&ref=%(tag)s&tag=%(tag)s&linkCode=ur2&camp=1789&creative=390957' % aff_id
+        store_link = 'http://www.amazon.com/Kindle-eBooks/b/?ie=UTF&node=1286228011&ref_=%(tag)s&ref=%(tag)s&tag=%(tag)s&linkCode=ur2&camp=1789&creative=390957' % aff_id  # noqa
        if detail_item:
            aff_id['asin'] = detail_item
            store_link = 'http://www.amazon.com/dp/%(asin)s/?tag=%(tag)s' % aff_id
        open_url(QUrl(store_link))
    def search(self, query, max_results=10, timeout=60):
-        url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
+        for result in search_amazon(query, max_results=max_results, timeout=timeout):
-        br = browser()
+            yield result
        counter = max_results
        with closing(br.open(url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
                data_xpath = '//li[(@class="ilo")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                # Results can be in a grid (table) or a column
                price_xpath = (
                        './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
                data_xpath = '//div[contains(@class, "prod")]'
                format_xpath = (
                        './/ul[contains(@class, "rsltL")]'
                        '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
                asin_xpath = '@name'
                cover_xpath = './/img[contains(@class, "productImage")]/@src'
                title_xpath = './/h3[@class="newaps"]/a//text()'
                author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
                price_xpath = (
                        './/ul[contains(@class, "rsltL")]'
                        '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
            else:
                return
            for data in doc.xpath(data_xpath):
                if counter <= 0:
                    break
                # Even though we are searching digital-text only Amazon will still
                # put in results for non Kindle books (author pages). Se we need
                # to explicitly check if the item is a Kindle book and ignore it
                # if it isn't.
                format = ''.join(data.xpath(format_xpath))
                if 'kindle' not in format.lower():
                    continue
                # We must have an asin otherwise we can't easily reference the
                # book later.
                asin = data.xpath(asin_xpath)
                if asin:
                    asin = asin[0]
                else:
                    continue
                cover_url = ''.join(data.xpath(cover_xpath))
                title = ''.join(data.xpath(title_xpath))
                author = ''.join(data.xpath(author_xpath))
                try:
                    author = author.split('by ', 1)[1].split(" (")[0]
                except:
                    pass
                price = ''.join(data.xpath(price_xpath))
                counter -= 1
                s = SearchResult()
                s.cover_url = cover_url.strip()
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = asin.strip()
                s.formats = 'Kindle'
                yield s
    def get_details(self, search_result, timeout):
        url = self.details_url
@ -228,3 +255,11 @@ class AmazonKindleStore(StorePlugin):
            else:
                search_result.drm = SearchResult.DRM_LOCKED
        return True
 if __name__ == '__main__':
    # Manual check: run one search, print every result, and save the raw
    # HTML of the results page in the temporary directory.
    import os
    import tempfile
    dump_path = os.path.join(tempfile.gettempdir(), 'raw.html')
    for hit in search_amazon('heroes abercrombie', write_html_to=dump_path):
        print(hit)
    print('HTML written to: %s' % dump_path)