Amazon metadata download: Speed up metadata download from amazon.com when an ASIN is specified by trying the product page directly first instead of running a search. Also works around an Amazon server problem where Kindle Editions are not returned in search results when not logged in. Fixes #1433125 [metadata: error no matches found](https://bugs.launchpad.net/calibre/+bug/1433125)

2025-07-09 03:04:10 -04:00 · 2015-03-18 10:55:53 +05:30 · 2015-03-18 10:55:53 +05:30 · 2653a0c67b
commit 2653a0c67b
parent 0a3b9f678f
1 changed file with 93 additions and 62 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -19,6 +19,70 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang

+def parse_details_page(url, log, timeout, browser, domain):
+    '''
+    Download and parse an Amazon product details page.
+
+    :param url: URL of the details page to fetch
+    :param log: logger; it is called directly and through its ``error``
+        and ``exception`` methods
+    :param timeout: timeout passed to ``browser.open_novisit``
+    :param browser: object providing ``open_novisit(url, timeout=...)``
+    :param domain: Amazon domain key; for ``'jp'`` a black-curtain
+        redirect link found in the page is followed
+    :return: ``(raw, root, selector)`` where ``raw`` is the stripped page
+        data as downloaded (before any decoding), ``root`` is the parsed
+        tree and ``selector`` is a ``Select`` built on ``root``. Returns
+        ``None`` after logging the reason when the page could not be
+        fetched or parsed, or when it contains an ``errorMessage`` element.
+    '''
+    from calibre.utils.cleantext import clean_ascii_chars
+    from calibre.ebooks.chardet import xml_to_unicode
+    import html5lib
+    from lxml.html import tostring
+    try:
+        raw = browser.open_novisit(url, timeout=timeout).read().strip()
+    except Exception as e:
+        # Exceptions exposing getcode() carry an HTTP status; a 404 means
+        # the URL itself is wrong, so no traceback is logged for it
+        if callable(getattr(e, 'getcode', None)) and \
+                e.getcode() == 404:
+            log.error('URL malformed: %r'%url)
+            return
+        attr = getattr(e, 'args', [None])
+        attr = attr if attr else [None]
+        if isinstance(attr[0], socket.timeout):
+            msg = 'Amazon timed out. Try again later.'
+            log.error(msg)
+        else:
+            msg = 'Failed to make details query: %r'%url
+            log.exception(msg)
+        return
+
+    # Keep the undecoded download; it is what gets returned to the caller
+    oraw = raw
+    if 'amazon.com.br' in url:
+        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
+    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+    if '<title>404 - ' in raw:
+        log.error('URL malformed: %r'%url)
+        return
+
+    try:
+        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
+                namespaceHTMLElements=False)
+    except Exception:
+        # Not a bare except: KeyboardInterrupt/SystemExit must propagate
+        msg = 'Failed to parse amazon details page: %r'%url
+        log.exception(msg)
+        return
+    if domain == 'jp':
+        for a in root.xpath('//a[@href]'):
+            if 'black-curtain-redirect.html' in a.get('href'):
+                url = 'http://amazon.co.jp'+a.get('href')
+                log('Black curtain redirect found, following')
+                return parse_details_page(url, log, timeout, browser, domain)
+
+    errmsg = root.xpath('//*[@id="errorMessage"]')
+    if errmsg:
+        msg = 'Failed to parse amazon details page: %r'%url
+        # xpath() yields a list of matches; tostring() can only serialize a
+        # single element, so use the first match
+        msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
+        log.error(msg)
+        return
+
+    from css_selectors import Select
+    selector = Select(root)
+    return oraw, root, selector
+
+def parse_asin(root, log, url):
+    '''
+    Return the final path component of the first canonical <link> href
+    found in root.
+
+    Returns None when root has no such link. If the lookup raises, the
+    error is logged with log.exception (mentioning url) and None is
+    returned.
+    '''
+    try:
+        canonical = root.xpath('//link[@rel="canonical" and @href]')
+        first = next(iter(canonical), None)
+        if first is not None:
+            return first.get('href').rsplit('/', 1)[-1]
+    except Exception:
+        log.exception('Error parsing ASIN for url: %r'%url)
+
+
 class Worker(Thread):  # Get details {{{

    '''
@ -26,8 +90,9 @@ class Worker(Thread):  # Get details {{{
    '''

    def __init__(self, url, result_queue, browser, log, relevance, domain,
-            plugin, timeout=20, testing=False):
+            plugin, timeout=20, testing=False, preparsed_root=None):
        Thread.__init__(self)
+        self.preparsed_root = preparsed_root
        self.daemon = True
        self.testing = testing
        self.url, self.result_queue = url, result_queue
@ -213,67 +278,18 @@ class Worker(Thread):  # Get details {{{
            self.log.exception('get_details failed for url: %r'%self.url)

    def get_details(self):
+        '''
+        Obtain the parsed details page and pass it to parse_details().
+
+        The preparsed_root given to the constructor is used when there is
+        one; otherwise self.url is downloaded and parsed with
+        parse_details_page(). Returns without calling parse_details() when
+        no parsed page is available.
+        '''
-        from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.ebooks.chardet import xml_to_unicode
-        import html5lib

-        try:
-            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
-        except Exception as e:
-            if callable(getattr(e, 'getcode', None)) and \
-                    e.getcode() == 404:
-                self.log.error('URL malformed: %r'%self.url)
-                return
-            attr = getattr(e, 'args', [None])
-            attr = attr if attr else [None]
-            if isinstance(attr[0], socket.timeout):
-                msg = 'Amazon timed out. Try again later.'
-                self.log.error(msg)
-            else:
-                msg = 'Failed to make details query: %r'%self.url
-                self.log.exception(msg)
-            return
-
-        oraw = raw
-        if 'amazon.com.br' in self.url:
-            raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
-        if '<title>404 - ' in raw:
-            self.log.error('URL malformed: %r'%self.url)
-            return
-
-        try:
-            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
-                    namespaceHTMLElements=False)
-        except:
-            msg = 'Failed to parse amazon details page: %r'%self.url
-            self.log.exception(msg)
-            return
-        if self.domain == 'jp':
-            for a in root.xpath('//a[@href]'):
-                if 'black-curtain-redirect.html' in a.get('href'):
-                    self.url = 'http://amazon.co.jp'+a.get('href')
-                    self.log('Black curtain redirect found, following')
-                    return self.get_details()
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = 'Failed to parse amazon details page: %r'%self.url
-            msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
-            self.log.error(msg)
-            return
+        if self.preparsed_root is None:
+            parsed = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
+        else:
+            parsed = self.preparsed_root
+        if parsed is None:
+            # parse_details_page() returns None after logging the failure;
+            # unpacking it would raise a TypeError that hides the real error
+            return
+        # The third item (the selector) is not needed: it is rebuilt below
+        raw, root = parsed[:2]

        from css_selectors import Select
        self.selector = Select(root)
-        self.parse_details(oraw, root)
+        self.parse_details(raw, root)

    def parse_details(self, raw, root):
-        try:
-            asin = self.parse_asin(root)
-        except:
-            self.log.exception('Error parsing asin for url: %r'%self.url)
-            asin = None
+        asin = parse_asin(root, self.log, self.url)
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
@ -386,11 +402,6 @@ class Worker(Thread):  # Get details {{{

        self.result_queue.put(mi)

-    def parse_asin(self, root):
-        link = root.xpath('//link[@rel="canonical" and @href]')
-        for l in link:
-            return l.get('href').rpartition('/')[-1]
-
    def totext(self, elem):
+        # Serialize elem via self.tostring with method='text' as unicode and
+        # strip surrounding whitespace.
+        # NOTE(review): self.tostring is presumably lxml's tostring -- confirm
        return self.tostring(elem, encoding=unicode, method='text').strip()

@ -934,13 +945,28 @@ class Amazon(Source):
        import html5lib

        testing = getattr(self, 'running_a_test', False)
+        br = self.browser
+
+        domain, asin = self.get_domain_and_asin(identifiers)
+        if asin and domain == 'com':
+            # Try to directly get details page instead of running a search
+            durl = 'http://www.amazon.com/gp/product/' + asin
+            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            if preparsed_root is not None:
+                qasin = parse_asin(preparsed_root[1], log, durl)
+                if qasin == asin:
+                    w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                    try:
+                        w.get_details()
+                        return
+                    except Exception:
+                        log.exception('get_details failed for url: %r'%durl)

        query, domain = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
-        br = self.browser
        if testing:
            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
@ -1069,6 +1095,11 @@ if __name__ == '__main__':  # tests {{{
            isbn_test, title_test, authors_test, comments_test)
    com_tests = [  # {{{

+            (   # A kindle edition that does not appear in the search results when searching by ASIN
+                {'identifiers':{'amazon':'B004JHY6OG'}},
+                [title_test('The Heroes: A First Law Novel', exact=True)]
+            ),
+
            (  # + in title and uses id="main-image" for cover
             {'title':'C++ Concurrency in Action'},
             [title_test('C++ Concurrency in Action: Practical Multithreading',