From 2653a0c67b5ac4a87740ca533a73309d4ab6a3b1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 18 Mar 2015 10:55:53 +0530
Subject: [PATCH] Amazon metadata download: Speed up metadata download from
 amazon.com when an ASIN is specified by trying the product page directly
 first instead of running a search. Also works around amazon server problems
 where it does not return Kindle Editions in search results when not logged
 in. Fixes #1433125 [metadata: error no matches
 found](https://bugs.launchpad.net/calibre/+bug/1433125)

---
 src/calibre/ebooks/metadata/sources/amazon.py | 155 +++++++++++-------
 1 file changed, 93 insertions(+), 62 deletions(-)
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 105cc42105..9004f51a04 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -19,6 +19,70 @@ from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
 
+def parse_details_page(url, log, timeout, browser, domain):
+    '''
+    Download the Amazon details page at url and parse it with html5lib.
+
+    Returns (raw, root, selector): the stripped response body as read from browser, the parsed
+    tree and a css_selectors Select built on it. Logs the reason and returns None on failure.
+    '''
+    from calibre.utils.cleantext import clean_ascii_chars
+    from calibre.ebooks.chardet import xml_to_unicode
+    import html5lib
+    from lxml.html import tostring
+    try:
+        raw = browser.open_novisit(url, timeout=timeout).read().strip()
+    except Exception as e:
+        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
+            log.error('URL malformed: %r'%url)
+            return
+        attr = getattr(e, 'args', None) or [None]
+        if isinstance(attr[0], socket.timeout):
+            log.error('Amazon timed out. Try again later.')
+        else:
+            log.exception('Failed to make details query: %r'%url)
+        return
+
+    oraw = raw
+    if 'amazon.com.br' in url:
+        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
+    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+    if '<title>404 - ' in raw:
+        log.error('URL malformed: %r'%url)
+        return
+
+    try:
+        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False)
+    except Exception:
+        log.exception('Failed to parse amazon details page: %r'%url)
+        return
+    if domain == 'jp':
+        for a in root.xpath('//a[@href]'):
+            if 'black-curtain-redirect.html' in a.get('href'):
+                url = 'http://amazon.co.jp'+a.get('href')
+                log('Black curtain redirect found, following')
+                return parse_details_page(url, log, timeout, browser, domain)
+
+    errmsg = root.xpath('//*[@id="errorMessage"]')
+    if errmsg:
+        msg = 'Failed to parse amazon details page: %r'%url
+        # xpath() returns a list of elements; tostring() serializes a single one
+        msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
+        log.error(msg)
+        return
+
+    from css_selectors import Select
+    return oraw, root, Select(root)
+
+def parse_asin(root, log, url):
+    ''' Return the last path component of the first canonical <link> href (taken as the ASIN), else None '''
+    try:
+        for canonical in root.xpath('//link[@rel="canonical" and @href]'):
+            return canonical.get('href').rpartition('/')[-1]
+    except Exception:
+        log.exception('Error parsing ASIN for url: %r'%url)
+
+
 class Worker(Thread):  # Get details {{{
 
     '''
@@ -26,8 +90,9 @@ class Worker(Thread):  # Get details {{{
     '''
 
     def __init__(self, url, result_queue, browser, log, relevance, domain,
-            plugin, timeout=20, testing=False):
+            plugin, timeout=20, testing=False, preparsed_root=None):
         Thread.__init__(self)
+        self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing
         self.url, self.result_queue = url, result_queue
@@ -213,67 +278,18 @@ class Worker(Thread):  # Get details {{{
             self.log.exception('get_details failed for url: %r'%self.url)
 
     def get_details(self):
-        from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.ebooks.chardet import xml_to_unicode
-        import html5lib
 
-        try:
-            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
-        except Exception as e:
-            if callable(getattr(e, 'getcode', None)) and \
-                    e.getcode() == 404:
-                self.log.error('URL malformed: %r'%self.url)
-                return
-            attr = getattr(e, 'args', [None])
-            attr = attr if attr else [None]
-            if isinstance(attr[0], socket.timeout):
-                msg = 'Amazon timed out. Try again later.'
-                self.log.error(msg)
-            else:
-                msg = 'Failed to make details query: %r'%self.url
-                self.log.exception(msg)
-            return
-
-        oraw = raw
-        if 'amazon.com.br' in self.url:
-            raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
-        if '<title>404 - ' in raw:
-            self.log.error('URL malformed: %r'%self.url)
-            return
-
-        try:
-            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
-                    namespaceHTMLElements=False)
-        except:
-            msg = 'Failed to parse amazon details page: %r'%self.url
-            self.log.exception(msg)
-            return
-        if self.domain == 'jp':
-            for a in root.xpath('//a[@href]'):
-                if 'black-curtain-redirect.html' in a.get('href'):
-                    self.url = 'http://amazon.co.jp'+a.get('href')
-                    self.log('Black curtain redirect found, following')
-                    return self.get_details()
-
-        errmsg = root.xpath('//*[@id="errorMessage"]')
-        if errmsg:
-            msg = 'Failed to parse amazon details page: %r'%self.url
-            msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
-            self.log.error(msg)
-            return
+        details = self.preparsed_root or parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
+        if details is None:  # parse_details_page() failed and has already logged the reason
+            return
+        raw, root, selector = details
 
         from css_selectors import Select
         self.selector = Select(root)
-        self.parse_details(oraw, root)
+        self.parse_details(raw, root)
 
     def parse_details(self, raw, root):
-        try:
-            asin = self.parse_asin(root)
-        except:
-            self.log.exception('Error parsing asin for url: %r'%self.url)
-            asin = None
+        asin = parse_asin(root, self.log, self.url)
         if self.testing:
             import tempfile, uuid
             with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
@@ -386,11 +402,6 @@ class Worker(Thread):  # Get details {{{
 
         self.result_queue.put(mi)
 
-    def parse_asin(self, root):
-        link = root.xpath('//link[@rel="canonical" and @href]')
-        for l in link:
-            return l.get('href').rpartition('/')[-1]
-
     def totext(self, elem):
         return self.tostring(elem, encoding=unicode, method='text').strip()
 
@@ -934,13 +945,28 @@ class Amazon(Source):
         import html5lib
 
         testing = getattr(self, 'running_a_test', False)
+        br = self.browser
+
+        domain, asin = self.get_domain_and_asin(identifiers)
+        if asin and domain == 'com':
+            # Try to directly get details page instead of running a search
+            durl = 'http://www.amazon.com/gp/product/' + asin
+            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            if preparsed_root is not None:
+                qasin = parse_asin(preparsed_root[1], log, durl)
+                if qasin == asin:
+                    w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                    try:
+                        w.get_details()
+                        return
+                    except Exception:
+                        log.exception('get_details failed for url: %r'%durl)
 
         query, domain = self.create_query(log, title=title, authors=authors,
                 identifiers=identifiers)
         if query is None:
             log.error('Insufficient metadata to construct query')
             return
-        br = self.browser
         if testing:
             print ('Using user agent for amazon: %s'%self.user_agent)
         try:
@@ -1069,6 +1095,11 @@ if __name__ == '__main__':  # tests {{{
             isbn_test, title_test, authors_test, comments_test)
     com_tests = [  # {{{
 
+            (   # A kindle edition that does not appear in the search results when searching by ASIN
+                {'identifiers':{'amazon':'B004JHY6OG'}},
+                [title_test('The Heroes: A First Law Novel', exact=True)]
+            ),
+
             (  # + in title and uses id="main-image" for cover
              {'title':'C++ Concurrency in Action'},
              [title_test('C++ Concurrency in Action: Practical Multithreading',