Amazon metadata download: Fix for change in Amazon website that broke downloading metadata. Also make debugging the plugin a little easier. Fixes #878395 (Amazon ASIN no longer searchable in the ids field)

2025-07-09 03:04:10 -04:00 · 2011-10-20 08:10:16 +05:30 · 2011-10-20 08:10:16 +05:30 · 594bf0b678
commit 594bf0b678
parent eda4c65740
3 changed files with 23 additions and 20 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{
    Get book details from amazons book page in a separate thread
    '''

-    def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20):
+    def __init__(self, url, result_queue, browser, log, relevance, domain,
+            plugin, timeout=20, testing=False):
        Thread.__init__(self)
        self.daemon = True
+        self.testing = testing
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.relevance, self.plugin = relevance, plugin
@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{
                self.log.exception(msg)
            return

+        oraw = raw
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
-        #open('/t/t.html', 'wb').write(raw)
-
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return
@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{
            self.log.error(msg)
            return

-        self.parse_details(root)
+        self.parse_details(oraw, root)

-    def parse_details(self, root):
+    def parse_details(self, raw, root):
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r'%self.url)
            asin = None
+        if self.testing:
+            import tempfile
+            with tempfile.NamedTemporaryFile(prefix=asin + '_',
+                    suffix='.html', delete=False) as f:
+                f.write(raw)
+            print ('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{
            return l.get('href').rpartition('/')[-1]

    def parse_title(self, root):
-        tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
+        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
        actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
        if actual_title:
            title = tostring(actual_title[0], encoding=unicode,
@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{
        return re.sub(r'[(\[].*[)\]]', '', title).strip()

    def parse_authors(self, root):
-        x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
+        x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
        aname = root.xpath(x)
        if not aname:
            aname = root.xpath('''
-            //h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
+            //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
                    ''')
        for x in aname:
            x.tail = ''
@ -666,7 +673,8 @@ class Amazon(Source):
            log.error('No matches found with query: %r'%query)
            return

-        workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in
+        workers = [Worker(url, result_queue, br, log, i, domain, self,
+            testing=getattr(self, 'running_a_test', False)) for i, url in
                enumerate(matches)]

        for w in workers:
@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{

            ),

-            ( # An e-book ISBN not on Amazon, the title/author search matches
-              # the Kindle edition, which has different markup for ratings and
-              # isbn
-                {'identifiers':{'isbn': '9780307459671'},
-                    'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
-                [title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
-                    exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
-
-            ),
-
            (  # This isbn not on amazon
                {'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
                    'authors':['Lutz']},
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -196,6 +196,7 @@ class Source(Plugin):

    def __init__(self, *args, **kwargs):
        Plugin.__init__(self, *args, **kwargs)
+        self.running_a_test = False # Set to True when using identify_test()
        self._isbn_to_identifier_cache = {}
        self._identifier_to_cover_url_cache = {}
        self.cache_lock = threading.RLock()
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{
        rq = Queue()
        args = (log, rq, abort)
        start_time = time.time()
+        plugin.running_a_test = True
+        try:
            err = plugin.identify(*args, **kwargs)
+        finally:
+            plugin.running_a_test = False
        total_time = time.time() - start_time
        times.append(total_time)
        if err is not None: