Amazon metadata download: Support the new 'Book Description' section that Amazon publishes for some books. Also work around the Amazon US servers occasionally returning broken markup, which led to calibre not finding any matches for books on Amazon.

2025-07-07 10:14:46 -04:00 · 2012-02-16 14:04:51 +05:30 · 2012-02-16 14:04:51 +05:30 · e961004b11
commit e961004b11
parent 7140b93b56
5 changed files with 167 additions and 66 deletions
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@ -350,20 +350,20 @@ def get_proxy_info(proxy_scheme, proxy_string):
 USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
 USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'

-def random_user_agent():
+def random_user_agent(choose=None):
    choices = [
        'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
-        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
-        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0',
+        'Mozilla/5.0 (Windows NT 6.2; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.78 Safari/532.5',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    ]
-    #return choices[-1]
-    return choices[random.randint(0, len(choices)-1)]
+    if choose is None:
+        choose = random.randint(0, len(choices)-1)
+    return choices[choose]

 def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
    '''
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -13,7 +13,7 @@ from threading import Thread
 from Queue import Queue, Empty


-from calibre import as_unicode
+from calibre import as_unicode, random_user_agent
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
@ -174,8 +174,8 @@ class Worker(Thread): # Get details {{{

    def get_details(self):
        from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.utils.soupparser import fromstring
        from calibre.ebooks.chardet import xml_to_unicode
+        import html5lib

        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
@ -202,7 +202,8 @@ class Worker(Thread): # Get details {{{
            return

        try:
-            root = fromstring(clean_ascii_chars(raw))
+            root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
+                    namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse amazon details page: %r'%self.url
            self.log.exception(msg)
@ -356,33 +357,46 @@ class Worker(Thread): # Get details {{{
                if m is not None:
                    return float(m.group(1))/float(m.group(3)) * 5

-    def parse_comments(self, root):
+    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html

+        for c in desc.xpath('descendant::noscript'):
+            c.getparent().remove(c)
+        for c in desc.xpath('descendant::*[@class="seeAll" or'
+                ' @class="emptyClear" or @id="collapsePS" or'
+                ' @id="expandPS"]'):
+            c.getparent().remove(c)
+
+        for a in desc.xpath('descendant::a[@href]'):
+            del a.attrib['href']
+            a.tag = 'span'
+        desc = self.tostring(desc, method='html', encoding=unicode).strip()
+
+        # Encoding bug in Amazon data U+fffd (replacement char)
+        # in some examples it is present in place of '
+        desc = desc.replace('\ufffd', "'")
+        # remove all attributes from tags
+        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+        # Collapse whitespace
+        #desc = re.sub('\n+', '\n', desc)
+        #desc = re.sub(' +', ' ', desc)
+        # Remove the notice about text referring to out of print editions
+        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
+        # Remove comments
+        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+        return sanitize_comments_html(desc)
+
+
+    def parse_comments(self, root):
+        ans = ''
+        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
+        if desc:
+            ans = self._render_comments(desc[0])
+
        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
        if desc:
-            desc = desc[0]
-            for c in desc.xpath('descendant::*[@class="seeAll" or'
-                    ' @class="emptyClear"]'):
-                c.getparent().remove(c)
-            for a in desc.xpath('descendant::a[@href]'):
-                del a.attrib['href']
-                a.tag = 'span'
-            desc = self.tostring(desc, method='html', encoding=unicode).strip()
-
-            # Encoding bug in Amazon data U+fffd (replacement char)
-            # in some examples it is present in place of '
-            desc = desc.replace('\ufffd', "'")
-            # remove all attributes from tags
-            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
-            # Collapse whitespace
-            #desc = re.sub('\n+', '\n', desc)
-            #desc = re.sub(' +', ' ', desc)
-            # Remove the notice about text referring to out of print editions
-            desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
-            # Remove comments
-            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
-            return sanitize_comments_html(desc)
+            ans += self._render_comments(desc[0])
+        return ans

    def parse_cover(self, root):
        imgs = root.xpath('//img[@id="prodImage" and @src]')
@ -467,6 +481,28 @@ class Amazon(Source):
        Source.__init__(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is null on the
+        mi object
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1]
+                if key == 'amazon':
+                    if self.domain != 'com':
+                        key += '_' + self.domain
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
+    @property
+    def user_agent(self):
+        # Pass in an index to random_user_agent() to test with a particular
+        # user agent
+        return random_user_agent()
+
    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()
@ -507,6 +543,9 @@ class Amazon(Source):

    @property
    def domain(self):
+        x = getattr(self, 'testing_domain', None)
+        if x is not None:
+            return x
        domain = self.prefs['domain']
        if domain not in self.AMAZON_DOMAINS:
            domain = 'com'
@ -599,16 +638,52 @@ class Amazon(Source):
        return url
    # }}}

+    def parse_results_page(self, root): # {{{
+        from lxml.html import tostring
+
+        matches = []
+
+        def title_ok(title):
+            title = title.lower()
+            for x in ('bulk pack', '[audiobook]', '[audio cd]'):
+                if x in title:
+                    return False
+            return True
+
+        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
+            for a in div.xpath(r'descendant::a[@class="title" and @href]'):
+                title = tostring(a, method='text', encoding=unicode)
+                if title_ok(title):
+                    matches.append(a.get('href'))
+                break
+
+        if not matches:
+            # This can happen for some user agents that Amazon thinks are
+            # mobile/less capable
+            for td in root.xpath(
+                r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
+                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
+                    title = tostring(a, method='text', encoding=unicode)
+                    if title_ok(title):
+                        matches.append(a.get('href'))
+                    break
+
+
+        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Amazon so lower matches are not likely to be very relevant
+        return matches[:5]
+    # }}}
+
    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
-        from lxml.html import tostring
        from calibre.utils.cleantext import clean_ascii_chars
-        from calibre.utils.soupparser import fromstring
        from calibre.ebooks.chardet import xml_to_unicode
+        from lxml.html import tostring
+        import html5lib

        query, domain = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
@ -616,6 +691,8 @@ class Amazon(Source):
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
+        if getattr(self, 'running_a_test', False):
+            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
@ -634,15 +711,23 @@ class Amazon(Source):
            return as_unicode(msg)


-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
+        raw = clean_ascii_chars(xml_to_unicode(raw,
+            strip_encoding_pats=True, resolve_entities=True)[0])
+
+        if getattr(self, 'running_a_test', False):
+            import tempfile
+            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
+                    suffix='.html', delete=False) as f:
+                f.write(raw.encode('utf-8'))
+            print ('Downloaded html for results page saved in', f.name)

        matches = []
        found = '<title>404 - ' not in raw

        if found:
            try:
-                root = fromstring(clean_ascii_chars(raw))
+                root = html5lib.parse(raw, treebuilder='lxml',
+                        namespaceHTMLElements=False)
            except:
                msg = 'Failed to parse amazon page for query: %r'%query
                log.exception(msg)
@ -655,30 +740,9 @@ class Amazon(Source):
                    # The error is almost always a not found error
                    found = False

+
        if found:
-            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
-                for a in div.xpath(r'descendant::a[@class="title" and @href]'):
-                    title = tostring(a, method='text', encoding=unicode).lower()
-                    if 'bulk pack' not in title:
-                        matches.append(a.get('href'))
-                    break
-            if not matches:
-                # This can happen for some user agents that Amazon thinks are
-                # mobile/less capable
-                log('Trying alternate results page markup')
-                for td in root.xpath(
-                    r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
-                    for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
-                        title = tostring(a, method='text', encoding=unicode).lower()
-                        if ('bulk pack' not in title and '[audiobook]' not in
-                                title and '[audio cd]' not in title):
-                            matches.append(a.get('href'))
-                        break
-
-
-        # Keep only the top 5 matches as the matches are sorted by relevance by
-        # Amazon so lower matches are not likely to be very relevant
-        matches = matches[:5]
+            matches = self.parse_results_page(root)

        if abort.is_set():
            return
@ -686,7 +750,7 @@ class Amazon(Source):
        if not matches:
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
-                        ' title and authors')
+                        ' title and authors. Query: %r'%query)
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
@ -756,9 +820,18 @@ if __name__ == '__main__': # tests {{{
    # To run these test use: calibre-debug -e
    # src/calibre/ebooks/metadata/sources/amazon.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
-            isbn_test, title_test, authors_test)
+            isbn_test, title_test, authors_test, comments_test)
    com_tests = [ # {{{

+            ( # Different comments markup, using Book Description section
+                {'identifiers':{'amazon':'0982514506'}},
+                [title_test(
+                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy"
+                , exact=True),
+                comments_test('Jelena'), comments_test('Leslie'),
+                ]
+            ),
+
            ( # # in title
                {'title':'Expert C# 2008 Business Objects',
                    'authors':['Lhotka']},
@ -850,7 +923,17 @@ if __name__ == '__main__': # tests {{{
            ),
    ] # }}}

-    test_identify_plugin(Amazon.name, com_tests)
-    #test_identify_plugin(Amazon.name, de_tests)
+    def do_test(domain, start=0, stop=None):
+        tests = globals().get(domain+'_tests')
+        if stop is None:
+            stop = len(tests)
+        tests = tests[start:stop]
+        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
+                p:setattr(p, 'testing_domain', domain))
+
+    do_test('com')
+
+    #do_test('de')
+
 # }}}

--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -253,10 +253,16 @@ class Source(Plugin):

    # Browser {{{

+    @property
+    def user_agent(self):
+        # Pass in an index to random_user_agent() to test with a particular
+        # user agent
+        return random_user_agent()
+
    @property
    def browser(self):
        if self._browser is None:
-            self._browser = browser(user_agent=random_user_agent())
+            self._browser = browser(user_agent=self.user_agent)
            if self.supports_gzip_transfer_encoding:
                self._browser.set_handle_gzip(True)
        return self._browser.clone_browser()
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@ -84,6 +84,16 @@ def series_test(series, series_index):

    return test

+def comments_test(sentinel):
+
+    def test(mi):
+        comm = mi.comments.lower() if mi.comments else ''
+        if sentinel and sentinel.lower() in comm:
+            return True
+        prints('comments test failed. %s not in comments'%sentinel)
+        return False
+    return test
+
 def init_test(tdir_name):
    tdir = tempfile.gettempdir()
    lf = os.path.join(tdir, tdir_name.replace(' ', '')+'_identify_test.txt')
@ -157,7 +167,7 @@ def test_identify(tests): # {{{

 # }}}

-def test_identify_plugin(name, tests): # {{{
+def test_identify_plugin(name, tests, modify_plugin=lambda plugin:None): # {{{
    '''
    :param name: Plugin name
    :param tests: List of 2-tuples. Each two tuple is of the form (args,
@ -171,6 +181,7 @@ def test_identify_plugin(name, tests): # {{{
        if x.name == name:
            plugin = x
            break
+    modify_plugin(plugin)
    prints('Testing the identify function of', plugin.name)
    prints('Using extra headers:', plugin.browser.addheaders)

--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@ -136,6 +136,7 @@ def sanitize_comments_html(html):
    text = html2text(html)
    md = markdown.Markdown(safe_mode=True)
    cleansed = re.sub('\n+', '', md.convert(text))
+    cleansed = cleansed.replace(markdown.HTML_REMOVED_TEXT, '')
    return cleansed

 def test():