Implement searching amazon via wayback machine

Disabled, as wayback machine is really slow/flaky
Kovid Goyal 2017-03-02 09:19:51 +05:30
parent 6c4c14ceca
commit d1ad4955a8
3 changed files with 447 additions and 335 deletions
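
The core idea of the change is to avoid hitting Amazon directly (which rate-limits and serves CAPTCHAs) by fetching a cached copy of the product page from the Internet Archive's Wayback Machine. Below is a minimal standalone sketch of the availability lookup that the new wayback_machine_cached_url() performs; the endpoint and JSON shape are taken from the diff, but the helper name and the use of plain urllib are illustrative rather than calibre code.

    #!/usr/bin/env python2
    # Illustrative sketch only -- not part of this commit. It mirrors the
    # archive.org "availability" query made by wayback_machine_cached_url().
    from __future__ import print_function
    import json
    try:  # Python 2, as used by calibre at the time
        from urllib import quote
        from urllib2 import urlopen
    except ImportError:  # Python 3 fallback
        from urllib.parse import quote
        from urllib.request import urlopen


    def cached_snapshot_url(url, timeout=60):
        # Ask the Wayback Machine for the closest archived snapshot of `url`.
        q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
        data = json.loads(urlopen(q, timeout=timeout).read())
        closest = data.get('archived_snapshots', {}).get('closest')
        if closest and closest.get('available'):
            return closest['url']
        return None  # no snapshot found; the plugin logs the raw response instead


    if __name__ == '__main__':
        print(cached_snapshot_url('https://www.amazon.com/dp/1423146786'))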

src/calibre/ebooks/metadata/sources/amazon.py

@@ -1,24 +1,22 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
-
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import socket, time, re
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+import socket
+import time
+from Queue import Empty, Queue
 from threading import Thread
-from Queue import Queue, Empty
+from urlparse import urlparse

 from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
-        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
+from calibre.ebooks.metadata.sources.update import search_engines_module
 from calibre.utils.localization import canonicalize_lang
-from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, all_user_agents


 class CaptchaError(Exception):
@@ -30,6 +28,7 @@ class SearchFailed(ValueError):

 ua_index = -1
+USE_SEARCH_ENGINE = False


 def parse_details_page(url, log, timeout, browser, domain):
@@ -37,6 +36,7 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.ebooks.chardet import xml_to_unicode
     import html5lib
     from lxml.html import tostring
+    log('Getting details from:', url)
     try:
         raw = browser.open_novisit(url, timeout=timeout).read().strip()
     except Exception as e:
@@ -56,8 +56,10 @@ def parse_details_page(url, log, timeout, browser, domain):
     oraw = raw
     if 'amazon.com.br' in url:
-        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
-    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
+        raw = raw.decode('utf-8')
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                         resolve_entities=True)[0]
     if '<title>404 - ' in raw:
         log.error('URL malformed: %r' % url)
         return
@@ -104,8 +106,9 @@ class Worker(Thread):  # Get details {{{
     '''

     def __init__(self, url, result_queue, browser, log, relevance, domain,
-                 plugin, timeout=20, testing=False, preparsed_root=None):
+                 plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
         Thread.__init__(self)
+        self.cover_url_processor = cover_url_processor
         self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing
@@ -230,7 +233,8 @@ class Worker(Thread):  # Get details {{{
                 starts-with(text(), "Uitgever:") or \
                 starts-with(text(), "出版社:")]
         '''
-        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
+        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
+                                'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}

         self.language_xpath = '''
             descendant::*[
@@ -244,7 +248,8 @@ class Worker(Thread):  # Get details {{{
                 or starts-with(text(), "语种")
                 ]
             '''
-        self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
+        self.language_names = {'Language', 'Sprache',
+                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}

         self.tags_xpath = '''
             descendant::h2[
@@ -308,7 +313,8 @@ class Worker(Thread):  # Get details {{{

     def get_details(self):
         if self.preparsed_root is None:
-            raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
+            raw, root, selector = parse_details_page(
+                self.url, self.log, self.timeout, self.browser, self.domain)
         else:
             raw, root, selector = self.preparsed_root
@@ -319,9 +325,11 @@ class Worker(Thread):  # Get details {{{
     def parse_details(self, raw, root):
         asin = parse_asin(root, self.log, self.url)
         if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
-            raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
+            raise CaptchaError(
+                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
         if self.testing:
-            import tempfile, uuid
+            import tempfile
+            import uuid
             with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                              suffix='.html', delete=False) as f:
                 f.write(raw)
@@ -340,7 +348,8 @@ class Worker(Thread):  # Get details {{{
             authors = []

         if not title or not authors or not asin:
-            self.log.error('Could not find title/authors/asin for %r'%self.url)
+            self.log.error(
+                'Could not find title/authors/asin for %r' % self.url)
             self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
                                                                authors))
             return
@@ -378,15 +387,19 @@ class Worker(Thread):  # Get details {{{
             self.cover_url = self.parse_cover(root, raw)
         except:
             self.log.exception('Error parsing cover for url: %r' % self.url)
+        if self.cover_url_processor is not None and self.cover_url.startswith('/'):
+            self.cover_url = self.cover_url_processor(self.cover_url)
         mi.has_cover = bool(self.cover_url)

-        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
+        non_hero = tuple(self.selector(
+            'div#bookDetails_container_div div#nonHeroSection'))
         if non_hero:
             # New style markup
             try:
                 self.parse_new_details(root, mi, non_hero[0])
             except:
-                self.log.exception('Failed to parse new-style book details section')
+                self.log.exception(
+                    'Failed to parse new-style book details section')
         else:
             pd = root.xpath(self.pd_xpath)
             if pd:
@@ -397,27 +410,32 @@ class Worker(Thread):  # Get details {{{
                         if isbn:
                             self.isbn = mi.isbn = isbn
                 except:
-                    self.log.exception('Error parsing ISBN for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing ISBN for url: %r' % self.url)

                 try:
                     mi.publisher = self.parse_publisher(pd)
                 except:
-                    self.log.exception('Error parsing publisher for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publisher for url: %r' % self.url)

                 try:
                     mi.pubdate = self.parse_pubdate(pd)
                 except:
-                    self.log.exception('Error parsing publish date for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publish date for url: %r' % self.url)

                 try:
                     lang = self.parse_language(pd)
                     if lang:
                         mi.language = lang
                 except:
-                    self.log.exception('Error parsing language for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing language for url: %r' % self.url)

             else:
-                self.log.warning('Failed to find product description for url: %r'%self.url)
+                self.log.warning(
+                    'Failed to find product description for url: %r' % self.url)

         mi.source_relevance = self.relevance
@@ -448,7 +466,8 @@ class Worker(Thread):  # Get details {{{
             title = self.tostring(actual_title[0], encoding=unicode,
                                   method='text').strip()
         else:
-            title = self.tostring(tdiv, encoding=unicode, method='text').strip()
+            title = self.tostring(tdiv, encoding=unicode,
+                                  method='text').strip()
         ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
         if not ans:
             ans = title.rpartition('[')[0].strip()
@@ -540,7 +559,8 @@ class Worker(Thread):  # Get details {{{
             if len(ns) == 0 and ns.text:
                 import html5lib
                 # html5lib parsed noscript as CDATA
-                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
+                ns = html5lib.parseFragment(
+                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
             else:
                 ns.tag = 'div'
             ans = self._render_comments(ns)
@@ -549,7 +569,8 @@ class Worker(Thread):  # Get details {{{
             if desc:
                 ans = self._render_comments(desc[0])

-        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
+        desc = root.xpath(
+            '//div[@id="productDescription"]/*[@class="content"]')
         if desc:
             ans += self._render_comments(desc[0])
         else:
@@ -559,12 +580,15 @@ class Worker(Thread):  # Get details {{{
             if m is not None:
                 try:
                     text = unquote(m.group(1)).decode('utf-8')
-                    nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
-                    desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
+                    nr = html5lib.parse(
+                        text, treebuilder='lxml', namespaceHTMLElements=False)
+                    desc = nr.xpath(
+                        '//div[@id="productDescription"]/*[@class="content"]')
                     if desc:
                         ans += self._render_comments(desc[0])
                 except Exception as e:
-                    self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
+                    self.log.warn(
+                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))

         return ans
@@ -577,13 +601,15 @@ class Worker(Thread):  # Get details {{{
             series = series[0]
             spans = series.xpath('./span')
             if spans:
-                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
+                raw = self.tostring(
+                    spans[0], encoding=unicode, method='text', with_tail=False).strip()
                 m = re.search('\s+([0-9.]+)$', raw.strip())
                 if m is not None:
                     series_index = float(m.group(1))
                     s = series.xpath('./a[@id="series-page-link"]')
                     if s:
-                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
+                        series = self.tostring(
+                            s[0], encoding=unicode, method='text', with_tail=False).strip()
                         if series:
                             ans = (series, series_index)
         # This is found on Kindle edition pages on amazon.com
@@ -595,7 +621,8 @@ class Worker(Thread):  # Get details {{{
                     series_index = float(m.group(1))
                     a = span.xpath('./a[@href]')
                     if a:
-                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
+                        series = self.tostring(
+                            a[0], encoding=unicode, method='text', with_tail=False).strip()
                         if series:
                             ans = (series, series_index)
         # This is found on newer Kindle edition pages on amazon.com
@@ -607,7 +634,8 @@ class Worker(Thread):  # Get details {{{
                     series_index = float(m.group(1))
                     a = b.getparent().xpath('./a[@href]')
                     if a:
-                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
+                        series = self.tostring(
+                            a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
                         if series:
                             ans = series, series_index
@@ -629,12 +657,14 @@ class Worker(Thread):  # Get details {{{
     def parse_tags(self, root):
         ans = []
         exclude_tokens = {'kindle', 'a-z'}
-        exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'}
+        exclude = {'special features', 'by authors',
+                   'authors & illustrators', 'books', 'new; used & rental textbooks'}
         seen = set()
         for li in root.xpath(self.tags_xpath):
             for i, a in enumerate(li.iterdescendants('a')):
                 if i > 0:
-                    # we ignore the first category since it is almost always too broad
+                    # we ignore the first category since it is almost always
+                    # too broad
                     raw = (a.text or '').strip().replace(',', ';')
                     lraw = icu_lower(raw)
                     tokens = frozenset(lraw.split())
@@ -674,12 +704,14 @@ class Worker(Thread):  # Get details {{{
             if url:
                 return url

-        imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
+        imgs = root.xpath(
+            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
         if not imgs:
             imgs = (
                 root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
                 root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
-                root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
+                root.xpath(
+                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
             )
         for img in imgs:
             try:
@@ -887,7 +919,8 @@ class Amazon(Source):
         return 'https://www.amazon.%s/' % domain

     def _get_book_url(self, identifiers):  # {{{
-        domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
+        domain, asin = self.get_domain_and_asin(
+            identifiers, extra_domains=('in', 'au', 'ca'))
         if domain and asin:
             url = None
             r = self.referrer_for_domain(domain)
@@ -955,7 +988,7 @@ class Amazon(Source):
         return udomain

     def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
-                     domain=None):
+                     domain=None, for_amazon=True):
         from urllib import urlencode
         if domain is None:
             domain = self.domain
@@ -965,6 +998,7 @@ class Amazon(Source):
                 domain = idomain

         # See the amazon detailed search page to get all options
+        terms = []
         q = {'search-alias': 'aps',
              'unfiltered': '1',
              }
@@ -978,26 +1012,34 @@ class Amazon(Source):

         if asin is not None:
             q['field-keywords'] = asin
+            terms.append(asin)
         elif isbn is not None:
             q['field-isbn'] = isbn
+            terms.append(isbn)
         else:
             # Only return book results
-            q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
+            q['search-alias'] = {'br': 'digital-text',
+                                 'nl': 'aps'}.get(domain, 'stripbooks')
             if title:
                 title_tokens = list(self.get_title_tokens(title))
                 if title_tokens:
                     q['field-title'] = ' '.join(title_tokens)
+                    terms.extend(title_tokens)
             if authors:
                 author_tokens = self.get_author_tokens(authors,
                                                        only_first_author=True)
                 if author_tokens:
                     q['field-author'] = ' '.join(author_tokens)
+                    terms.extend(author_tokens)

             if not ('field-keywords' in q or 'field-isbn' in q or
                     ('field-title' in q)):
                 # Insufficient metadata to make an identify query
                 return None, None

+        if not for_amazon:
+            return terms, domain
+
         # magic parameter to enable Japanese Shift_JIS encoding.
         if domain == 'jp':
             q['__mk_ja_JP'] = u'カタカナ'
@@ -1018,7 +1060,8 @@ class Amazon(Source):
         encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
                                                                    'ignore')) for x, y in
                           q.iteritems()])
-        url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q)
+        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
+            domain) + urlencode(encoded_q)
         return url, domain

     # }}}
@@ -1043,7 +1086,8 @@ class Amazon(Source):
         def title_ok(title):
             title = title.lower()
-            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
+            bad = ['bulk pack', '[audiobook]', '[audio cd]',
+                   '(a book companion)', '( slipcase with door )', ': free sampler']
             if self.domain == 'com':
                 bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
             for x in bad:
@@ -1059,7 +1103,8 @@ class Amazon(Source):
                 if title_ok(title):
                     url = a.get('href')
                     if url.startswith('/'):
-                        url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                        url = 'https://www.amazon.%s%s' % (
+                            self.get_website_domain(domain), url)
                     matches.append(url)

         if not matches:
@@ -1074,7 +1119,8 @@ class Amazon(Source):
                     if title_ok(title):
                         url = a.get('href')
                         if url.startswith('/'):
-                            url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                            url = 'https://www.amazon.%s%s' % (
+                                self.get_website_domain(domain), url)
                         matches.append(url)
                         break
@@ -1088,7 +1134,8 @@ class Amazon(Source):
                     if title_ok(title):
                         url = a.get('href')
                         if url.startswith('/'):
-                            url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                            url = 'https://www.amazon.%s%s' % (
+                                self.get_website_domain(domain), url)
                         matches.append(url)
                         break

         if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
@@ -1101,7 +1148,7 @@ class Amazon(Source):
         return matches[:3]
     # }}}

-    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
         import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
@@ -1152,10 +1199,42 @@ class Amazon(Source):

         matches = self.parse_results_page(root, domain)

-        return matches, query, domain
+        return matches, query, domain, None
+    # }}}
+
+    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
+        terms, domain = self.create_query(log, title=title, authors=authors,
+                                          identifiers=identifiers, for_amazon=False)
+        site = self.referrer_for_domain(
+            domain)[len('https://'):].partition('/')[0]
+        se = search_engines_module()
+        matches = []
+        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+            if abort.is_set():
+                return matches, terms, domain, None
+            purl = urlparse(result.url)
+            if '/dp/' in purl.path and site in purl.netloc:
+                url = result.cached_url
+                if url is None:
+                    url = se.wayback_machine_cached_url(
+                        result.url, br, timeout=timeout)
+                if url is None:
+                    log('Failed to find cached page for:', result.url)
+                    continue
+                if url not in matches:
+                    matches.append(url)
+                if len(matches) >= 3:
+                    break
+            else:
+                log('Skipping non-book result:', result)
+        if not matches:
+            log('No search engine results for terms:', ' '.join(terms))
+        return matches, terms, domain, se.wayback_url_processor
+    # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-                 identifiers={}, timeout=30):
+                 identifiers={}, timeout=60):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
@@ -1165,23 +1244,38 @@ class Amazon(Source):
         udata = self._get_book_url(identifiers)
         br = self.browser
+        log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
-            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            cover_url_processor = None
+            if USE_SEARCH_ENGINE:
+                se = search_engines_module()
+                durl = se.wayback_machine_cached_url(
+                    durl, br, timeout=timeout, log=log)
+                cover_url_processor = se.wayback_url_processor
+            if durl is None:
+                log('Failed to get cached URL for asin:', asin)
+            else:
+                preparsed_root = parse_details_page(
+                    durl, log, timeout, br, domain)
             if preparsed_root is not None:
                 qasin = parse_asin(preparsed_root[1], log, durl)
                 if qasin == asin:
-                    w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                    w = Worker(durl, result_queue, br, log, 0, domain,
+                               self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
                     try:
                         w.get_details()
                         return
                     except Exception:
-                        log.exception('get_details failed for url: %r'%durl)
+                        log.exception(
+                            'get_details failed for url: %r' % durl)

+        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
         try:
-            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+            matches, query, domain, cover_url_processor = func(
+                br, testing, log, abort, title, authors, identifiers, timeout)
         except SearchFailed:
             return
@@ -1198,8 +1292,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r' % query)
             return

-        workers = [Worker(url, result_queue, br, log, i, domain, self,
-                          testing=testing) for i, url in enumerate(matches)]
+        workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
+                          cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]

         for w in workers:
             # Don't send all requests at the same time
@@ -1223,7 +1317,7 @@ class Amazon(Source):
     # }}}

     def download_cover(self, log, result_queue, abort,  # {{{
-                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
@@ -1255,7 +1349,8 @@ class Amazon(Source):
         log('Downloading cover from:', cached_url)
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            cdata = self.browser.open_novisit(
+                cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@@ -1263,29 +1358,34 @@ class Amazon(Source):

 if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
+    # To run these test use: calibre-debug
+    # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            isbn_test, title_test, authors_test, comments_test, series_test)
     com_tests = [  # {{{

         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
-            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
+            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
+                        exact=True), series_test('Heroes of Olympus', 5)]
         ),

         (  # Kindle edition with series
             {'identifiers': {'amazon': 'B0085UEQDO'}},
-            [title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
+            [title_test('Three Parts Dead', exact=True),
+             series_test('Craft Sequence', 1)]
         ),

         (  # A kindle edition that does not appear in the search results when searching by ASIN
             {'identifiers': {'amazon': 'B004JHY6OG'}},
-            [title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)]
+            [title_test(
+                'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
         ),

         (  # + in title and uses id="main-image" for cover
             {'identifiers': {'amazon': '1933988770'}},
-            [title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)]
+            [title_test(
+                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
         ),
@@ -1426,7 +1526,8 @@ if __name__ == '__main__':  # tests {{{
     cn_tests = [  # {{{
         (
             {'identifiers': {'isbn': '9787115369512'}},
-            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])]
+            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
+             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
         ),
         (
             {'title': '爱上Raspberry Pi'},
@@ -1440,12 +1541,14 @@ if __name__ == '__main__':  # tests {{{
     ca_tests = [  # {{{
         (  # Paperback with series
             {'identifiers': {'isbn': '9781623808747'}},
-            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])]
+            [title_test('Parting Shot', exact=True),
+             authors_test(['Mary Calmes'])]
         ),
         (  # # in title
             {'title': 'Expert C# 2008 Business Objects',
              'authors': ['Lhotka']},
-            [title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])]
+            [title_test('Expert C# 2008 Business Objects'),
+             authors_test(['Rockford Lhotka'])]
         ),
         (  # noscript description
             {'identifiers': {'amazon_ca': '162380874X'}},
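
For reference, the new USE_SEARCH_ENGINE code path added above (create_query(for_amazon=False) feeding search_search_engine()) boils down to the following standalone sketch. Here ddg_search and wayback_machine_cached_url stand in for the functions of the same names in calibre.ebooks.metadata.sources.search_engines; the helper name and parameters are illustrative, not calibre code.

    # Illustrative condensation of the USE_SEARCH_ENGINE path above,
    # not a drop-in replacement for it.
    from urlparse import urlparse  # Python 2, matching the plugin


    def cached_amazon_matches(terms, site, ddg_search, wayback_machine_cached_url,
                              br=None, timeout=60, limit=3):
        # Search DuckDuckGo restricted to the Amazon site, keep /dp/ product
        # pages, and fall back to a Wayback Machine snapshot when DDG has no
        # cached copy of its own.
        matches = []
        for result in ddg_search(terms, site, br=br, timeout=timeout):
            purl = urlparse(result.url)
            if '/dp/' not in purl.path or site not in purl.netloc:
                continue  # skip non-book results
            url = result.cached_url or wayback_machine_cached_url(
                result.url, br, timeout=timeout)
            if url and url not in matches:
                matches.append(url)
                if len(matches) >= limit:
                    break
        return matches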

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -46,12 +46,12 @@ def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)


-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url).read()
+        raw = br.open_novisit(url, timeout=timeout).read()
     finally:
         last_visited[key] = monotonic()
     if dump_raw is not None:
@@ -80,20 +80,29 @@ def ddg_href(url):
     return url


-def wayback_machine_cached_url(url, br=None):
+def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
     q = quote_term(url)
     br = br or browser()
     data = query(br, 'https://archive.org/wayback/available?url=' +
-                 q, 'wayback', parser=json.loads, limit=0.25)
+                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
     try:
         closest = data['archived_snapshots']['closest']
     except KeyError:
-        return
-    if closest['available']:
-        return closest['url']
+        pass
+    else:
+        if closest['available']:
+            return closest['url']
+    from pprint import pformat
+    log('Response from wayback machine:', pformat(data))


-def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+def wayback_url_processor(url):
+    if url.startswith('/'):
+        url = 'https://web.archive.org' + url
+    return url
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
     terms = [quote_term(t) for t in terms]
@@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
         q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
-    root = query(br, url, 'ddg', dump_raw)
+    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
         ans.append(Result(ddg_href(a.get('href')), etree.tostring(
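
Pages served from the Wayback Machine reference images and links with root-relative paths, which is why amazon.py threads a cover_url_processor through Worker. The wayback_url_processor() added above simply re-anchors such paths on web.archive.org; a quick usage sketch follows (the snapshot path in the example is made up).

    from calibre.ebooks.metadata.sources.search_engines import wayback_url_processor

    print(wayback_url_processor('/web/2017/https://www.amazon.com/dp/1423146786'))
    # -> https://web.archive.org/web/2017/https://www.amazon.com/dp/1423146786
    print(wayback_url_processor('https://example.com/cover.jpg'))
    # -> https://example.com/cover.jpg  (absolute URLs pass through unchanged)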

src/calibre/ebooks/metadata/sources/update.py

@@ -14,7 +14,6 @@ from threading import Thread
 import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
 from calibre import as_unicode, prints
 from calibre.constants import DEBUG, numeric_version
-from calibre.customize.ui import patch_metadata_plugins
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.config import JSONConfig
 from calibre.utils.https import get_https_resource_securely
@@ -59,6 +58,7 @@ def patch_search_engines(src):


 def patch_plugins():
+    from calibre.customize.ui import patch_metadata_plugins
     patches = {}
     for name, val in cache.iteritems():
         if name == 'hashes':