Implement searching Amazon via the Wayback Machine

Disabled, as the Wayback Machine is really slow/flaky.
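The new code path (guarded by a module-level USE_SEARCH_ENGINE flag, off by default) avoids querying Amazon directly: it builds the search terms, runs a DuckDuckGo query restricted to the relevant Amazon site, keeps only /dp/ product URLs, and fetches each page through the Wayback Machine's cached copy. The heart of that is the Wayback Machine "availability" lookup; the following is a minimal sketch of it using only the standard library rather than calibre's browser helpers (the ASIN in the example is taken from the tests below and is purely illustrative).

import json

try:
    from urllib import quote      # Python 2, matching the rest of this code
    from urllib2 import urlopen
except ImportError:
    from urllib.parse import quote
    from urllib.request import urlopen


def wayback_cached_copy(url, timeout=60):
    # Ask the Wayback Machine "availability" API for the closest snapshot
    # of the given URL; return the snapshot URL or None if nothing is archived.
    api = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    data = json.loads(urlopen(api, timeout=timeout).read().decode('utf-8'))
    closest = data.get('archived_snapshots', {}).get('closest')
    if closest and closest.get('available'):
        return closest['url']
    return None


if __name__ == '__main__':
    print(wayback_cached_copy('https://www.amazon.com/dp/1423146786'))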
Kovid Goyal, 2017-03-02 09:19:51 +05:30
commit d1ad4955a8 (parent 6c4c14ceca)
3 changed files with 447 additions and 335 deletions

Changed file: src/calibre/ebooks/metadata/sources/amazon.py

@@ -1,24 +1,22 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
-
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import socket, time, re
-from threading import Thread
-from Queue import Queue, Empty
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+import socket
+import time
+from Queue import Empty, Queue
+from threading import Thread
+from urlparse import urlparse
 
 from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
-        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
+from calibre.ebooks.metadata.sources.update import search_engines_module
 from calibre.utils.localization import canonicalize_lang
-from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, all_user_agents
 
 
 class CaptchaError(Exception):
@@ -30,6 +28,7 @@ class SearchFailed(ValueError):
 ua_index = -1
+USE_SEARCH_ENGINE = False
 
 
 def parse_details_page(url, log, timeout, browser, domain):
@@ -37,12 +36,13 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.ebooks.chardet import xml_to_unicode
     import html5lib
     from lxml.html import tostring
+    log('Getting details from:', url)
     try:
         raw = browser.open_novisit(url, timeout=timeout).read().strip()
     except Exception as e:
         if callable(getattr(e, 'getcode', None)) and \
                 e.getcode() == 404:
-            log.error('URL malformed: %r'%url)
+            log.error('URL malformed: %r' % url)
             return
         attr = getattr(e, 'args', [None])
         attr = attr if attr else [None]
@ -50,35 +50,37 @@ def parse_details_page(url, log, timeout, browser, domain):
msg = 'Amazon timed out. Try again later.' msg = 'Amazon timed out. Try again later.'
log.error(msg) log.error(msg)
else: else:
msg = 'Failed to make details query: %r'%url msg = 'Failed to make details query: %r' % url
log.exception(msg) log.exception(msg)
return return
oraw = raw oraw = raw
if 'amazon.com.br' in url: if 'amazon.com.br' in url:
raw = raw.decode('utf-8') # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] raw = raw.decode('utf-8')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
log.error('URL malformed: %r'%url) log.error('URL malformed: %r' % url)
return return
try: try:
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
namespaceHTMLElements=False) namespaceHTMLElements=False)
except: except:
msg = 'Failed to parse amazon details page: %r'%url msg = 'Failed to parse amazon details page: %r' % url
log.exception(msg) log.exception(msg)
return return
if domain == 'jp': if domain == 'jp':
for a in root.xpath('//a[@href]'): for a in root.xpath('//a[@href]'):
if 'black-curtain-redirect.html' in a.get('href'): if 'black-curtain-redirect.html' in a.get('href'):
url = 'https://amazon.co.jp'+a.get('href') url = 'https://amazon.co.jp' + a.get('href')
log('Black curtain redirect found, following') log('Black curtain redirect found, following')
return parse_details_page(url, log, timeout, browser, domain) return parse_details_page(url, log, timeout, browser, domain)
errmsg = root.xpath('//*[@id="errorMessage"]') errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg: if errmsg:
msg = 'Failed to parse amazon details page: %r'%url msg = 'Failed to parse amazon details page: %r' % url
msg += tostring(errmsg, method='text', encoding=unicode).strip() msg += tostring(errmsg, method='text', encoding=unicode).strip()
log.error(msg) log.error(msg)
return return
@ -94,7 +96,7 @@ def parse_asin(root, log, url):
for l in link: for l in link:
return l.get('href').rpartition('/')[-1] return l.get('href').rpartition('/')[-1]
except Exception: except Exception:
log.exception('Error parsing ASIN for url: %r'%url) log.exception('Error parsing ASIN for url: %r' % url)
class Worker(Thread): # Get details {{{ class Worker(Thread): # Get details {{{
@@ -104,8 +106,9 @@ class Worker(Thread): # Get details {{{
     '''
 
     def __init__(self, url, result_queue, browser, log, relevance, domain,
-            plugin, timeout=20, testing=False, preparsed_root=None):
+            plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
         Thread.__init__(self)
+        self.cover_url_processor = cover_url_processor
         self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing
@ -230,7 +233,8 @@ class Worker(Thread): # Get details {{{
starts-with(text(), "Uitgever:") or \ starts-with(text(), "Uitgever:") or \
starts-with(text(), "出版社:")] starts-with(text(), "出版社:")]
''' '''
self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'} self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
self.language_xpath = ''' self.language_xpath = '''
descendant::*[ descendant::*[
@ -244,7 +248,8 @@ class Worker(Thread): # Get details {{{
or starts-with(text(), "语种") or starts-with(text(), "语种")
] ]
''' '''
self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'} self.language_names = {'Language', 'Sprache',
'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
self.tags_xpath = ''' self.tags_xpath = '''
descendant::h2[ descendant::h2[
@@ -290,7 +295,7 @@ class Worker(Thread): # Get details {{{
 
     def delocalize_datestr(self, raw):
         if self.domain == 'cn':
-            return raw.replace('年','-').replace('月','-').replace('日','')
+            return raw.replace('年', '-').replace('月', '-').replace('日', '')
         if not self.months:
             return raw
         ans = raw.lower()
@ -304,11 +309,12 @@ class Worker(Thread): # Get details {{{
try: try:
self.get_details() self.get_details()
except: except:
self.log.exception('get_details failed for url: %r'%self.url) self.log.exception('get_details failed for url: %r' % self.url)
def get_details(self): def get_details(self):
if self.preparsed_root is None: if self.preparsed_root is None:
raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain) raw, root, selector = parse_details_page(
self.url, self.log, self.timeout, self.browser, self.domain)
else: else:
raw, root, selector = self.preparsed_root raw, root, selector = self.preparsed_root
@ -319,10 +325,12 @@ class Worker(Thread): # Get details {{{
def parse_details(self, raw, root): def parse_details(self, raw, root):
asin = parse_asin(root, self.log, self.url) asin = parse_asin(root, self.log, self.url)
if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'): if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.') raise CaptchaError(
'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
if self.testing: if self.testing:
import tempfile, uuid import tempfile
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_', import uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
suffix='.html', delete=False) as f: suffix='.html', delete=False) as f:
f.write(raw) f.write(raw)
print ('Downloaded html for', asin, 'saved in', f.name) print ('Downloaded html for', asin, 'saved in', f.name)
@ -330,35 +338,36 @@ class Worker(Thread): # Get details {{{
try: try:
title = self.parse_title(root) title = self.parse_title(root)
except: except:
self.log.exception('Error parsing title for url: %r'%self.url) self.log.exception('Error parsing title for url: %r' % self.url)
title = None title = None
try: try:
authors = self.parse_authors(root) authors = self.parse_authors(root)
except: except:
self.log.exception('Error parsing authors for url: %r'%self.url) self.log.exception('Error parsing authors for url: %r' % self.url)
authors = [] authors = []
if not title or not authors or not asin: if not title or not authors or not asin:
self.log.error('Could not find title/authors/asin for %r'%self.url) self.log.error(
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, 'Could not find title/authors/asin for %r' % self.url)
self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
authors)) authors))
return return
mi = Metadata(title, authors) mi = Metadata(title, authors)
idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
mi.set_identifier(idtype, asin) mi.set_identifier(idtype, asin)
self.amazon_id = asin self.amazon_id = asin
try: try:
mi.rating = self.parse_rating(root) mi.rating = self.parse_rating(root)
except: except:
self.log.exception('Error parsing ratings for url: %r'%self.url) self.log.exception('Error parsing ratings for url: %r' % self.url)
try: try:
mi.comments = self.parse_comments(root, raw) mi.comments = self.parse_comments(root, raw)
except: except:
self.log.exception('Error parsing comments for url: %r'%self.url) self.log.exception('Error parsing comments for url: %r' % self.url)
try: try:
series, series_index = self.parse_series(root) series, series_index = self.parse_series(root)
@@ -367,26 +376,30 @@ class Worker(Thread): # Get details {{{
             elif self.testing:
                 mi.series, mi.series_index = 'Dummy series for testing', 1
         except:
-            self.log.exception('Error parsing series for url: %r'%self.url)
+            self.log.exception('Error parsing series for url: %r' % self.url)
 
         try:
             mi.tags = self.parse_tags(root)
         except:
-            self.log.exception('Error parsing tags for url: %r'%self.url)
+            self.log.exception('Error parsing tags for url: %r' % self.url)
 
         try:
             self.cover_url = self.parse_cover(root, raw)
         except:
-            self.log.exception('Error parsing cover for url: %r'%self.url)
+            self.log.exception('Error parsing cover for url: %r' % self.url)
+        if self.cover_url_processor is not None and self.cover_url.startswith('/'):
+            self.cover_url = self.cover_url_processor(self.cover_url)
         mi.has_cover = bool(self.cover_url)
 
-        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
+        non_hero = tuple(self.selector(
+            'div#bookDetails_container_div div#nonHeroSection'))
         if non_hero:
             # New style markup
             try:
                 self.parse_new_details(root, mi, non_hero[0])
             except:
-                self.log.exception('Failed to parse new-style book details section')
+                self.log.exception(
+                    'Failed to parse new-style book details section')
         else:
             pd = root.xpath(self.pd_xpath)
             if pd:
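The cover_url_processor hook used above exists because pages served from the Wayback Machine reference images with root-relative paths, which have to be resolved against the archive host. A small sketch of how such a processor behaves; it mirrors the wayback_url_processor helper added to search_engines.py later in this commit, and the example image path is made up:

def wayback_url_processor(url):
    # Root-relative URLs taken from a cached page are resolved against
    # web.archive.org; absolute URLs are returned unchanged.
    if url.startswith('/'):
        url = 'https://web.archive.org' + url
    return url


# Hypothetical cover URL scraped from a cached Amazon product page
print(wayback_url_processor('/web/20170302000000/https://www.amazon.com/images/I/cover.jpg'))
# -> https://web.archive.org/web/20170302000000/https://www.amazon.com/images/I/cover.jpg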
@ -397,27 +410,32 @@ class Worker(Thread): # Get details {{{
if isbn: if isbn:
self.isbn = mi.isbn = isbn self.isbn = mi.isbn = isbn
except: except:
self.log.exception('Error parsing ISBN for url: %r'%self.url) self.log.exception(
'Error parsing ISBN for url: %r' % self.url)
try: try:
mi.publisher = self.parse_publisher(pd) mi.publisher = self.parse_publisher(pd)
except: except:
self.log.exception('Error parsing publisher for url: %r'%self.url) self.log.exception(
'Error parsing publisher for url: %r' % self.url)
try: try:
mi.pubdate = self.parse_pubdate(pd) mi.pubdate = self.parse_pubdate(pd)
except: except:
self.log.exception('Error parsing publish date for url: %r'%self.url) self.log.exception(
'Error parsing publish date for url: %r' % self.url)
try: try:
lang = self.parse_language(pd) lang = self.parse_language(pd)
if lang: if lang:
mi.language = lang mi.language = lang
except: except:
self.log.exception('Error parsing language for url: %r'%self.url) self.log.exception(
'Error parsing language for url: %r' % self.url)
else: else:
self.log.warning('Failed to find product description for url: %r'%self.url) self.log.warning(
'Failed to find product description for url: %r' % self.url)
mi.source_relevance = self.relevance mi.source_relevance = self.relevance
@ -448,7 +466,8 @@ class Worker(Thread): # Get details {{{
title = self.tostring(actual_title[0], encoding=unicode, title = self.tostring(actual_title[0], encoding=unicode,
method='text').strip() method='text').strip()
else: else:
title = self.tostring(tdiv, encoding=unicode, method='text').strip() title = self.tostring(tdiv, encoding=unicode,
method='text').strip()
ans = re.sub(r'[(\[].*[)\]]', '', title).strip() ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
if not ans: if not ans:
ans = title.rpartition('[')[0].strip() ans = title.rpartition('[')[0].strip()
@ -500,7 +519,7 @@ class Worker(Thread): # Get details {{{
else: else:
m = self.ratings_pat.match(t) m = self.ratings_pat.match(t)
if m is not None: if m is not None:
return float(m.group(1))/float(m.group(3)) * 5 return float(m.group(1)) / float(m.group(3)) * 5
def _render_comments(self, desc): def _render_comments(self, desc):
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
@ -540,7 +559,8 @@ class Worker(Thread): # Get details {{{
if len(ns) == 0 and ns.text: if len(ns) == 0 and ns.text:
import html5lib import html5lib
# html5lib parsed noscript as CDATA # html5lib parsed noscript as CDATA
ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0] ns = html5lib.parseFragment(
'<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
else: else:
ns.tag = 'div' ns.tag = 'div'
ans = self._render_comments(ns) ans = self._render_comments(ns)
@ -549,7 +569,8 @@ class Worker(Thread): # Get details {{{
if desc: if desc:
ans = self._render_comments(desc[0]) ans = self._render_comments(desc[0])
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') desc = root.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
else: else:
@ -559,12 +580,15 @@ class Worker(Thread): # Get details {{{
if m is not None: if m is not None:
try: try:
text = unquote(m.group(1)).decode('utf-8') text = unquote(m.group(1)).decode('utf-8')
nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False) nr = html5lib.parse(
desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]') text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
except Exception as e: except Exception as e:
self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e)) self.log.warn(
'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
return ans return ans
@ -577,13 +601,15 @@ class Worker(Thread): # Get details {{{
series = series[0] series = series[0]
spans = series.xpath('./span') spans = series.xpath('./span')
if spans: if spans:
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip() raw = self.tostring(
spans[0], encoding=unicode, method='text', with_tail=False).strip()
m = re.search('\s+([0-9.]+)$', raw.strip()) m = re.search('\s+([0-9.]+)$', raw.strip())
if m is not None: if m is not None:
series_index = float(m.group(1)) series_index = float(m.group(1))
s = series.xpath('./a[@id="series-page-link"]') s = series.xpath('./a[@id="series-page-link"]')
if s: if s:
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip() series = self.tostring(
s[0], encoding=unicode, method='text', with_tail=False).strip()
if series: if series:
ans = (series, series_index) ans = (series, series_index)
# This is found on Kindle edition pages on amazon.com # This is found on Kindle edition pages on amazon.com
@ -595,7 +621,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1)) series_index = float(m.group(1))
a = span.xpath('./a[@href]') a = span.xpath('./a[@href]')
if a: if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip() series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).strip()
if series: if series:
ans = (series, series_index) ans = (series, series_index)
# This is found on newer Kindle edition pages on amazon.com # This is found on newer Kindle edition pages on amazon.com
@ -607,7 +634,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1)) series_index = float(m.group(1))
a = b.getparent().xpath('./a[@href]') a = b.getparent().xpath('./a[@href]')
if a: if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip() series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
if series: if series:
ans = series, series_index ans = series, series_index
@ -629,12 +657,14 @@ class Worker(Thread): # Get details {{{
def parse_tags(self, root): def parse_tags(self, root):
ans = [] ans = []
exclude_tokens = {'kindle', 'a-z'} exclude_tokens = {'kindle', 'a-z'}
exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'} exclude = {'special features', 'by authors',
'authors & illustrators', 'books', 'new; used & rental textbooks'}
seen = set() seen = set()
for li in root.xpath(self.tags_xpath): for li in root.xpath(self.tags_xpath):
for i, a in enumerate(li.iterdescendants('a')): for i, a in enumerate(li.iterdescendants('a')):
if i > 0: if i > 0:
# we ignore the first category since it is almost always too broad # we ignore the first category since it is almost always
# too broad
raw = (a.text or '').strip().replace(',', ';') raw = (a.text or '').strip().replace(',', ';')
lraw = icu_lower(raw) lraw = icu_lower(raw)
tokens = frozenset(lraw.split()) tokens = frozenset(lraw.split())
@ -663,7 +693,7 @@ class Worker(Thread): # Get details {{{
sparts = bn.split('_') sparts = bn.split('_')
if len(sparts) > 2: if len(sparts) > 2:
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1])) bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
return ('/'.join(parts[:-1]))+'/'+bn return ('/'.join(parts[:-1])) + '/' + bn
imgpat2 = re.compile(r'var imageSrc = "([^"]+)"') imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
for script in root.xpath('//script'): for script in root.xpath('//script'):
@ -674,12 +704,14 @@ class Worker(Thread): # Get details {{{
if url: if url:
return url return url
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]') imgs = root.xpath(
'//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
if not imgs: if not imgs:
imgs = ( imgs = (
root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]') root.xpath(
'//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
) )
for img in imgs: for img in imgs:
try: try:
@ -703,7 +735,7 @@ class Worker(Thread): # Get details {{{
if 'data:' in src: if 'data:' in src:
continue continue
if 'loading-' in src: if 'loading-' in src:
js_img = re.search(br'"largeImage":"(https?://[^"]+)",',raw) js_img = re.search(br'"largeImage":"(https?://[^"]+)",', raw)
if js_img: if js_img:
src = js_img.group(1).decode('utf-8') src = js_img.group(1).decode('utf-8')
if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src): if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
@ -884,17 +916,18 @@ class Amazon(Source):
return 'https://www.amazon.com.br/' return 'https://www.amazon.com.br/'
if domain == 'au': if domain == 'au':
return 'https://www.amazon.com.au/' return 'https://www.amazon.com.au/'
return 'https://www.amazon.%s/'%domain return 'https://www.amazon.%s/' % domain
def _get_book_url(self, identifiers): # {{{ def _get_book_url(self, identifiers): # {{{
domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca')) domain, asin = self.get_domain_and_asin(
identifiers, extra_domains=('in', 'au', 'ca'))
if domain and asin: if domain and asin:
url = None url = None
r = self.referrer_for_domain(domain) r = self.referrer_for_domain(domain)
if r is not None: if r is not None:
url = r + 'dp/' + asin url = r + 'dp/' + asin
if url: if url:
idtype = 'amazon' if domain == 'com' else 'amazon_'+domain idtype = 'amazon' if domain == 'com' else 'amazon_' + domain
return domain, idtype, asin, url return domain, idtype, asin, url
def get_book_url(self, identifiers): def get_book_url(self, identifiers):
@@ -955,7 +988,7 @@ class Amazon(Source):
         return udomain
 
     def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
-            domain=None):
+            domain=None, for_amazon=True):
         from urllib import urlencode
         if domain is None:
             domain = self.domain
@@ -965,6 +998,7 @@ class Amazon(Source):
             domain = idomain
 
         # See the amazon detailed search page to get all options
+        terms = []
         q = {'search-alias': 'aps',
              'unfiltered': '1',
              }
@@ -978,26 +1012,34 @@ class Amazon(Source):
 
         if asin is not None:
             q['field-keywords'] = asin
+            terms.append(asin)
         elif isbn is not None:
             q['field-isbn'] = isbn
+            terms.append(isbn)
         else:
             # Only return book results
-            q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
+            q['search-alias'] = {'br': 'digital-text',
+                                 'nl': 'aps'}.get(domain, 'stripbooks')
             if title:
                 title_tokens = list(self.get_title_tokens(title))
                 if title_tokens:
                     q['field-title'] = ' '.join(title_tokens)
+                    terms.extend(title_tokens)
             if authors:
                 author_tokens = self.get_author_tokens(authors,
                         only_first_author=True)
                 if author_tokens:
                     q['field-author'] = ' '.join(author_tokens)
+                    terms.extend(author_tokens)
 
         if not ('field-keywords' in q or 'field-isbn' in q or
                 ('field-title' in q)):
             # Insufficient metadata to make an identify query
             return None, None
 
+        if not for_amazon:
+            return terms, domain
+
         # magic parameter to enable Japanese Shift_JIS encoding.
         if domain == 'jp':
             q['__mk_ja_JP'] = u'カタカナ'
@ -1012,13 +1054,14 @@ class Amazon(Source):
if domain == 'jp': if domain == 'jp':
encode_to = 'Shift_JIS' encode_to = 'Shift_JIS'
elif domain == 'nl' or domain == 'cn': elif domain == 'nl' or domain == 'cn':
encode_to='utf-8' encode_to = 'utf-8'
else: else:
encode_to = 'latin1' encode_to = 'latin1'
encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to, encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
'ignore')) for x, y in 'ignore')) for x, y in
q.iteritems()]) q.iteritems()])
url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q) url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
domain) + urlencode(encoded_q)
return url, domain return url, domain
# }}} # }}}
@ -1043,7 +1086,8 @@ class Amazon(Source):
def title_ok(title): def title_ok(title):
title = title.lower() title = title.lower()
bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler'] bad = ['bulk pack', '[audiobook]', '[audio cd]',
'(a book companion)', '( slipcase with door )', ': free sampler']
if self.domain == 'com': if self.domain == 'com':
bad.extend(['(%s edition)' % x for x in ('spanish', 'german')]) bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
for x in bad: for x in bad:
@ -1059,7 +1103,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
if not matches: if not matches:
@ -1074,7 +1119,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
break break
@ -1088,7 +1134,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
break break
if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'): if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
@@ -1101,7 +1148,7 @@ class Amazon(Source):
         return matches[:3]
     # }}}
 
-    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
         import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
@@ -1116,7 +1163,7 @@ class Amazon(Source):
         except Exception as e:
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                log.error('Query malformed: %r'%query)
+                log.error('Query malformed: %r' % query)
                 raise SearchFailed()
             attr = getattr(e, 'args', [None])
             attr = attr if attr else [None]
@@ -1124,7 +1171,7 @@ class Amazon(Source):
                 msg = _('Amazon timed out. Try again later.')
                 log.error(msg)
             else:
-                msg = 'Failed to make identify query: %r'%query
+                msg = 'Failed to make identify query: %r' % query
                 log.exception(msg)
                 raise SearchFailed()
@@ -1146,16 +1193,48 @@ class Amazon(Source):
             root = html5lib.parse(raw, treebuilder='lxml',
                                   namespaceHTMLElements=False)
         except Exception:
-            msg = 'Failed to parse amazon page for query: %r'%query
+            msg = 'Failed to parse amazon page for query: %r' % query
             log.exception(msg)
             raise SearchFailed()
 
         matches = self.parse_results_page(root, domain)
 
-        return matches, query, domain
+        return matches, query, domain, None
+    # }}}
+
+    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
+        terms, domain = self.create_query(log, title=title, authors=authors,
+                                          identifiers=identifiers, for_amazon=False)
+        site = self.referrer_for_domain(
+            domain)[len('https://'):].partition('/')[0]
+        se = search_engines_module()
+        matches = []
+        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+            if abort.is_set():
+                return matches, terms, domain, None
+            purl = urlparse(result.url)
+            if '/dp/' in purl.path and site in purl.netloc:
+                url = result.cached_url
+                if url is None:
+                    url = se.wayback_machine_cached_url(
+                        result.url, br, timeout=timeout)
+                if url is None:
+                    log('Failed to find cached page for:', result.url)
+                    continue
+                if url not in matches:
+                    matches.append(url)
+                if len(matches) >= 3:
+                    break
+            else:
+                log('Skipping non-book result:', result)
+        if not matches:
+            log('No search engine results for terms:', ' '.join(terms))
+        return matches, terms, domain, se.wayback_url_processor
+    # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-            identifiers={}, timeout=30):
+            identifiers={}, timeout=60):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
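The search_search_engine() method above only keeps results whose URL looks like an Amazon product page before it bothers with the Wayback Machine. That check is a plain parse of the result URL; a small sketch with made-up example URLs:

try:
    from urlparse import urlparse        # Python 2, as in amazon.py
except ImportError:
    from urllib.parse import urlparse    # Python 3


def looks_like_product_page(url, site='www.amazon.com'):
    # Amazon product (detail) pages carry '/dp/<ASIN>' in their path;
    # anything else returned by the search engine is skipped.
    purl = urlparse(url)
    return '/dp/' in purl.path and site in purl.netloc


print(looks_like_product_page('https://www.amazon.com/dp/1423146786'))    # True
print(looks_like_product_page('https://www.amazon.com/gp/bestsellers/'))  # False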
@@ -1165,23 +1244,38 @@ class Amazon(Source):
         udata = self._get_book_url(identifiers)
         br = self.browser
+        log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
-            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            cover_url_processor = None
+            if USE_SEARCH_ENGINE:
+                se = search_engines_module()
+                durl = se.wayback_machine_cached_url(
+                    durl, br, timeout=timeout, log=log)
+                cover_url_processor = se.wayback_url_processor
+            if durl is None:
+                log('Failed to get cached URL for asin:', asin)
+            else:
+                preparsed_root = parse_details_page(
+                    durl, log, timeout, br, domain)
                 if preparsed_root is not None:
                     qasin = parse_asin(preparsed_root[1], log, durl)
                     if qasin == asin:
-                        w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                        w = Worker(durl, result_queue, br, log, 0, domain,
+                                   self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
                         try:
                             w.get_details()
                             return
                         except Exception:
-                            log.exception('get_details failed for url: %r'%durl)
+                            log.exception(
+                                'get_details failed for url: %r' % durl)
+        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
         try:
-            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+            matches, query, domain, cover_url_processor = func(
+                br, testing, log, abort, title, authors, identifiers, timeout)
         except SearchFailed:
             return
@@ -1191,15 +1285,15 @@ class Amazon(Source):
         if not matches:
             if identifiers and title and authors:
                 log('No matches found with identifiers, retrying using only'
-                        ' title and authors. Query: %r'%query)
+                        ' title and authors. Query: %r' % query)
                 time.sleep(1)
                 return self.identify(log, result_queue, abort, title=title,
                                      authors=authors, timeout=timeout)
-            log.error('No matches found with query: %r'%query)
+            log.error('No matches found with query: %r' % query)
             return
 
-        workers = [Worker(url, result_queue, br, log, i, domain, self,
-                          testing=testing) for i, url in enumerate(matches)]
+        workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
+                          cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]
 
         for w in workers:
             # Don't send all requests at the same time
@@ -1223,7 +1317,7 @@ class Amazon(Source):
     # }}}
 
     def download_cover(self, log, result_queue, abort,  # {{{
-            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+            title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
@@ -1255,7 +1349,8 @@ class Amazon(Source):
         log('Downloading cover from:', cached_url)
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            cdata = self.browser.open_novisit(
+                cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@ -1263,34 +1358,39 @@ class Amazon(Source):
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py # To run these test use: calibre-debug
# src/calibre/ebooks/metadata/sources/amazon.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test, comments_test, series_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers':{'amazon':'1423146786'}}, {'identifiers': {'amazon': '1423146786'}},
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)] [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
exact=True), series_test('Heroes of Olympus', 5)]
), ),
( # Kindle edition with series ( # Kindle edition with series
{'identifiers':{'amazon':'B0085UEQDO'}}, {'identifiers': {'amazon': 'B0085UEQDO'}},
[title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)] [title_test('Three Parts Dead', exact=True),
series_test('Craft Sequence', 1)]
), ),
( # A kindle edition that does not appear in the search results when searching by ASIN ( # A kindle edition that does not appear in the search results when searching by ASIN
{'identifiers':{'amazon':'B004JHY6OG'}}, {'identifiers': {'amazon': 'B004JHY6OG'}},
[title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)] [title_test(
'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
), ),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'identifiers':{'amazon':'1933988770'}}, {'identifiers': {'amazon': '1933988770'}},
[title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)] [title_test(
'C++ Concurrency in Action: Practical Multithreading', exact=True)]
), ),
( # noscript description ( # noscript description
{'identifiers':{'amazon':'0756407117'}}, {'identifiers': {'amazon': '0756407117'}},
[title_test( [title_test(
"Throne of the Crescent Moon"), "Throne of the Crescent Moon"),
comments_test('Makhslood'), comments_test('Dhamsawaat'), comments_test('Makhslood'), comments_test('Dhamsawaat'),
@ -1298,7 +1398,7 @@ if __name__ == '__main__': # tests {{{
), ),
( # Different comments markup, using Book Description section ( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}}, {'identifiers': {'amazon': '0982514506'}},
[title_test( [title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy", "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
exact=True), exact=True),
@ -1307,15 +1407,15 @@ if __name__ == '__main__': # tests {{{
), ),
( # # in title ( # # in title
{'title':'Expert C# 2008 Business Objects', {'title': 'Expert C# 2008 Business Objects',
'authors':['Lhotka']}, 'authors': ['Lhotka']},
[title_test('Expert C# 2008 Business Objects'), [title_test('Expert C# 2008 Business Objects'),
authors_test(['Rockford Lhotka']) authors_test(['Rockford Lhotka'])
] ]
), ),
( # Description has links ( # Description has links
{'identifiers':{'isbn': '9780671578275'}}, {'identifiers': {'isbn': '9780671578275'}},
[title_test('A Civil Campaign: A Comedy of Biology and Manners', [title_test('A Civil Campaign: A Comedy of Biology and Manners',
exact=True), authors_test(['Lois McMaster Bujold']) exact=True), authors_test(['Lois McMaster Bujold'])
] ]
@ -1323,13 +1423,13 @@ if __name__ == '__main__': # tests {{{
), ),
( # Sophisticated comment formatting ( # Sophisticated comment formatting
{'identifiers':{'isbn': '9781416580829'}}, {'identifiers': {'isbn': '9781416580829'}},
[title_test('Angels & Demons - Movie Tie-In: A Novel', [title_test('Angels & Demons - Movie Tie-In: A Novel',
exact=True), authors_test(['Dan Brown'])] exact=True), authors_test(['Dan Brown'])]
), ),
( # No specific problems ( # No specific problems
{'identifiers':{'isbn': '0743273567'}}, {'identifiers': {'isbn': '0743273567'}},
[title_test('The great gatsby', exact=True), [title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald'])] authors_test(['F. Scott Fitzgerald'])]
), ),
@ -1338,7 +1438,7 @@ if __name__ == '__main__': # tests {{{
de_tests = [ # {{{ de_tests = [ # {{{
( (
{'identifiers':{'isbn': '9783453314979'}}, {'identifiers': {'isbn': '9783453314979'}},
[title_test('Die letzten Wächter: Roman', [title_test('Die letzten Wächter: Roman',
exact=False), authors_test(['Sergej Lukianenko', 'Christiane Pöhlmann']) exact=False), authors_test(['Sergej Lukianenko', 'Christiane Pöhlmann'])
] ]
@ -1346,7 +1446,7 @@ if __name__ == '__main__': # tests {{{
), ),
( (
{'identifiers':{'isbn': '3548283519'}}, {'identifiers': {'isbn': '3548283519'}},
[title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff', [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
exact=False), authors_test(['Nele Neuhaus']) exact=False), authors_test(['Nele Neuhaus'])
] ]
@ -1356,7 +1456,7 @@ if __name__ == '__main__': # tests {{{
it_tests = [ # {{{ it_tests = [ # {{{
( (
{'identifiers':{'isbn': '8838922195'}}, {'identifiers': {'isbn': '8838922195'}},
[title_test('La briscola in cinque', [title_test('La briscola in cinque',
exact=True), authors_test(['Marco Malvaldi']) exact=True), authors_test(['Marco Malvaldi'])
] ]
@ -1366,7 +1466,7 @@ if __name__ == '__main__': # tests {{{
fr_tests = [ # {{{ fr_tests = [ # {{{
( (
{'identifiers':{'isbn': '2221116798'}}, {'identifiers': {'isbn': '2221116798'}},
[title_test('L\'étrange voyage de Monsieur Daldry', [title_test('L\'étrange voyage de Monsieur Daldry',
exact=True), authors_test(['Marc Levy']) exact=True), authors_test(['Marc Levy'])
] ]
@ -1376,7 +1476,7 @@ if __name__ == '__main__': # tests {{{
es_tests = [ # {{{ es_tests = [ # {{{
( (
{'identifiers':{'isbn': '8483460831'}}, {'identifiers': {'isbn': '8483460831'}},
[title_test('Tiempos Interesantes', [title_test('Tiempos Interesantes',
exact=True), authors_test(['Terry Pratchett']) exact=True), authors_test(['Terry Pratchett'])
] ]
@ -1386,12 +1486,12 @@ if __name__ == '__main__': # tests {{{
jp_tests = [ # {{{ jp_tests = [ # {{{
( # Adult filtering test ( # Adult filtering test
{'identifiers':{'isbn':'4799500066'}}, {'identifiers': {'isbn': '4799500066'}},
[title_test(u' '),] [title_test(u' '), ]
), ),
( # isbn -> title, authors ( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720'}}, {'identifiers': {'isbn': '9784101302720'}},
[title_test(u'精霊の守り人', [title_test(u'精霊の守り人',
exact=True), authors_test([u'上橋 菜穂子']) exact=True), authors_test([u'上橋 菜穂子'])
] ]
@ -1405,7 +1505,7 @@ if __name__ == '__main__': # tests {{{
br_tests = [ # {{{ br_tests = [ # {{{
( (
{'title':'Guerra dos Tronos'}, {'title': 'Guerra dos Tronos'},
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo', [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
exact=True), authors_test(['George R. R. Martin']) exact=True), authors_test(['George R. R. Martin'])
] ]
@ -1415,7 +1515,7 @@ if __name__ == '__main__': # tests {{{
nl_tests = [ # {{{ nl_tests = [ # {{{
( (
{'title':'Freakonomics'}, {'title': 'Freakonomics'},
[title_test('Freakonomics', [title_test('Freakonomics',
exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg']) exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])
] ]
@ -1425,11 +1525,12 @@ if __name__ == '__main__': # tests {{{
cn_tests = [ # {{{ cn_tests = [ # {{{
( (
{'identifiers':{'isbn':'9787115369512'}}, {'identifiers': {'isbn': '9787115369512'}},
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])] [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
authors_test(['[美]sam Williams', '邓楠,李凡希'])]
), ),
( (
{'title':'爱上Raspberry Pi'}, {'title': '爱上Raspberry Pi'},
[title_test('爱上Raspberry Pi', [title_test('爱上Raspberry Pi',
exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希']) exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])
] ]
@ -1439,28 +1540,30 @@ if __name__ == '__main__': # tests {{{
ca_tests = [ # {{{ ca_tests = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers':{'isbn':'9781623808747'}}, {'identifiers': {'isbn': '9781623808747'}},
[title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])] [title_test('Parting Shot', exact=True),
authors_test(['Mary Calmes'])]
), ),
( # # in title ( # # in title
{'title':'Expert C# 2008 Business Objects', {'title': 'Expert C# 2008 Business Objects',
'authors':['Lhotka']}, 'authors': ['Lhotka']},
[title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])] [title_test('Expert C# 2008 Business Objects'),
authors_test(['Rockford Lhotka'])]
), ),
( # noscript description ( # noscript description
{'identifiers':{'amazon_ca':'162380874X'}}, {'identifiers': {'amazon_ca': '162380874X'}},
[title_test('Parting Shot', exact=True), authors_test(['Mary Calmes']) [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])
] ]
), ),
] # }}} ] # }}}
def do_test(domain, start=0, stop=None): def do_test(domain, start=0, stop=None):
tests = globals().get(domain+'_tests') tests = globals().get(domain + '_tests')
if stop is None: if stop is None:
stop = len(tests) stop = len(tests)
tests = tests[start:stop] tests = tests[start:stop]
test_identify_plugin(Amazon.name, tests, modify_plugin=lambda test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
p:(setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'}))) p: (setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'})))
do_test('com') do_test('com')
# do_test('de') # do_test('de')

Changed file: src/calibre/ebooks/metadata/sources/search_engines.py

@@ -46,12 +46,12 @@ def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url).read()
+        raw = br.open_novisit(url, timeout=timeout).read()
     finally:
         last_visited[key] = monotonic()
     if dump_raw is not None:
@@ -80,20 +80,29 @@ def ddg_href(url):
     return url
 
 
-def wayback_machine_cached_url(url, br=None):
+def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
     q = quote_term(url)
     br = br or browser()
     data = query(br, 'https://archive.org/wayback/available?url=' +
-                 q, 'wayback', parser=json.loads, limit=0.25)
+                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
     try:
         closest = data['archived_snapshots']['closest']
     except KeyError:
-        return
-    if closest['available']:
-        return closest['url']
+        pass
+    else:
+        if closest['available']:
+            return closest['url']
+    from pprint import pformat
+    log('Response from wayback machine:', pformat(data))
 
 
-def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+def wayback_url_processor(url):
+    if url.startswith('/'):
+        url = 'https://web.archive.org' + url
+    return url
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
     terms = [quote_term(t) for t in terms]
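For reference, the availability endpoint queried in wayback_machine_cached_url above returns a small JSON document, which the try/except/else navigates. An illustrative, abridged response for a URL that has a snapshot; the values are examples, not real data:

sample = {
    "url": "https://www.amazon.com/dp/1423146786",
    "archived_snapshots": {
        "closest": {
            "status": "200",
            "available": True,
            "url": "http://web.archive.org/web/20170301000000/https://www.amazon.com/dp/1423146786",
            "timestamp": "20170301000000",
        }
    },
}

# When nothing is archived, 'archived_snapshots' is empty and the lookup
# raises KeyError, which the code above turns into a logged failure.
closest = sample['archived_snapshots']['closest']
assert closest['available'] and 'web.archive.org' in closest['url']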
@@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
         q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
-    root = query(br, url, 'ddg', dump_raw)
+    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
         ans.append(Result(ddg_href(a.get('href')), etree.tostring(
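ddg_search() restricts results to the relevant Amazon domain via its site parameter; DuckDuckGo's query syntax (see the URL in the comment above) supports this with a site: operator. A sketch of how such a query string can be assembled from the metadata terms; quote_plus stands in for the module's own quote_term helper:

try:
    from urllib import quote_plus        # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3


def build_site_query(terms, site=None):
    # Quote each term and, when a site is given, prefix DuckDuckGo's
    # "site:" operator so only that domain is searched.
    q = ' '.join(quote_plus(t) for t in terms)
    if site is not None:
        q = 'site:{} {}'.format(site, q)
    return q


# Example terms as produced by create_query(..., for_amazon=False)
print(build_site_query(['three', 'parts', 'dead'], site='www.amazon.com'))
# -> site:www.amazon.com three parts dead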

Changed file: src/calibre/ebooks/metadata/sources/update.py

@@ -14,7 +14,6 @@ from threading import Thread
 import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
 from calibre import as_unicode, prints
 from calibre.constants import DEBUG, numeric_version
-from calibre.customize.ui import patch_metadata_plugins
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.config import JSONConfig
 from calibre.utils.https import get_https_resource_securely
@@ -59,6 +58,7 @@ def patch_search_engines(src):
 
 
 def patch_plugins():
+    from calibre.customize.ui import patch_metadata_plugins
     patches = {}
     for name, val in cache.iteritems():
         if name == 'hashes':