Amazon metadata download: Fallback to using DDG if Google starts returning too many request errors
commit cd1a36c5fe
parent e82a9274e7
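The shape of the change: the Amazon plugin locates cached Amazon product pages through a web search engine, and when Google answers the search with HTTP 429 (Too Many Requests) it now retries the same query once through DuckDuckGo. Below is a minimal, self-contained sketch of that pattern, with illustrative stand-ins for calibre's se.google_search and se.ddg_search (calibre itself catches mechanize's HTTPError; the sketch uses urllib's):

from urllib.error import HTTPError


def google_search(terms):
    # Stand-in for se.google_search: pretend Google is rate-limiting us.
    raise HTTPError('https://www.google.com/search', 429, 'Too Many Requests', {}, None)


def ddg_search(terms):
    # Stand-in for se.ddg_search: always succeeds.
    return ['cached product page'], 'https://duckduckgo.com/html/?q=' + terms


def search_with_fallback(terms):
    sfunc = google_search
    try:
        return sfunc(terms)
    except HTTPError as err:
        if err.code == 429 and sfunc is google_search:
            # Too many requests from Google: retry the same query once via DuckDuckGo.
            return ddg_search(terms)
        raise


results, qurl = search_with_fallback('calibre')  # -> results from ddg_search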
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -18,6 +18,8 @@ try:
 except ImportError:
     from urlparse import urlparse
 
+from mechanize import HTTPError
+
 from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
@@ -78,23 +80,32 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
     from lxml.html import tostring
-    log('Getting details from:', url)
     try:
-        raw = browser.open_novisit(url, timeout=timeout).read().strip()
-    except Exception as e:
-        if callable(getattr(e, 'getcode', None)) and \
-                e.getcode() == 404:
-            log.error('URL malformed: %r' % url)
-            return
-        attr = getattr(e, 'args', [None])
-        attr = attr if attr else [None]
-        if isinstance(attr[0], socket.timeout):
-            msg = 'Details page timed out. Try again later.'
-            log.error(msg)
-        else:
-            msg = 'Failed to make details query: %r' % url
-            log.exception(msg)
-        return
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
+    except Exception:
+        get_data_for_cached_url = lambda *a: None
+    raw = get_data_for_cached_url(url)
+    if raw:
+        log('Using cached details for url:', url)
+    else:
+        log('Downloading details from:', url)
+        try:
+            raw = browser.open_novisit(url, timeout=timeout).read().strip()
+        except Exception as e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('URL malformed: %r' % url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = 'Details page timed out. Try again later.'
+                log.error(msg)
+            else:
+                msg = 'Failed to make details query: %r' % url
+                log.exception(msg)
+            return
 
     oraw = raw
     if 'amazon.com.br' in url:
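The hunk above reworks parse_details_page to prefer a cached copy of the details page and only download when no cached data exists, degrading to a no-op cache lookup when the search-engines module cannot be imported. A simplified sketch of that flow (download here stands in for browser.open_novisit):

def fetch_details(url, download, log=print):
    try:
        from calibre.ebooks.metadata.sources.update import search_engines_module
        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
    except Exception:
        get_data_for_cached_url = lambda *a: None  # no cache outside calibre
    raw = get_data_for_cached_url(url)
    if raw:
        log('Using cached details for url:', url)
    else:
        log('Downloading details from:', url)
        raw = download(url)
    return raw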
@@ -404,7 +415,7 @@ class Worker(Thread):  # Get details {{{
             with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                              suffix='.html', delete=False) as f:
                 f.write(raw)
-            print('Downloaded html for', asin, 'saved in', f.name)
+            print('Downloaded HTML for', asin, 'saved in', f.name)
 
         try:
             title = self.parse_title(root)
@@ -992,7 +1003,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 28)
+    version = (1, 3, 0)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1027,6 +1038,7 @@ class Amazon(Source):
         'bing': _('Bing search cache'),
         'google': _('Google search cache'),
         'wayback': _('Wayback machine cache (slow)'),
+        'ddg': _('DuckDuckGo search and Google cache'),
     }
 
     options = (
@@ -1453,20 +1465,30 @@ class Amazon(Source):
 
     def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
         from calibre.ebooks.metadata.sources.update import search_engines_module
-        se = search_engines_module()
         terms, domain = self.create_query(log, title=title, authors=authors,
                                           identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
         matches = []
+        se = search_engines_module()
         server = override_server or self.server
-        if server in ('bing',):
+        urlproc, sfunc = se.google_url_processor, se.google_search
+        if server == 'bing':
             urlproc, sfunc = se.bing_url_processor, se.bing_search
-        elif server in ('auto', 'google'):
-            urlproc, sfunc = se.google_url_processor, se.google_search
         elif server == 'wayback':
             urlproc, sfunc = se.wayback_url_processor, se.ddg_search
-        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        elif server == 'ddg':
+            urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+        try:
+            results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        except HTTPError as err:
+            if err.code == 429 and sfunc is se.google_search:
+                log('Got too many requests error from Google, trying via DuckDuckGo')
+                urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+            else:
+                raise
+
         br.set_current_header('Referer', qurl)
         for result in results:
             if abort.is_set():
@@ -1476,8 +1498,7 @@ class Amazon(Source):
             if '/dp/' in purl.path and site in purl.netloc:
                 url = result.cached_url
                 if url is None:
-                    url = se.wayback_machine_cached_url(
-                        result.url, br, timeout=timeout)
+                    url = se.get_cached_url(result.url, br, timeout=timeout)
                 if url is None:
                     log('Failed to find cached page for:', result.url)
                     continue
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 1, 0)
+current_version = (1, 1, 1)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -170,7 +170,10 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw
     root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        try:
+            ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        except KeyError:
+            log('Failed to find ddg href in:', a.get('href'))
     return ans, url
 
 
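The new KeyError guard above matters because DuckDuckGo result anchors are redirect links that carry the real target in a uddg query parameter, and an anchor missing that parameter makes the href unwrapping fail on the dictionary lookup. A rough sketch of that unwrapping (an approximation, not calibre's exact ddg_href):

from urllib.parse import parse_qs


def unwrap_ddg_href(href):
    # DDG hrefs look like '/l/?kh=-1&uddg=<percent-encoded target URL>';
    # the ['uddg'] lookup raises KeyError when the parameter is absent.
    if href.startswith('/'):
        href = parse_qs(href.partition('?')[2])['uddg'][0]
    return href


print(unwrap_ddg_href('/l/?kh=-1&uddg=https%3A%2F%2Fwww.amazon.com%2Fdp%2FB000FC0SIS'))
# -> https://www.amazon.com/dp/B000FC0SIS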
@@ -270,7 +273,7 @@ def google_get_cached_url(url, br=None, log=prints, timeout=60):
     cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
     br = google_specialize_browser(br or browser())
     try:
-        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
         log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
     else: