Amazon metadata download: Fix metadata download via Google not working because of Google changes

Google no longer returns cache URLs with its search results, so we just
use a generated URL and hope it works. Fixes #2047257 [Amazon Metadata Download Failing: No Cached Page](https://bugs.launchpad.net/calibre/+bug/2047257)
Kovid Goyal 2023-12-29 12:19:04 +05:30
parent f9fbd402d8
commit ca90f71f23
GPG Key ID: 06BC317B515ACE7C
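
The fix boils down to constructing the Google web-cache URL directly from each result URL instead of scraping it out of the results page. Below is a minimal standalone sketch of that construction; the product URL is made up for illustration, and the real helper added by this commit is `google_cache_url_for_url()` in the diff that follows.

```python
from urllib.parse import quote

def cache_url_for(url):
    # Percent-encode the whole URL and prepend Google's web-cache prefix,
    # mirroring the google_cache_url_for_url() helper added in the diff below.
    cu = quote(url.encode('utf-8'), safe='')
    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu

print(cache_url_for('https://www.amazon.com/dp/XXXXXXXXXX'))  # made-up URL
# https://webcache.googleusercontent.com/search?q=cache:https%3A%2F%2Fwww.amazon.com%2Fdp%2FXXXXXXXXXX
```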


@@ -14,10 +14,10 @@ from threading import Lock
 from functools import partial
 try:
-    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote, urlparse
+    from urllib.parse import parse_qs, quote_plus, urlencode, quote, urlparse
 except ImportError:
     from urlparse import parse_qs, urlparse
-    from urllib import quote_plus, urlencode, unquote, quote
+    from urllib import quote_plus, urlencode, quote
 from lxml import etree
@@ -27,7 +27,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua

-current_version = (1, 2, 4)
+current_version = (1, 2, 5)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -271,19 +271,22 @@ def google_url_processor(url):
     return url


-def google_get_cached_url(url, br=None, log=prints, timeout=60):
-    ourl = url
+def google_cache_url_for_url(url):
     if not isinstance(url, bytes):
         url = url.encode('utf-8')
     cu = quote(url, safe='')
     if isinstance(cu, bytes):
         cu = cu.decode('utf-8')
-    cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+
+
+def google_get_cached_url(url, br=None, log=prints, timeout=60):
+    cached_url = google_cache_url_for_url(url)
     br = google_specialize_browser(br or browser())
     try:
         raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
-        log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
+        log('Failed to get cached URL from google for URL: {} with error: {}'.format(url, err))
     else:
         with webcache_lock:
             webcache[cached_url] = raw
@@ -300,42 +303,7 @@ def canonicalize_url_for_cache_map(url):
     return url


-def google_extract_cache_urls(raw):
-    if isinstance(raw, bytes):
-        raw = raw.decode('utf-8', 'replace')
-    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
-    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
-    xpat = re.compile(r'\\x([0-9a-fA-F]{2})')
-    cache_pat = re.compile('cache:([^:]+):(.+)')
-
-    def urepl(m):
-        return chr(int(m.group(1), 16))
-
-    seen = set()
-    ans = {}
-    for m in pat.finditer(raw):
-        cache_url = upat.sub(urepl, m.group(1))
-        # print(1111111, cache_url)
-        # the following two are necessary for results from Portugal
-        cache_url = xpat.sub(urepl, cache_url)
-        cache_url = cache_url.replace('&amp;', '&')
-        m = cache_pat.search(cache_url)
-        cache_id, src_url = m.group(1), m.group(2)
-        if cache_id in seen:
-            continue
-        seen.add(cache_id)
-        src_url = src_url.split('+')[0]
-        src_url = unquote(src_url)
-        curl = canonicalize_url_for_cache_map(src_url)
-        # print(22222, cache_id, src_url, curl)
-        ans[curl] = cache_url
-    return ans
-
-
 def google_parse_results(root, raw, log=prints, ignore_uncached=True):
-    cache_url_map = google_extract_cache_urls(raw)
-    # print('\n'.join(cache_url_map))
     ans = []
     seen = set()
     for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
@@ -351,17 +319,7 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
         if curl in seen:
             continue
         seen.add(curl)
-        if curl in cache_url_map:
-            cached_url = cache_url_map[curl]
-        else:
-            try:
-                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
-            except IndexError:
-                if ignore_uncached:
-                    log('Ignoring {!r} as it has no cached page'.format(title))
-                    continue
-                c = {'href': ''}
-            cached_url = c.get('href')
+        cached_url = google_cache_url_for_url(curl)
         ans.append(Result(a.get('href'), title, cached_url))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
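
For context on the last hunk: `google_parse_results` no longer skips results that lack a scraped cached-page link; every result now gets a generated cache URL. The following is a simplified, self-contained sketch of that loop using stand-in data and a local `Result` tuple rather than calibre's own objects.

```python
from collections import namedtuple
from urllib.parse import quote

Result = namedtuple('Result', 'url title cached_url')

def google_cache_url_for_url(url):
    # Same construction as the helper added in this commit
    cu = quote(url.encode('utf-8'), safe='')
    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu

# Hypothetical (href, title) pairs as they might come out of the parsed page
hits = [('https://www.amazon.com/dp/XXXXXXXXXX', 'Example Book')]

ans, seen = [], set()
for href, title in hits:
    curl = href  # the real code canonicalizes this first
    if curl in seen:
        continue
    seen.add(curl)
    # Previously, a result with no scraped cache link could be dropped when
    # ignore_uncached was True; now it always gets a generated cache URL.
    ans.append(Result(href, title, google_cache_url_for_url(curl)))

print(ans[0].cached_url)
```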