From 673650170b32c8db1d9bface31da383cb462b22d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 18 Aug 2022 16:07:59 +0530
Subject: [PATCH] More cache URL encoding weirdness from google. Fixes
 #1986837 [Unable to download Metadata from
 Amazon](https://bugs.launchpad.net/calibre/+bug/1986837)

---
 .../ebooks/metadata/sources/search_engines.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 23aecb7d6f..4d958be2c6 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -12,9 +12,9 @@ from contextlib import contextmanager
 from threading import Lock
 
 try:
-    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote
+    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote, urlparse
 except ImportError:
-    from urlparse import parse_qs
+    from urlparse import parse_qs, urlparse
     from urllib import quote_plus, urlencode, unquote, quote
 
 from lxml import etree
@@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 2, 1)
+current_version = (1, 2, 2)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -284,6 +284,16 @@ def google_get_cached_url(url, br=None, log=prints, timeout=60):
     return cached_url
 
 
+def canonicalize_url_for_cache_map(url):
+    try:
+        purl = urlparse(url)
+    except Exception:
+        return url
+    if '.amazon.' in purl.netloc:
+        url = url.split('&', 1)[0]
+    return url
+
+
 def google_extract_cache_urls(raw):
     if isinstance(raw, bytes):
         raw = raw.decode('utf-8', 'replace')
@@ -299,6 +309,7 @@ def google_extract_cache_urls(raw):
     ans = {}
     for m in pat.finditer(raw):
         cache_url = upat.sub(urepl, m.group(1))
+        # print(1111111, cache_url)
         # the following two are necessary for results from Portugal
         cache_url = xpat.sub(urepl, cache_url)
         cache_url = cache_url.replace('&amp;', '&')
@@ -310,7 +321,9 @@ def google_extract_cache_urls(raw):
         seen.add(cache_id)
         src_url = src_url.split('+')[0]
         src_url = unquote(src_url)
-        ans[src_url] = cache_url
+        curl = canonicalize_url_for_cache_map(src_url)
+        # print(22222, cache_id, src_url, curl)
+        ans[curl] = cache_url
     return ans
 
 
@@ -326,8 +339,10 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
             continue
         title = tostring(a)
         src_url = a.get('href')
-        if src_url in cache_url_map:
-            cached_url = cache_url_map[src_url]
+        # print(f'{src_url=}')
+        curl = canonicalize_url_for_cache_map(src_url)
+        if curl in cache_url_map:
+            cached_url = cache_url_map[curl]
         else:
             try:
                 c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
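
A quick sketch of the effect of the new canonicalize_url_for_cache_map() helper, using hypothetical example URLs (the ASIN and parameters below are made up, not taken from the bug report). Google's cache map and the search-result href can carry different &-appended tails for the same Amazon page, so both sides are truncated at the first '&' before being used as the dictionary key:

from urllib.parse import urlparse

def canonicalize_url_for_cache_map(url):
    # Same logic as the helper added by this patch: Amazon URLs are
    # cut at the first '&' so equivalent links share one cache key.
    try:
        purl = urlparse(url)
    except Exception:
        return url
    if '.amazon.' in purl.netloc:
        url = url.split('&', 1)[0]
    return url

# Hypothetical URLs: the &-appended tails differ, the canonical key does not.
a = 'https://www.amazon.com/dp/B01N5IB20Q&qid=1660810079&sr=8-1'
b = 'https://www.amazon.com/dp/B01N5IB20Q&tag=example-20'
assert canonicalize_url_for_cache_map(a) == canonicalize_url_for_cache_map(b)

# Non-Amazon URLs pass through untouched.
c = 'https://example.com/page&x=1'
assert canonicalize_url_for_cache_map(c) == c

Because both google_extract_cache_urls() and google_parse_results() run their URLs through the same helper, the lookup into cache_url_map now succeeds even when Google encodes the two sides differently.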