Amazon metadata download: Fix metadata download via Google not working because of Google changes

Google no longer returns cache URLs with its search results, so we just
use a generated URL and hope it works. Fixes #2047257 [Amazon Metadata Download Failing: No Cached Page](https://bugs.launchpad.net/calibre/+bug/2047257)
Kovid Goyal 2023-12-29 12:19:04 +05:30
parent f9fbd402d8
commit ca90f71f23
GPG Key ID: 06BC317B515ACE7C
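
The fix boils down to constructing the Google web-cache URL directly from each result URL instead of scraping it out of the results page. Below is a minimal standalone sketch of that construction; the product URL is made up for illustration, and the real helper added by this commit is `google_cache_url_for_url()` in the diff that follows.

```python
from urllib.parse import quote

def cache_url_for(url):
    # Percent-encode the whole URL and prepend Google's web-cache prefix,
    # mirroring the google_cache_url_for_url() helper added in the diff below.
    cu = quote(url.encode('utf-8'), safe='')
    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu

print(cache_url_for('https://www.amazon.com/dp/XXXXXXXXXX'))  # made-up URL
# https://webcache.googleusercontent.com/search?q=cache:https%3A%2F%2Fwww.amazon.com%2Fdp%2FXXXXXXXXXX
```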


@@ -14,10 +14,10 @@ from threading import Lock
 from functools import partial
 try:
-    from urllib.parse import parse_qs, quote_plus, unquote, urlencode, quote, urlparse
+    from urllib.parse import parse_qs, quote_plus, urlencode, quote, urlparse
 except ImportError:
     from urlparse import parse_qs, urlparse
-    from urllib import quote_plus, urlencode, unquote, quote
+    from urllib import quote_plus, urlencode, quote
 from lxml import etree
@@ -27,7 +27,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua

-current_version = (1, 2, 4)
+current_version = (1, 2, 5)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -271,19 +271,22 @@ def google_url_processor(url):
     return url


-def google_get_cached_url(url, br=None, log=prints, timeout=60):
-    ourl = url
+def google_cache_url_for_url(url):
     if not isinstance(url, bytes):
         url = url.encode('utf-8')
     cu = quote(url, safe='')
     if isinstance(cu, bytes):
         cu = cu.decode('utf-8')
-    cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu
+
+
+def google_get_cached_url(url, br=None, log=prints, timeout=60):
+    cached_url = google_cache_url_for_url(url)
     br = google_specialize_browser(br or browser())
     try:
         raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
-        log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
+        log('Failed to get cached URL from google for URL: {} with error: {}'.format(url, err))
     else:
         with webcache_lock:
             webcache[cached_url] = raw
@@ -300,42 +303,7 @@ def canonicalize_url_for_cache_map(url):
     return url


-def google_extract_cache_urls(raw):
-    if isinstance(raw, bytes):
-        raw = raw.decode('utf-8', 'replace')
-    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
-    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
-    xpat = re.compile(r'\\x([0-9a-fA-F]{2})')
-    cache_pat = re.compile('cache:([^:]+):(.+)')
-
-    def urepl(m):
-        return chr(int(m.group(1), 16))
-
-    seen = set()
-    ans = {}
-    for m in pat.finditer(raw):
-        cache_url = upat.sub(urepl, m.group(1))
-        # print(1111111, cache_url)
-        # the following two are necessary for results from Portugal
-        cache_url = xpat.sub(urepl, cache_url)
-        cache_url = cache_url.replace('&amp;', '&')
-        m = cache_pat.search(cache_url)
-        cache_id, src_url = m.group(1), m.group(2)
-        if cache_id in seen:
-            continue
-        seen.add(cache_id)
-        src_url = src_url.split('+')[0]
-        src_url = unquote(src_url)
-        curl = canonicalize_url_for_cache_map(src_url)
-        # print(22222, cache_id, src_url, curl)
-        ans[curl] = cache_url
-    return ans
-
-
 def google_parse_results(root, raw, log=prints, ignore_uncached=True):
-    cache_url_map = google_extract_cache_urls(raw)
-    # print('\n'.join(cache_url_map))
     ans = []
     seen = set()
     for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
@@ -351,17 +319,7 @@ def google_parse_results(root, raw, log=prints, ignore_uncached=True):
         if curl in seen:
             continue
         seen.add(curl)
-        if curl in cache_url_map:
-            cached_url = cache_url_map[curl]
-        else:
-            try:
-                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
-            except IndexError:
-                if ignore_uncached:
-                    log('Ignoring {!r} as it has no cached page'.format(title))
-                    continue
-                c = {'href': ''}
-            cached_url = c.get('href')
+        cached_url = google_cache_url_for_url(curl)
         ans.append(Result(a.get('href'), title, cached_url))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
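
For context on the last hunk: `google_parse_results` no longer skips results that lack a scraped cached-page link; every result now gets a generated cache URL. The following is a simplified, self-contained sketch of that loop using stand-in data and a local `Result` tuple rather than calibre's own objects.

```python
from collections import namedtuple
from urllib.parse import quote

Result = namedtuple('Result', 'url title cached_url')

def google_cache_url_for_url(url):
    # Same construction as the helper added in this commit
    cu = quote(url.encode('utf-8'), safe='')
    return 'https://webcache.googleusercontent.com/search?q=cache:' + cu

# Hypothetical (href, title) pairs as they might come out of the parsed page
hits = [('https://www.amazon.com/dp/XXXXXXXXXX', 'Example Book')]

ans, seen = [], set()
for href, title in hits:
    curl = href  # the real code canonicalizes this first
    if curl in seen:
        continue
    seen.add(curl)
    # Previously, a result with no scraped cache link could be dropped when
    # ignore_uncached was True; now it always gets a generated cache URL.
    ans.append(Result(href, title, google_cache_url_for_url(curl)))

print(ans[0].cached_url)
```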