Google search engine: Fix for different URL encoding used by Google servers in Portugal

This commit is contained in:
Kovid Goyal 2022-08-05 07:07:22 +05:30
parent ec0e30f0aa
commit c9d3d132be
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.lock import ExclusiveFile from calibre.utils.lock import ExclusiveFile
from calibre.utils.random_ua import accept_header_for_ua from calibre.utils.random_ua import accept_header_for_ua
current_version = (1, 2, 0) current_version = (1, 2, 1)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
webcache = {} webcache = {}
webcache_lock = Lock() webcache_lock = Lock()
@ -289,6 +289,7 @@ def google_extract_cache_urls(raw):
raw = raw.decode('utf-8', 'replace') raw = raw.decode('utf-8', 'replace')
pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22') pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
upat = re.compile(r'\\\\u([0-9a-fA-F]{4})') upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
xpat = re.compile(r'\\x([0-9a-fA-F]{2})')
cache_pat = re.compile('cache:([^:]+):(.+)') cache_pat = re.compile('cache:([^:]+):(.+)')
def urepl(m): def urepl(m):
@ -298,6 +299,10 @@ def google_extract_cache_urls(raw):
ans = {} ans = {}
for m in pat.finditer(raw): for m in pat.finditer(raw):
cache_url = upat.sub(urepl, m.group(1)) cache_url = upat.sub(urepl, m.group(1))
# the following two are necessary for results from Portugal
cache_url = xpat.sub(urepl, cache_url)
cache_url = cache_url.replace('&amp;', '&')
m = cache_pat.search(cache_url) m = cache_pat.search(cache_url)
cache_id, src_url = m.group(1), m.group(2) cache_id, src_url = m.group(1), m.group(2)
if cache_id in seen: if cache_id in seen: