Fix google web search

Use a specific user agent, otherwise google serves up JS-based result
pages. Sigh. And google was supposed to be non-evil.

Also switch amazon default server to google since it currently works better
than bing.
This commit is contained in:
Kovid Goyal 2025-04-19 05:34:02 +05:30
parent 3f8050f8cf
commit 377d51769b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 24 additions and 15 deletions

View File

@ -1090,7 +1090,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 3, 12) version = (1, 3, 13)
minimum_calibre_version = (2, 82, 0) minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')
@ -1568,8 +1568,8 @@ class Amazon(Source):
elif server == 'google': elif server == 'google':
urlproc, sfunc = se.google_url_processor, se.google_search urlproc, sfunc = se.google_url_processor, se.google_search
else: # auto or unknown else: # auto or unknown
# urlproc, sfunc = se.google_url_processor, se.google_search urlproc, sfunc = se.google_url_processor, se.google_search
urlproc, sfunc = se.bing_url_processor, se.bing_search # urlproc, sfunc = se.bing_url_processor, se.bing_search
try: try:
results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout) results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
except HTTPError as err: except HTTPError as err:

View File

@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.lock import ExclusiveFile from calibre.utils.lock import ExclusiveFile
from calibre.utils.random_ua import accept_header_for_ua from calibre.utils.random_ua import accept_header_for_ua
current_version = (1, 2, 13) current_version = (1, 2, 14)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
webcache = {} webcache = {}
webcache_lock = Lock() webcache_lock = Lock()
@ -327,16 +327,22 @@ def canonicalize_url_for_cache_map(url):
def google_parse_results(root, raw, log=prints, ignore_uncached=True): def google_parse_results(root, raw, log=prints, ignore_uncached=True):
ans = [] ans = []
seen = set() seen = set()
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'): for a in root.xpath('//a[@href]'):
try: href = a.get('href')
a = div.xpath('descendant::a[@href]')[0] if not href.startswith('/url?q=http'):
except IndexError:
log('Ignoring div with no main result link')
continue continue
title = tostring(a) try:
src_url = a.get('href') url = parse_qs(urlparse(href).query)['q'][0]
# print(f'{src_url=}') purl = urlparse(url)
curl = canonicalize_url_for_cache_map(src_url) except Exception:
continue
if 'google.com' in purl.netloc:
continue
try:
title = tostring(next(a.iterchildren('span')))
except StopIteration:
continue
curl = canonicalize_url_for_cache_map(url)
if curl in seen: if curl in seen:
continue continue
seen.add(curl) seen.add(curl)
@ -368,6 +374,8 @@ def google_specialize_browser(br):
for c in google_consent_cookies(): for c in google_consent_cookies():
br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path']) br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])
br.google_consent_cookie_added = True br.google_consent_cookie_added = True
# google serves JS based pages without the right user agent
br.set_user_agent('L''y''nx''/2.''8.''6rel''.5 lib''ww''w-F''M/2.''1''4') # noqa
return br return br
@ -391,8 +399,9 @@ def google_format_query(terms, site=None, tbm=None):
terms.append(quote_term(('site:' + site))) terms.append(quote_term(('site:' + site)))
q = '+'.join(terms) q = '+'.join(terms)
url = 'https://www.google.com/search?q={q}'.format(q=q) url = 'https://www.google.com/search?q={q}'.format(q=q)
if tbm: # tbm causes 403 forbidden errors
url += '&tbm=' + tbm # if tbm:
# url += '&tbm=' + tbm
if prevent_spelling_correction: if prevent_spelling_correction:
url += '&nfpr=1' url += '&nfpr=1'
return url return url