Fix google web search

Use a specific user agent, otherwise google serves up JS-based result
pages. Sigh. And google was supposed to be non-evil.

Also switch the amazon plugin's default search server to google, since it
currently works better than bing.
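
To illustrate the fix, here is a minimal sketch of fetching a results page
with a text-mode browser user agent, using only the Python standard library;
fetch_results() is a hypothetical helper for illustration, not calibre code:

    # With a text-mode UA, google returns plain HTML results instead of a
    # JS-driven page. fetch_results() is illustrative only.
    from urllib.parse import quote_plus
    from urllib.request import Request, urlopen

    LYNX_UA = 'Lynx/2.8.6rel.5 libwww-FM/2.14'  # the UA this commit adopts

    def fetch_results(query, timeout=60):
        url = 'https://www.google.com/search?q=' + quote_plus(query)
        req = Request(url, headers={'User-Agent': LYNX_UA})
        with urlopen(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8', 'replace')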
Kovid Goyal 2025-04-19 05:34:02 +05:30
parent 3f8050f8cf
commit 377d51769b
2 changed files with 24 additions and 15 deletions

src/calibre/ebooks/metadata/sources/amazon.py

@@ -1090,7 +1090,7 @@ class Worker(Thread): # Get details {{{
 class Amazon(Source):
     name = 'Amazon.com'
-    version = (1, 3, 12)
+    version = (1, 3, 13)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
@@ -1568,8 +1568,8 @@ class Amazon(Source):
         elif server == 'google':
             urlproc, sfunc = se.google_url_processor, se.google_search
         else:  # auto or unknown
-            # urlproc, sfunc = se.google_url_processor, se.google_search
-            urlproc, sfunc = se.bing_url_processor, se.bing_search
+            urlproc, sfunc = se.google_url_processor, se.google_search
+            # urlproc, sfunc = se.bing_url_processor, se.bing_search
         try:
             results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
         except HTTPError as err:
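
The try/except around sfunc() is what keeps the 'auto' choice resilient; a
hedged sketch of that dispatch-with-fallback pattern (the se.* names are from
the diff above, the wrapper itself is hypothetical, not calibre's code):

    from urllib.error import HTTPError  # calibre may use its own HTTPError

    def search_with_fallback(terms, site, se, log, br, timeout):
        engines = (
            (se.google_url_processor, se.google_search),  # new default
            (se.bing_url_processor, se.bing_search),      # old default
        )
        for urlproc, sfunc in engines:
            try:
                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
                return urlproc, results, qurl
            except HTTPError as err:
                log('search failed with HTTP error', err.code)
        raise ValueError('no search engine returned results')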

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -31,7 +31,7 @@
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
-current_version = (1, 2, 13)
+current_version = (1, 2, 14)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -327,16 +327,22 @@ def canonicalize_url_for_cache_map(url):
 def google_parse_results(root, raw, log=prints, ignore_uncached=True):
     ans = []
     seen = set()
-    for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
-        try:
-            a = div.xpath('descendant::a[@href]')[0]
-        except IndexError:
-            log('Ignoring div with no main result link')
-            continue
-        title = tostring(a)
-        src_url = a.get('href')
-        # print(f'{src_url=}')
-        curl = canonicalize_url_for_cache_map(src_url)
+    for a in root.xpath('//a[@href]'):
+        href = a.get('href')
+        if not href.startswith('/url?q=http'):
+            continue
+        try:
+            url = parse_qs(urlparse(href).query)['q'][0]
+            purl = urlparse(url)
+        except Exception:
+            continue
+        if 'google.com' in purl.netloc:
+            continue
+        try:
+            title = tostring(next(a.iterchildren('span')))
+        except StopIteration:
+            continue
+        curl = canonicalize_url_for_cache_map(url)
         if curl in seen:
             continue
         seen.add(curl)
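
The rewritten parser keys off the /url?q=... redirect wrappers that google's
plain-HTML pages put around every result link. A self-contained sketch of that
extraction (lxml assumed, as in calibre's own parser; parse_google_links() is
a hypothetical name for illustration):

    from urllib.parse import parse_qs, urlparse
    from lxml.html import fromstring

    def parse_google_links(raw_html):
        root = fromstring(raw_html)
        for a in root.xpath('//a[@href]'):
            href = a.get('href')
            if not href.startswith('/url?q=http'):
                continue  # only results are wrapped in /url?q=... redirects
            url = parse_qs(urlparse(href).query)['q'][0]
            if 'google.com' not in urlparse(url).netloc:
                yield url  # drop google's own navigation links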
@@ -368,6 +374,8 @@ def google_specialize_browser(br):
        for c in google_consent_cookies():
            br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])
        br.google_consent_cookie_added = True
+   # google serves JS based pages without the right user agent
+   br.set_user_agent('L''y''nx''/2.''8.''6rel''.5 lib''ww''w-F''M/2.''1''4')  # noqa
    return br
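
The user agent is assembled from adjacent string literals, which Python
concatenates at parse time, so the line above amounts to:

    ua = 'L''y''nx''/2.''8.''6rel''.5 lib''ww''w-F''M/2.''1''4'
    assert ua == 'Lynx/2.8.6rel.5 libwww-FM/2.14'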
@@ -391,8 +399,9 @@ def google_format_query(terms, site=None, tbm=None):
         terms.append(quote_term(('site:' + site)))
     q = '+'.join(terms)
     url = 'https://www.google.com/search?q={q}'.format(q=q)
-    if tbm:
-        url += '&tbm=' + tbm
+    # tbm causes 403 forbidden errors
+    # if tbm:
+    #     url += '&tbm=' + tbm
     if prevent_spelling_correction:
         url += '&nfpr=1'
     return url
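
After this change the builder produces a plain /search URL with no tbm
parameter; roughly as follows (with urllib's quote_plus standing in for
calibre's quote_term helper, and the function name hypothetical):

    from urllib.parse import quote_plus

    def format_query(terms, site=None, prevent_spelling_correction=False):
        terms = [quote_plus(t) for t in terms]
        if site is not None:
            terms.append(quote_plus('site:' + site))
        url = 'https://www.google.com/search?q=' + '+'.join(terms)
        # tbm= now triggers 403 responses, so it is omitted entirely
        if prevent_spelling_correction:
            url += '&nfpr=1'
        return url

    # format_query(['moby', 'dick'], site='amazon.com')
    # -> 'https://www.google.com/search?q=moby+dick+site%3Aamazon.com'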