mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix google web search
Use a specific user agent, otherwise Google serves up JS-based result pages. Sigh. And Google was supposed to be non-evil. Also switch the Amazon default server to Google, since it currently works better than Bing.
This commit is contained in:
parent
3f8050f8cf
commit
377d51769b
@ -1090,7 +1090,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
class Amazon(Source):
|
class Amazon(Source):
|
||||||
|
|
||||||
name = 'Amazon.com'
|
name = 'Amazon.com'
|
||||||
version = (1, 3, 12)
|
version = (1, 3, 13)
|
||||||
minimum_calibre_version = (2, 82, 0)
|
minimum_calibre_version = (2, 82, 0)
|
||||||
description = _('Downloads metadata and covers from Amazon')
|
description = _('Downloads metadata and covers from Amazon')
|
||||||
|
|
||||||
@ -1568,8 +1568,8 @@ class Amazon(Source):
|
|||||||
elif server == 'google':
|
elif server == 'google':
|
||||||
urlproc, sfunc = se.google_url_processor, se.google_search
|
urlproc, sfunc = se.google_url_processor, se.google_search
|
||||||
else: # auto or unknown
|
else: # auto or unknown
|
||||||
# urlproc, sfunc = se.google_url_processor, se.google_search
|
urlproc, sfunc = se.google_url_processor, se.google_search
|
||||||
urlproc, sfunc = se.bing_url_processor, se.bing_search
|
# urlproc, sfunc = se.bing_url_processor, se.bing_search
|
||||||
try:
|
try:
|
||||||
results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
|
results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
|
||||||
except HTTPError as err:
|
except HTTPError as err:
|
||||||
|
@ -31,7 +31,7 @@ from calibre.ebooks.chardet import xml_to_unicode
|
|||||||
from calibre.utils.lock import ExclusiveFile
|
from calibre.utils.lock import ExclusiveFile
|
||||||
from calibre.utils.random_ua import accept_header_for_ua
|
from calibre.utils.random_ua import accept_header_for_ua
|
||||||
|
|
||||||
current_version = (1, 2, 13)
|
current_version = (1, 2, 14)
|
||||||
minimum_calibre_version = (2, 80, 0)
|
minimum_calibre_version = (2, 80, 0)
|
||||||
webcache = {}
|
webcache = {}
|
||||||
webcache_lock = Lock()
|
webcache_lock = Lock()
|
||||||
@ -327,16 +327,22 @@ def canonicalize_url_for_cache_map(url):
|
|||||||
def google_parse_results(root, raw, log=prints, ignore_uncached=True):
|
def google_parse_results(root, raw, log=prints, ignore_uncached=True):
|
||||||
ans = []
|
ans = []
|
||||||
seen = set()
|
seen = set()
|
||||||
for div in root.xpath('//*[@id="search"]//*[@id="rso"]//div[descendant::h3]'):
|
for a in root.xpath('//a[@href]'):
|
||||||
try:
|
href = a.get('href')
|
||||||
a = div.xpath('descendant::a[@href]')[0]
|
if not href.startswith('/url?q=http'):
|
||||||
except IndexError:
|
|
||||||
log('Ignoring div with no main result link')
|
|
||||||
continue
|
continue
|
||||||
title = tostring(a)
|
try:
|
||||||
src_url = a.get('href')
|
url = parse_qs(urlparse(href).query)['q'][0]
|
||||||
# print(f'{src_url=}')
|
purl = urlparse(url)
|
||||||
curl = canonicalize_url_for_cache_map(src_url)
|
except Exception:
|
||||||
|
continue
|
||||||
|
if 'google.com' in purl.netloc:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
title = tostring(next(a.iterchildren('span')))
|
||||||
|
except StopIteration:
|
||||||
|
continue
|
||||||
|
curl = canonicalize_url_for_cache_map(url)
|
||||||
if curl in seen:
|
if curl in seen:
|
||||||
continue
|
continue
|
||||||
seen.add(curl)
|
seen.add(curl)
|
||||||
@ -368,6 +374,8 @@ def google_specialize_browser(br):
|
|||||||
for c in google_consent_cookies():
|
for c in google_consent_cookies():
|
||||||
br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])
|
br.set_simple_cookie(c['name'], c['value'], c['domain'], path=c['path'])
|
||||||
br.google_consent_cookie_added = True
|
br.google_consent_cookie_added = True
|
||||||
|
# google serves JS based pages without the right user agent
|
||||||
|
br.set_user_agent('L''y''nx''/2.''8.''6rel''.5 lib''ww''w-F''M/2.''1''4') # noqa
|
||||||
return br
|
return br
|
||||||
|
|
||||||
|
|
||||||
@ -391,8 +399,9 @@ def google_format_query(terms, site=None, tbm=None):
|
|||||||
terms.append(quote_term(('site:' + site)))
|
terms.append(quote_term(('site:' + site)))
|
||||||
q = '+'.join(terms)
|
q = '+'.join(terms)
|
||||||
url = 'https://www.google.com/search?q={q}'.format(q=q)
|
url = 'https://www.google.com/search?q={q}'.format(q=q)
|
||||||
if tbm:
|
# tbm causes 403 forbidden errors
|
||||||
url += '&tbm=' + tbm
|
# if tbm:
|
||||||
|
# url += '&tbm=' + tbm
|
||||||
if prevent_spelling_correction:
|
if prevent_spelling_correction:
|
||||||
url += '&nfpr=1'
|
url += '&nfpr=1'
|
||||||
return url
|
return url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user