Fix extracting cached URLs from google search results

Kovid Goyal 2021-11-10 13:08:02 +05:30
parent 0749ba592c
commit 45b17f6cad
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -9,10 +9,10 @@ import re
 import time
 from collections import defaultdict, namedtuple
 try:
-    from urllib.parse import parse_qs, quote_plus, urlencode
+    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
 except ImportError:
     from urlparse import parse_qs
-    from urllib import quote_plus, urlencode
+    from urllib import quote_plus, urlencode, unquote
 
 from lxml import etree
@@ -20,7 +20,7 @@ from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 0, 7)
+current_version = (1, 0, 8)
 minimum_calibre_version = (2, 80, 0)
@@ -60,7 +60,7 @@ def parse_html(raw):
     return parse(raw)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
@@ -71,6 +71,8 @@ def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     if dump_raw is not None:
         with open(dump_raw, 'wb') as f:
             f.write(raw)
+    if save_raw is not None:
+        save_raw(raw)
     return parser(raw)
@@ -221,15 +223,34 @@ def google_url_processor(url):
     return url
 
 
-def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
-    terms = [quote_term(google_term(t)) for t in terms]
-    if site is not None:
-        terms.append(quote_term(('site:' + site)))
-    q = '+'.join(terms)
-    url = 'https://www.google.com/search?q={q}'.format(q=q)
-    log('Making google query: ' + url)
-    br = br or browser()
-    root = query(br, url, 'google', dump_raw, timeout=timeout)
+def google_extract_cache_urls(raw):
+    if isinstance(raw, bytes):
+        raw = raw.decode('utf-8', 'replace')
+    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
+    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
+    cache_pat = re.compile('cache:([^:]+):(.+)')
+
+    def urepl(m):
+        return chr(int(m.group(1), 16))
+
+    seen = set()
+    ans = {}
+    for m in pat.finditer(raw):
+        cache_url = upat.sub(urepl, m.group(1))
+        m = cache_pat.search(cache_url)
+        cache_id, src_url = m.group(1), m.group(2)
+        if cache_id in seen:
+            continue
+        seen.add(cache_id)
+        src_url = src_url.split('+')[0]
+        src_url = unquote(src_url)
+        ans[src_url] = cache_url
+    return ans
+
+
+def google_parse_results(root, raw, log=prints):
+    cache_url_map = google_extract_cache_urls(raw)
+    # print('\n'.join(cache_url_map))
     ans = []
     for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
         try:
@@ -238,22 +259,45 @@ def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
             log('Ignoring div with no main result link')
             continue
         title = tostring(a)
-        try:
-            c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        cached_url = c.get('href')
+        src_url = a.get('href')
+        if src_url in cache_url_map:
+            cached_url = cache_url_map[src_url]
+        else:
+            try:
+                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
+            except IndexError:
+                log('Ignoring {!r} as it has no cached page'.format(title))
+                continue
+            cached_url = c.get('href')
         ans.append(Result(a.get('href'), title, cached_url))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
-    return ans, url
+    return ans
+
+
+def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    terms = [quote_term(google_term(t)) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.google.com/search?q={q}'.format(q=q)
+    log('Making google query: ' + url)
+    br = br or browser()
+    r = []
+    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)
+    return google_parse_results(root, r[0], log=log), url
 
 
-def google_develop(search_terms='1423146786'):
-    br = browser()
-    for result in google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
+def google_develop(search_terms='1423146786', raw_from=''):
+    if raw_from:
+        with open(raw_from, 'rb') as f:
+            raw = f.read()
+        results = google_parse_results(parse_html(raw), raw)
+    else:
+        br = browser()
+        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]
+    for result in results:
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
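
For context, below is a minimal standalone sketch of the extraction this commit adds, run against a made-up snippet of Google's JavaScript-escaped results HTML. The webcache URL, cache id and Amazon URL in the sample are illustrative only; the regexes mirror the ones in google_extract_cache_urls above (the cache-id de-duplication is omitted for brevity).

    import re
    from urllib.parse import unquote

    # Synthetic sample: in the page source the quotes around the cache URL
    # appear as literal \x22 escapes and '&' appears as a literal \\u0026.
    raw = (
        r'...\x22https://webcache.googleusercontent.com/search?q='
        r'cache:AbCdEfGh123:https://www.amazon.com/dp/1423146786'
        r'+\\u0026hl=en\x22...'
    )

    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
    cache_pat = re.compile('cache:([^:]+):(.+)')

    ans = {}
    for m in pat.finditer(raw):
        # undo the \\uXXXX escaping inside the matched cache URL
        cache_url = upat.sub(lambda u: chr(int(u.group(1), 16)), m.group(1))
        cm = cache_pat.search(cache_url)
        # map the original page URL to its Google cache URL
        ans[unquote(cm.group(2).split('+')[0])] = cache_url

    print(ans)
    # {'https://www.amazon.com/dp/1423146786':
    #  'https://webcache.googleusercontent.com/search?q=cache:AbCdEfGh123:https://www.amazon.com/dp/1423146786+&hl=en'}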