mirror of https://github.com/kovidgoyal/calibre.git
Fix extracting cached URLs from google search results

commit 45b17f6cad (parent 0749ba592c)
@@ -9,10 +9,10 @@ import re
 import time
 from collections import defaultdict, namedtuple
 try:
-    from urllib.parse import parse_qs, quote_plus, urlencode
+    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
 except ImportError:
     from urlparse import parse_qs
-    from urllib import quote_plus, urlencode
+    from urllib import quote_plus, urlencode, unquote

 from lxml import etree

@@ -20,7 +20,7 @@ from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
 from calibre.utils.random_ua import accept_header_for_ua

-current_version = (1, 0, 7)
+current_version = (1, 0, 8)
 minimum_calibre_version = (2, 80, 0)


@@ -60,7 +60,7 @@ def parse_html(raw):
     return parse(raw)


-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
@@ -71,6 +71,8 @@ def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     if dump_raw is not None:
         with open(dump_raw, 'wb') as f:
             f.write(raw)
+    if save_raw is not None:
+        save_raw(raw)
     return parser(raw)


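The new save_raw parameter gives callers a hook that receives the raw response bytes before parsing. A minimal sketch of how it can be used (assuming it runs inside the module where query() and browser() are defined; the URL below is a placeholder):

    br = browser()
    r = []
    # query() passes the fetched bytes to save_raw before handing them to the
    # parser, so r[0] holds the raw page while root is the parsed lxml tree.
    root = query(br, 'https://www.google.com/search?q=test', 'google', save_raw=r.append)
    raw = r[0]

This is exactly the pattern the reworked google_search() further down relies on to feed the raw page into google_parse_results().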
@@ -221,15 +223,34 @@ def google_url_processor(url):
     return url


-def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
-    terms = [quote_term(google_term(t)) for t in terms]
-    if site is not None:
-        terms.append(quote_term(('site:' + site)))
-    q = '+'.join(terms)
-    url = 'https://www.google.com/search?q={q}'.format(q=q)
-    log('Making google query: ' + url)
-    br = br or browser()
-    root = query(br, url, 'google', dump_raw, timeout=timeout)
+def google_extract_cache_urls(raw):
+    if isinstance(raw, bytes):
+        raw = raw.decode('utf-8', 'replace')
+    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
+    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
+    cache_pat = re.compile('cache:([^:]+):(.+)')
+
+    def urepl(m):
+        return chr(int(m.group(1), 16))
+
+    seen = set()
+    ans = {}
+    for m in pat.finditer(raw):
+        cache_url = upat.sub(urepl, m.group(1))
+        m = cache_pat.search(cache_url)
+        cache_id, src_url = m.group(1), m.group(2)
+        if cache_id in seen:
+            continue
+        seen.add(cache_id)
+        src_url = src_url.split('+')[0]
+        src_url = unquote(src_url)
+        ans[src_url] = cache_url
+    return ans
+
+
+def google_parse_results(root, raw, log=prints):
+    cache_url_map = google_extract_cache_urls(raw)
+    # print('\n'.join(cache_url_map))
     ans = []
     for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
         try:
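google_extract_cache_urls() scrapes cached-page links out of the escaped JavaScript embedded in the raw results page, where a double quote appears literally as \x22 and '=' as \\u003d, then de-duplicates by cache id and maps each source URL to its webcache URL. A rough illustration on an invented fragment (the cache id and Amazon URL are made up for the example):

    # Hypothetical fragment of a Google results page; the \x22 and \\u003d
    # escape sequences appear literally in the HTML source, which is what
    # the regexes above target.
    raw = ('...\\x22https://webcache.googleusercontent.com/search?'
           'q\\\\u003dcache:AbCdEfGhIjk:www.amazon.com/dp/1423146786+foo\\x22...')
    print(google_extract_cache_urls(raw))
    # {'www.amazon.com/dp/1423146786':
    #  'https://webcache.googleusercontent.com/search?q=cache:AbCdEfGhIjk:www.amazon.com/dp/1423146786+foo'}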
@@ -238,22 +259,45 @@ def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
             log('Ignoring div with no main result link')
             continue
         title = tostring(a)
-        try:
-            c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        cached_url = c.get('href')
+        src_url = a.get('href')
+        if src_url in cache_url_map:
+            cached_url = cache_url_map[src_url]
+        else:
+            try:
+                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
+            except IndexError:
+                log('Ignoring {!r} as it has no cached page'.format(title))
+                continue
+            cached_url = c.get('href')
         ans.append(Result(a.get('href'), title, cached_url))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
-    return ans, url
+    return ans


-def google_develop(search_terms='1423146786'):
-    br = browser()
-    for result in google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
+def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    terms = [quote_term(google_term(t)) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.google.com/search?q={q}'.format(q=q)
+    log('Making google query: ' + url)
+    br = br or browser()
+    r = []
+    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)
+    return google_parse_results(root, r[0], log=log), url
+
+
+def google_develop(search_terms='1423146786', raw_from=''):
+    if raw_from:
+        with open(raw_from, 'rb') as f:
+            raw = f.read()
+        results = google_parse_results(parse_html(raw), raw)
+    else:
+        br = browser()
+        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]
+    for result in results:
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
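After the refactor, fetching and parsing are separate steps: google_search() fetches the page, captures the raw bytes via save_raw, and hands the parsed tree plus the raw page to google_parse_results(), still returning (results, url). A hedged usage sketch, reusing the development terms, site and dump path already hard-coded in google_develop():

    # Fetch, dump the raw page, and parse; returns the parsed results plus the query URL.
    results, url = google_search('1423146786'.split(), 'www.amazon.com', dump_raw='/t/raw.html')
    for result in results:
        print(result.title)
        print(' ', result.url)

    # Later, iterate on the parser alone by replaying the dumped page without
    # hitting Google again; this is what the new raw_from argument is for.
    google_develop(raw_from='/t/raw.html')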