diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index e37bd2eb26..534928a369 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -9,10 +9,10 @@ import re
 import time
 from collections import defaultdict, namedtuple
 try:
-    from urllib.parse import parse_qs, quote_plus, urlencode
+    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
 except ImportError:
     from urlparse import parse_qs
-    from urllib import quote_plus, urlencode
+    from urllib import quote_plus, urlencode, unquote
 
 from lxml import etree
 
@@ -20,7 +20,7 @@ from calibre import browser as _browser, prints, random_user_agent
 from calibre.utils.monotonic import monotonic
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 0, 7)
+current_version = (1, 0, 8)
 minimum_calibre_version = (2, 80, 0)
 
 
@@ -60,7 +60,7 @@ def parse_html(raw):
         return parse(raw)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
@@ -71,6 +71,8 @@ def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     if dump_raw is not None:
         with open(dump_raw, 'wb') as f:
             f.write(raw)
+    if save_raw is not None:
+        save_raw(raw)
     return parser(raw)
 
 
@@ -221,15 +223,34 @@ def google_url_processor(url):
     return url
 
 
-def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
-    terms = [quote_term(google_term(t)) for t in terms]
-    if site is not None:
-        terms.append(quote_term(('site:' + site)))
-    q = '+'.join(terms)
-    url = 'https://www.google.com/search?q={q}'.format(q=q)
-    log('Making google query: ' + url)
-    br = br or browser()
-    root = query(br, url, 'google', dump_raw, timeout=timeout)
+def google_extract_cache_urls(raw):
+    if isinstance(raw, bytes):
+        raw = raw.decode('utf-8', 'replace')
+    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
+    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
+    cache_pat = re.compile('cache:([^:]+):(.+)')
+
+    def urepl(m):
+        return chr(int(m.group(1), 16))
+
+    seen = set()
+    ans = {}
+    for m in pat.finditer(raw):
+        cache_url = upat.sub(urepl, m.group(1))
+        m = cache_pat.search(cache_url)
+        cache_id, src_url = m.group(1), m.group(2)
+        if cache_id in seen:
+            continue
+        seen.add(cache_id)
+        src_url = src_url.split('+')[0]
+        src_url = unquote(src_url)
+        ans[src_url] = cache_url
+    return ans
+
+
+def google_parse_results(root, raw, log=prints):
+    cache_url_map = google_extract_cache_urls(raw)
+    # print('\n'.join(cache_url_map))
     ans = []
     for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
         try:
@@ -238,22 +259,45 @@ def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump
             log('Ignoring div with no main result link')
             continue
         title = tostring(a)
-        try:
-            c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
-        except IndexError:
-            log('Ignoring {!r} as it has no cached page'.format(title))
-            continue
-        cached_url = c.get('href')
+        src_url = a.get('href')
+        if src_url in cache_url_map:
+            cached_url = cache_url_map[src_url]
+        else:
+            try:
+                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
+            except IndexError:
+                log('Ignoring {!r} as it has no cached page'.format(title))
+                continue
+            cached_url = c.get('href')
         ans.append(Result(a.get('href'), title, cached_url))
     if not ans:
         title = ' '.join(root.xpath('//title/text()'))
         log('Failed to find any results on results page, with title:', title)
-    return ans, url
+    return ans
 
 
-def google_develop(search_terms='1423146786'):
-    br = browser()
-    for result in google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
+def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
+    terms = [quote_term(google_term(t)) for t in terms]
+    if site is not None:
+        terms.append(quote_term(('site:' + site)))
+    q = '+'.join(terms)
+    url = 'https://www.google.com/search?q={q}'.format(q=q)
+    log('Making google query: ' + url)
+    br = br or browser()
+    r = []
+    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)
+    return google_parse_results(root, r[0], log=log), url
+
+
+def google_develop(search_terms='1423146786', raw_from=''):
+    if raw_from:
+        with open(raw_from, 'rb') as f:
+            raw = f.read()
+        results = google_parse_results(parse_html(raw), raw)
+    else:
+        br = browser()
+        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]
+    for result in results:
         if '/dp/' in result.url:
             print(result.title)
             print(' ', result.url)
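
For reference, a minimal sketch of how the new google_extract_cache_urls() helper behaves once this patch is applied. It assumes a calibre environment where the patched module is importable; the sample markup below is an invented stand-in for the \x22-quoted cache links Google embeds in its results page, not real captured output.

# Sketch only; the sample string is made up for illustration.
from calibre.ebooks.metadata.sources.search_engines import google_extract_cache_urls

sample = (
    r'\x22https://webcache.googleusercontent.com/search?q=cache:AbC123:'
    r'https://www.amazon.com/dp/1423146786+some+query+terms\x22'
)
# Maps the original result URL to its Google cache URL, e.g.:
# {'https://www.amazon.com/dp/1423146786':
#  'https://webcache.googleusercontent.com/search?q=cache:AbC123:https://www.amazon.com/dp/1423146786+some+query+terms'}
print(google_extract_cache_urls(sample))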