From 8c8c3dad3f4917e1f5e3ac11caee91c96f7d4821 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 1 Mar 2017 21:35:10 +0530
Subject: [PATCH] Code to get cached URLs from wayback machine

---
 .../ebooks/metadata/sources/search_engines.py | 53 +++++++++++++++----
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/search_engines.py b/src/calibre/ebooks/metadata/sources/search_engines.py
index 9af22d8d20..dca7e4f2e5 100644
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -4,6 +4,7 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import json
 import time
 from collections import defaultdict, namedtuple
 from future_builtins import map
@@ -13,15 +14,27 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib
 
-from calibre import browser, prints
+from calibre import browser as _browser, prints
 from calibre.utils.monotonic import monotonic
+from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
 
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
 
 
 last_visited = defaultdict(lambda: 0)
-Result = namedtuple('Result', 'url text cached_url')
+Result = namedtuple('Result', 'url title cached_url')
+
+
+def browser():
+    ua = random_user_agent()
+    br = _browser(user_agent=ua)
+    br.set_handle_gzip(True)
+    br.addheaders += [
+        ('Accept', accept_header_for_ua(ua)),
+        ('Upgrade-insecure-requests', '1'),
+    ]
+    return br
 
 
 def encode_query(**query):
@@ -29,13 +42,13 @@
     return urlencode(q).decode('utf-8')
 
 
-def parse(raw):
+def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 
 
-def query(br, url, key, dump_raw=None):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
     delta = monotonic() - last_visited[key]
-    if delta < 1 and delta > 0:
+    if delta < limit and delta > 0:
         time.sleep(delta)
     try:
         raw = br.open_novisit(url).read()
@@ -44,7 +57,7 @@
     if dump_raw is not None:
         with open(dump_raw, 'wb') as f:
             f.write(raw)
-    return parse(raw)
+    return parser(raw)
 
 
 def quote_term(x):
@@ -67,6 +80,19 @@ def ddg_href(url):
     return url
 
 
+def wayback_machine_cached_url(url, br=None):
+    q = quote_term(url)
+    br = br or browser()
+    data = query(br, 'https://archive.org/wayback/available?url=' +
+                 q, 'wayback', parser=json.loads)
+    try:
+        closest = data['archived_snapshots']['closest']
+    except KeyError:
+        return
+    if closest['available']:
+        return closest['url']
+
+
 def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
@@ -74,16 +100,23 @@
     if site is not None:
         terms.append(quote_term(('site:' + site)))
     q = '+'.join(terms)
-    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(q=q, kp=1 if safe_search else -1)
+    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(
+        q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
     root = query(br, url, 'ddg', dump_raw)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), etree.tostring(a, encoding=unicode, method='text', with_tail=False), None))
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), None))
     return ans
 
 
 def ddg_develop():
-    from pprint import pprint
-    pprint(ddg_search('heroes abercrombie'.split(), 'amazon.com', dump_raw='/t/raw.html'))
+    br = browser()
+    for result in ddg_search('heroes abercrombie'.split(), 'amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', wayback_machine_cached_url(result.url, br))
+            print()
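
Usage note (editorial, not part of the patch): a minimal sketch of how the new helpers fit together, assuming the patched module is importable as calibre.ebooks.metadata.sources.search_engines and using a made-up product URL for illustration. wayback_machine_cached_url() returns None when archive.org reports no available snapshot, so callers should check for that.

    from __future__ import print_function

    from calibre.ebooks.metadata.sources.search_engines import (
        browser, wayback_machine_cached_url)

    br = browser()  # mechanize browser with a randomized user agent
    url = 'https://www.amazon.com/dp/0316187453'  # hypothetical example URL
    cached = wayback_machine_cached_url(url, br)
    if cached is None:
        print('No Wayback Machine snapshot available for', url)
    else:
        print('Closest snapshot:', cached)

Since query() rate-limits successive requests that share the same key ('wayback' here), batch lookups made through this helper are spaced out automatically.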