Code to get cached URLs from wayback machine

Kovid Goyal 2017-03-01 21:35:10 +05:30
parent 45c0fe8c54
commit 8c8c3dad3f


@@ -4,6 +4,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import json
 import time
 from collections import defaultdict, namedtuple
 from future_builtins import map
@@ -13,15 +14,27 @@ from urlparse import parse_qs
 from lxml import etree
 import html5lib
 
-from calibre import browser, prints
+from calibre import browser as _browser, prints
 from calibre.utils.monotonic import monotonic
+from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
 
 current_version = (1, 0, 0)
 minimum_calibre_version = (2, 80, 0)
 
 
 last_visited = defaultdict(lambda: 0)
-Result = namedtuple('Result', 'url text cached_url')
+Result = namedtuple('Result', 'url title cached_url')
+
+
+def browser():
+    ua = random_user_agent()
+    br = _browser(user_agent=ua)
+    br.set_handle_gzip(True)
+    br.addheaders += [
+        ('Accept', accept_header_for_ua(ua)),
+        ('Upgrade-insecure-requests', '1'),
+    ]
+    return br
 
 
 def encode_query(**query):
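
The new browser() factory above replaces the stock calibre browser with one that carries a freshly randomized desktop user agent and an Accept header matched to it, so repeated scraping requests look less uniform. A minimal sketch, not part of this commit, of inspecting what such a browser will send (assumes a calibre >= 2.80 environment where calibre.utils.random_ua is importable):

    # Sketch only: peek at the randomized headers the new browser() uses.
    from calibre.utils.random_ua import accept_header_for_ua, random_user_agent

    ua = random_user_agent()                    # a randomly chosen UA string
    print('User-Agent:', ua)
    print('Accept:', accept_header_for_ua(ua))  # Accept header tailored to that UA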
@@ -29,13 +42,13 @@ def encode_query(**query):
     return urlencode(q).decode('utf-8')
 
 
-def parse(raw):
+def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 
 
-def query(br, url, key, dump_raw=None):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
     delta = monotonic() - last_visited[key]
-    if delta < 1 and delta > 0:
+    if delta < limit and delta > 0:
         time.sleep(delta)
     try:
         raw = br.open_novisit(url).read()
@@ -44,7 +57,7 @@ def query(br, url, key, dump_raw=None):
     if dump_raw is not None:
         with open(dump_raw, 'wb') as f:
             f.write(raw)
-    return parse(raw)
+    return parser(raw)
 
 
 def quote_term(x):
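
With the two hunks above, query() becomes a generic throttled fetcher: limit is the minimum spacing in seconds between hits under the same key, and parser lets a caller decode the response as something other than HTML, which is what the JSON-based wayback helper below relies on. The same pattern in isolation, as a rough standalone sketch (hypothetical names, not calibre code; the sleep here waits out the remainder of the rate window):

    # Sketch of the throttle-then-fetch-then-parse pattern.
    import json
    import time
    from collections import defaultdict

    _last = defaultdict(float)  # key -> timestamp of the last request

    def throttled_fetch(fetch, key, limit=1, parser=json.loads):
        elapsed = time.time() - _last[key]
        if 0 < elapsed < limit:
            time.sleep(limit - elapsed)  # respect the per-key rate limit
        try:
            return parser(fetch())  # fetch() returns the raw response bytes
        finally:
            _last[key] = time.time()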
@@ -67,6 +80,19 @@ def ddg_href(url):
     return url
 
 
+def wayback_machine_cached_url(url, br=None):
+    q = quote_term(url)
+    br = br or browser()
+    data = query(br, 'https://archive.org/wayback/available?url=' +
+                 q, 'wayback', parser=json.loads)
+    try:
+        closest = data['archived_snapshots']['closest']
+    except KeyError:
+        return
+    if closest['available']:
+        return closest['url']
+
+
 def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
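
wayback_machine_cached_url() above queries the public Wayback Machine availability endpoint, https://archive.org/wayback/available?url=<quoted-url>. The endpoint answers with JSON shaped like {"archived_snapshots": {"closest": {"available": true, "url": "..."}}}, and with an empty archived_snapshots object when nothing is cached, which is the case the KeyError branch handles. A dependency-free sketch of the same lookup, in the Python 2 style of the file above:

    # Sketch only: the same availability lookup using just the stdlib.
    import json
    from urllib import quote
    from urllib2 import urlopen

    def wayback_cached(url):
        q = quote(url, safe='')
        data = json.load(urlopen('https://archive.org/wayback/available?url=' + q))
        closest = data.get('archived_snapshots', {}).get('closest')
        if closest and closest.get('available'):
            return closest['url']  # a web.archive.org snapshot URL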
@@ -74,16 +100,23 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
     if site is not None:
         terms.append(quote_term(('site:' + site)))
     q = '+'.join(terms)
-    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(q=q, kp=1 if safe_search else -1)
+    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(
+        q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
     root = query(br, url, 'ddg', dump_raw)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), etree.tostring(a, encoding=unicode, method='text', with_tail=False), None))
+        ans.append(Result(ddg_href(a.get('href')), etree.tostring(
+            a, encoding=unicode, method='text', with_tail=False), None))
     return ans
 
 
 def ddg_develop():
-    from pprint import pprint
-    pprint(ddg_search('heroes abercrombie'.split(), 'amazon.com', dump_raw='/t/raw.html'))
+    br = browser()
+    for result in ddg_search('heroes abercrombie'.split(), 'amazon.com', dump_raw='/t/raw.html', br=br):
+        if '/dp/' in result.url:
+            print(result.title)
+            print(' ', result.url)
+            print(' ', wayback_machine_cached_url(result.url, br))
+            print()
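
Note also the rename of the Result field from text to title; callers now read result.title. The reworked ddg_develop() above ties the pieces together: one shared browser, a DuckDuckGo query restricted to amazon.com, and a Wayback lookup for each product (/dp/) URL. Driving the new API by hand looks roughly like this (sketch, assuming the module is importable):

    # Sketch only: manual use of the new API, mirroring ddg_develop().
    br = browser()
    for r in ddg_search(['heroes', 'abercrombie'], site='amazon.com', br=br):
        print(r.title)
        print(' ', r.url, '->', wayback_machine_cached_url(r.url, br))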