mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Code to get cached URLs from wayback machine
This commit is contained in:
parent
45c0fe8c54
commit
8c8c3dad3f
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
import time
|
import time
|
||||||
from collections import defaultdict, namedtuple
|
from collections import defaultdict, namedtuple
|
||||||
from future_builtins import map
|
from future_builtins import map
|
||||||
@ -13,15 +14,27 @@ from urlparse import parse_qs
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
from calibre import browser, prints
|
from calibre import browser as _browser, prints
|
||||||
from calibre.utils.monotonic import monotonic
|
from calibre.utils.monotonic import monotonic
|
||||||
|
from calibre.utils.random_ua import random_user_agent, accept_header_for_ua
|
||||||
|
|
||||||
current_version = (1, 0, 0)
|
current_version = (1, 0, 0)
|
||||||
minimum_calibre_version = (2, 80, 0)
|
minimum_calibre_version = (2, 80, 0)
|
||||||
|
|
||||||
|
|
||||||
# Per-key timestamp (monotonic clock) of the most recent request; used by
# query() to rate-limit successive hits to the same service ('ddg', 'wayback').
# defaultdict gives 0 for never-visited keys, so the first request never sleeps.
last_visited = defaultdict(lambda: 0)

# One search hit: the result URL, its human-readable title, and the URL of a
# cached copy of the page (None until filled in, e.g. via the Wayback Machine).
Result = namedtuple('Result', 'url title cached_url')
|
|
||||||
|
|
||||||
|
def browser():
    '''Create a mechanize browser with a randomized User-Agent and matching
    Accept headers, so repeated scraping requests look less uniform.'''
    user_agent = random_user_agent()
    ans = _browser(user_agent=user_agent)
    ans.set_handle_gzip(True)
    extra_headers = [
        ('Accept', accept_header_for_ua(user_agent)),
        ('Upgrade-insecure-requests', '1'),
    ]
    ans.addheaders += extra_headers
    return ans
|
|
||||||
|
|
||||||
def encode_query(**query):
|
def encode_query(**query):
|
||||||
@ -29,13 +42,13 @@ def encode_query(**query):
|
|||||||
return urlencode(q).decode('utf-8')
|
return urlencode(q).decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
def parse_html(raw):
    # html5lib is deliberately lenient: search-engine result pages are rarely
    # well-formed, and we want an lxml tree (for xpath) without namespaces.
    return html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
|
|
||||||
|
|
||||||
def query(br, url, key, dump_raw=None):
|
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
|
||||||
delta = monotonic() - last_visited[key]
|
delta = monotonic() - last_visited[key]
|
||||||
if delta < 1 and delta > 0:
|
if delta < limit and delta > 0:
|
||||||
time.sleep(delta)
|
time.sleep(delta)
|
||||||
try:
|
try:
|
||||||
raw = br.open_novisit(url).read()
|
raw = br.open_novisit(url).read()
|
||||||
@ -44,7 +57,7 @@ def query(br, url, key, dump_raw=None):
|
|||||||
if dump_raw is not None:
|
if dump_raw is not None:
|
||||||
with open(dump_raw, 'wb') as f:
|
with open(dump_raw, 'wb') as f:
|
||||||
f.write(raw)
|
f.write(raw)
|
||||||
return parse(raw)
|
return parser(raw)
|
||||||
|
|
||||||
|
|
||||||
def quote_term(x):
|
def quote_term(x):
|
||||||
@ -67,6 +80,19 @@ def ddg_href(url):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def wayback_machine_cached_url(url, br=None):
    '''Return the URL of the closest available Wayback Machine snapshot of
    *url*, or None if no snapshot exists (or the availability query failed).

    :param url: The page URL to look up
    :param br: Optional browser to reuse; a fresh one is created otherwise
    '''
    q = quote_term(url)
    br = br or browser()
    data = query(br, 'https://archive.org/wayback/available?url=' + q,
                 'wayback', parser=json.loads)
    try:
        closest = data['archived_snapshots']['closest']
    except (KeyError, TypeError):
        # KeyError: no snapshot recorded for this URL.
        # TypeError: query() swallowed a network error and returned None.
        return None
    # Use .get() — a partial API response must mean "not cached", not a crash
    if closest.get('available'):
        return closest.get('url')
    return None
|
|
||||||
|
|
||||||
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
|
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
|
||||||
# https://duck.co/help/results/syntax
|
# https://duck.co/help/results/syntax
|
||||||
terms = map(ddg_term, terms)
|
terms = map(ddg_term, terms)
|
||||||
@ -74,16 +100,23 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
|
|||||||
if site is not None:
|
if site is not None:
|
||||||
terms.append(quote_term(('site:' + site)))
|
terms.append(quote_term(('site:' + site)))
|
||||||
q = '+'.join(terms)
|
q = '+'.join(terms)
|
||||||
url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(q=q, kp=1 if safe_search else -1)
|
url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(
|
||||||
|
q=q, kp=1 if safe_search else -1)
|
||||||
log('Making ddg query: ' + url)
|
log('Making ddg query: ' + url)
|
||||||
br = br or browser()
|
br = br or browser()
|
||||||
root = query(br, url, 'ddg', dump_raw)
|
root = query(br, url, 'ddg', dump_raw)
|
||||||
ans = []
|
ans = []
|
||||||
for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
|
for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
|
||||||
ans.append(Result(ddg_href(a.get('href')), etree.tostring(a, encoding=unicode, method='text', with_tail=False), None))
|
ans.append(Result(ddg_href(a.get('href')), etree.tostring(
|
||||||
|
a, encoding=unicode, method='text', with_tail=False), None))
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def ddg_develop():
    # Manual smoke test: run a DuckDuckGo search restricted to amazon.com and
    # print each product page (/dp/ URLs) with its Wayback Machine cache URL.
    br = browser()
    results = ddg_search('heroes abercrombie'.split(), 'amazon.com',
                         dump_raw='/t/raw.html', br=br)
    for result in results:
        if '/dp/' not in result.url:
            continue
        print(result.title)
        print(' ', result.url)
        print(' ', wayback_machine_cached_url(result.url, br))
        print()
|
Loading…
x
Reference in New Issue
Block a user