mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Implement searching Amazon via the Wayback Machine
Disabled, as the Wayback Machine is really slow/flaky
This commit is contained in:
parent
6c4c14ceca
commit
d1ad4955a8
File diff suppressed because it is too large
Load Diff
@ -46,12 +46,12 @@ def parse_html(raw):
|
|||||||
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
|
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
|
||||||
|
|
||||||
|
|
||||||
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
|
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
|
||||||
delta = monotonic() - last_visited[key]
|
delta = monotonic() - last_visited[key]
|
||||||
if delta < limit and delta > 0:
|
if delta < limit and delta > 0:
|
||||||
time.sleep(delta)
|
time.sleep(delta)
|
||||||
try:
|
try:
|
||||||
raw = br.open_novisit(url).read()
|
raw = br.open_novisit(url, timeout=timeout).read()
|
||||||
finally:
|
finally:
|
||||||
last_visited[key] = monotonic()
|
last_visited[key] = monotonic()
|
||||||
if dump_raw is not None:
|
if dump_raw is not None:
|
||||||
@ -80,20 +80,29 @@ def ddg_href(url):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
def wayback_machine_cached_url(url, br=None):
|
def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
|
||||||
q = quote_term(url)
|
q = quote_term(url)
|
||||||
br = br or browser()
|
br = br or browser()
|
||||||
data = query(br, 'https://archive.org/wayback/available?url=' +
|
data = query(br, 'https://archive.org/wayback/available?url=' +
|
||||||
q, 'wayback', parser=json.loads, limit=0.25)
|
q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
closest = data['archived_snapshots']['closest']
|
closest = data['archived_snapshots']['closest']
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return
|
pass
|
||||||
if closest['available']:
|
else:
|
||||||
return closest['url']
|
if closest['available']:
|
||||||
|
return closest['url']
|
||||||
|
from pprint import pformat
|
||||||
|
log('Response from wayback machine:', pformat(data))
|
||||||
|
|
||||||
|
|
||||||
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
|
def wayback_url_processor(url):
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = 'https://web.archive.org' + url
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
|
||||||
# https://duck.co/help/results/syntax
|
# https://duck.co/help/results/syntax
|
||||||
terms = map(ddg_term, terms)
|
terms = map(ddg_term, terms)
|
||||||
terms = [quote_term(t) for t in terms]
|
terms = [quote_term(t) for t in terms]
|
||||||
@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
|
|||||||
q=q, kp=1 if safe_search else -1)
|
q=q, kp=1 if safe_search else -1)
|
||||||
log('Making ddg query: ' + url)
|
log('Making ddg query: ' + url)
|
||||||
br = br or browser()
|
br = br or browser()
|
||||||
root = query(br, url, 'ddg', dump_raw)
|
root = query(br, url, 'ddg', dump_raw, timeout=timeout)
|
||||||
ans = []
|
ans = []
|
||||||
for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
|
for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
|
||||||
ans.append(Result(ddg_href(a.get('href')), etree.tostring(
|
ans.append(Result(ddg_href(a.get('href')), etree.tostring(
|
||||||
|
@ -14,7 +14,6 @@ from threading import Thread
|
|||||||
import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
|
import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
|
||||||
from calibre import as_unicode, prints
|
from calibre import as_unicode, prints
|
||||||
from calibre.constants import DEBUG, numeric_version
|
from calibre.constants import DEBUG, numeric_version
|
||||||
from calibre.customize.ui import patch_metadata_plugins
|
|
||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
from calibre.utils.config import JSONConfig
|
from calibre.utils.config import JSONConfig
|
||||||
from calibre.utils.https import get_https_resource_securely
|
from calibre.utils.https import get_https_resource_securely
|
||||||
@ -59,6 +58,7 @@ def patch_search_engines(src):
|
|||||||
|
|
||||||
|
|
||||||
def patch_plugins():
|
def patch_plugins():
|
||||||
|
from calibre.customize.ui import patch_metadata_plugins
|
||||||
patches = {}
|
patches = {}
|
||||||
for name, val in cache.iteritems():
|
for name, val in cache.iteritems():
|
||||||
if name == 'hashes':
|
if name == 'hashes':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user