Implement searching Amazon via the Wayback Machine

Disabled, as the Wayback Machine is really slow/flaky
Kovid Goyal 2017-03-02 09:19:51 +05:30
parent 6c4c14ceca
commit d1ad4955a8
3 changed files with 447 additions and 335 deletions
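In outline, the new code path avoids hitting Amazon directly: it builds the usual search terms, runs a DuckDuckGo query restricted to the relevant Amazon storefront, and resolves each hit to a Wayback Machine cached copy before handing it to the detail-parsing workers. A minimal sketch of that flow, illustrative only; the real logic is Amazon.search_search_engine() in the diff below, and the search and cached_url callables stand in for calibre's search_engines helpers:

def find_cached_amazon_matches(terms, domain, search, cached_url, timeout=60):
    # Restrict the web search to the relevant Amazon storefront.
    site = 'www.amazon.%s' % domain
    matches = []
    for result in search(terms, site, timeout=timeout):
        # Only product ("/dp/") pages are book candidates.
        if '/dp/' not in result.url:
            continue
        # Prefer a cached snapshot so Amazon itself is never queried.
        url = result.cached_url or cached_url(result.url, timeout=timeout)
        if url is not None and url not in matches:
            matches.append(url)
        if len(matches) >= 3:  # same cap as the regular Amazon search
            break
    return matches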

src/calibre/ebooks/metadata/sources/amazon.py

@ -1,24 +1,22 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import socket, time, re
import re
import socket
import time
from Queue import Empty, Queue
from threading import Thread
from Queue import Queue, Empty
from urlparse import urlparse
from calibre import as_unicode, browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
fixauthors)
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
from calibre.ebooks.metadata.sources.update import search_engines_module
from calibre.utils.localization import canonicalize_lang
from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
from calibre.utils.random_ua import accept_header_for_ua, all_user_agents
class CaptchaError(Exception):
@ -30,6 +28,7 @@ class SearchFailed(ValueError):
ua_index = -1
USE_SEARCH_ENGINE = False
def parse_details_page(url, log, timeout, browser, domain):
@ -37,6 +36,7 @@ def parse_details_page(url, log, timeout, browser, domain):
from calibre.ebooks.chardet import xml_to_unicode
import html5lib
from lxml.html import tostring
log('Getting details from:', url)
try:
raw = browser.open_novisit(url, timeout=timeout).read().strip()
except Exception as e:
@ -56,8 +56,10 @@ def parse_details_page(url, log, timeout, browser, domain):
oraw = raw
if 'amazon.com.br' in url:
raw = raw.decode('utf-8') # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
# amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
raw = raw.decode('utf-8')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
if '<title>404 - ' in raw:
log.error('URL malformed: %r' % url)
return
@ -104,8 +106,9 @@ class Worker(Thread): # Get details {{{
'''
def __init__(self, url, result_queue, browser, log, relevance, domain,
plugin, timeout=20, testing=False, preparsed_root=None):
plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
Thread.__init__(self)
self.cover_url_processor = cover_url_processor
self.preparsed_root = preparsed_root
self.daemon = True
self.testing = testing
@ -230,7 +233,8 @@ class Worker(Thread): # Get details {{{
starts-with(text(), "Uitgever:") or \
starts-with(text(), "出版社:")]
'''
self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
self.language_xpath = '''
descendant::*[
@ -244,7 +248,8 @@ class Worker(Thread): # Get details {{{
or starts-with(text(), "语种")
]
'''
self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
self.language_names = {'Language', 'Sprache',
'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
self.tags_xpath = '''
descendant::h2[
@ -308,7 +313,8 @@ class Worker(Thread): # Get details {{{
def get_details(self):
if self.preparsed_root is None:
raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
raw, root, selector = parse_details_page(
self.url, self.log, self.timeout, self.browser, self.domain)
else:
raw, root, selector = self.preparsed_root
@ -319,9 +325,11 @@ class Worker(Thread): # Get details {{{
def parse_details(self, raw, root):
asin = parse_asin(root, self.log, self.url)
if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
raise CaptchaError(
'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
if self.testing:
import tempfile, uuid
import tempfile
import uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
suffix='.html', delete=False) as f:
f.write(raw)
@ -340,7 +348,8 @@ class Worker(Thread): # Get details {{{
authors = []
if not title or not authors or not asin:
self.log.error('Could not find title/authors/asin for %r'%self.url)
self.log.error(
'Could not find title/authors/asin for %r' % self.url)
self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
authors))
return
@ -378,15 +387,19 @@ class Worker(Thread): # Get details {{{
self.cover_url = self.parse_cover(root, raw)
except:
self.log.exception('Error parsing cover for url: %r' % self.url)
if self.cover_url_processor is not None and self.cover_url.startswith('/'):
self.cover_url = self.cover_url_processor(self.cover_url)
mi.has_cover = bool(self.cover_url)
non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
non_hero = tuple(self.selector(
'div#bookDetails_container_div div#nonHeroSection'))
if non_hero:
# New style markup
try:
self.parse_new_details(root, mi, non_hero[0])
except:
self.log.exception('Failed to parse new-style book details section')
self.log.exception(
'Failed to parse new-style book details section')
else:
pd = root.xpath(self.pd_xpath)
if pd:
@ -397,27 +410,32 @@ class Worker(Thread): # Get details {{{
if isbn:
self.isbn = mi.isbn = isbn
except:
self.log.exception('Error parsing ISBN for url: %r'%self.url)
self.log.exception(
'Error parsing ISBN for url: %r' % self.url)
try:
mi.publisher = self.parse_publisher(pd)
except:
self.log.exception('Error parsing publisher for url: %r'%self.url)
self.log.exception(
'Error parsing publisher for url: %r' % self.url)
try:
mi.pubdate = self.parse_pubdate(pd)
except:
self.log.exception('Error parsing publish date for url: %r'%self.url)
self.log.exception(
'Error parsing publish date for url: %r' % self.url)
try:
lang = self.parse_language(pd)
if lang:
mi.language = lang
except:
self.log.exception('Error parsing language for url: %r'%self.url)
self.log.exception(
'Error parsing language for url: %r' % self.url)
else:
self.log.warning('Failed to find product description for url: %r'%self.url)
self.log.warning(
'Failed to find product description for url: %r' % self.url)
mi.source_relevance = self.relevance
@ -448,7 +466,8 @@ class Worker(Thread): # Get details {{{
title = self.tostring(actual_title[0], encoding=unicode,
method='text').strip()
else:
title = self.tostring(tdiv, encoding=unicode, method='text').strip()
title = self.tostring(tdiv, encoding=unicode,
method='text').strip()
ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
if not ans:
ans = title.rpartition('[')[0].strip()
@ -540,7 +559,8 @@ class Worker(Thread): # Get details {{{
if len(ns) == 0 and ns.text:
import html5lib
# html5lib parsed noscript as CDATA
ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
ns = html5lib.parseFragment(
'<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
else:
ns.tag = 'div'
ans = self._render_comments(ns)
@ -549,7 +569,8 @@ class Worker(Thread): # Get details {{{
if desc:
ans = self._render_comments(desc[0])
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
desc = root.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc:
ans += self._render_comments(desc[0])
else:
@ -559,12 +580,15 @@ class Worker(Thread): # Get details {{{
if m is not None:
try:
text = unquote(m.group(1)).decode('utf-8')
nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
nr = html5lib.parse(
text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc:
ans += self._render_comments(desc[0])
except Exception as e:
self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
self.log.warn(
'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
return ans
@ -577,13 +601,15 @@ class Worker(Thread): # Get details {{{
series = series[0]
spans = series.xpath('./span')
if spans:
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
raw = self.tostring(
spans[0], encoding=unicode, method='text', with_tail=False).strip()
m = re.search('\s+([0-9.]+)$', raw.strip())
if m is not None:
series_index = float(m.group(1))
s = series.xpath('./a[@id="series-page-link"]')
if s:
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
series = self.tostring(
s[0], encoding=unicode, method='text', with_tail=False).strip()
if series:
ans = (series, series_index)
# This is found on Kindle edition pages on amazon.com
@ -595,7 +621,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1))
a = span.xpath('./a[@href]')
if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).strip()
if series:
ans = (series, series_index)
# This is found on newer Kindle edition pages on amazon.com
@ -607,7 +634,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1))
a = b.getparent().xpath('./a[@href]')
if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
if series:
ans = series, series_index
@ -629,12 +657,14 @@ class Worker(Thread): # Get details {{{
def parse_tags(self, root):
ans = []
exclude_tokens = {'kindle', 'a-z'}
exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'}
exclude = {'special features', 'by authors',
'authors & illustrators', 'books', 'new; used & rental textbooks'}
seen = set()
for li in root.xpath(self.tags_xpath):
for i, a in enumerate(li.iterdescendants('a')):
if i > 0:
# we ignore the first category since it is almost always too broad
# we ignore the first category since it is almost always
# too broad
raw = (a.text or '').strip().replace(',', ';')
lraw = icu_lower(raw)
tokens = frozenset(lraw.split())
@ -674,12 +704,14 @@ class Worker(Thread): # Get details {{{
if url:
return url
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
imgs = root.xpath(
'//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
if not imgs:
imgs = (
root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
root.xpath(
'//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
)
for img in imgs:
try:
@ -887,7 +919,8 @@ class Amazon(Source):
return 'https://www.amazon.%s/' % domain
def _get_book_url(self, identifiers): # {{{
domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
domain, asin = self.get_domain_and_asin(
identifiers, extra_domains=('in', 'au', 'ca'))
if domain and asin:
url = None
r = self.referrer_for_domain(domain)
@ -955,7 +988,7 @@ class Amazon(Source):
return udomain
def create_query(self, log, title=None, authors=None, identifiers={}, # {{{
domain=None):
domain=None, for_amazon=True):
from urllib import urlencode
if domain is None:
domain = self.domain
@ -965,6 +998,7 @@ class Amazon(Source):
domain = idomain
# See the amazon detailed search page to get all options
terms = []
q = {'search-alias': 'aps',
'unfiltered': '1',
}
@ -978,26 +1012,34 @@ class Amazon(Source):
if asin is not None:
q['field-keywords'] = asin
terms.append(asin)
elif isbn is not None:
q['field-isbn'] = isbn
terms.append(isbn)
else:
# Only return book results
q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
q['search-alias'] = {'br': 'digital-text',
'nl': 'aps'}.get(domain, 'stripbooks')
if title:
title_tokens = list(self.get_title_tokens(title))
if title_tokens:
q['field-title'] = ' '.join(title_tokens)
terms.extend(title_tokens)
if authors:
author_tokens = self.get_author_tokens(authors,
only_first_author=True)
if author_tokens:
q['field-author'] = ' '.join(author_tokens)
terms.extend(author_tokens)
if not ('field-keywords' in q or 'field-isbn' in q or
('field-title' in q)):
# Insufficient metadata to make an identify query
return None, None
if not for_amazon:
return terms, domain
# magic parameter to enable Japanese Shift_JIS encoding.
if domain == 'jp':
q['__mk_ja_JP'] = u'カタカナ'
@ -1018,7 +1060,8 @@ class Amazon(Source):
encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
'ignore')) for x, y in
q.iteritems()])
url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q)
url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
domain) + urlencode(encoded_q)
return url, domain
# }}}
@ -1043,7 +1086,8 @@ class Amazon(Source):
def title_ok(title):
title = title.lower()
bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
bad = ['bulk pack', '[audiobook]', '[audio cd]',
'(a book companion)', '( slipcase with door )', ': free sampler']
if self.domain == 'com':
bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
for x in bad:
@ -1059,7 +1103,8 @@ class Amazon(Source):
if title_ok(title):
url = a.get('href')
if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url)
if not matches:
@ -1074,7 +1119,8 @@ class Amazon(Source):
if title_ok(title):
url = a.get('href')
if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url)
break
@ -1088,7 +1134,8 @@ class Amazon(Source):
if title_ok(title):
url = a.get('href')
if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url)
break
if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
@ -1101,7 +1148,7 @@ class Amazon(Source):
return matches[:3]
# }}}
def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
import html5lib
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
@ -1152,10 +1199,42 @@ class Amazon(Source):
matches = self.parse_results_page(root, domain)
return matches, query, domain
return matches, query, domain, None
# }}}
def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout): # {{{
terms, domain = self.create_query(log, title=title, authors=authors,
identifiers=identifiers, for_amazon=False)
site = self.referrer_for_domain(
domain)[len('https://'):].partition('/')[0]
se = search_engines_module()
matches = []
for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
if abort.is_set():
return matches, terms, domain, None
purl = urlparse(result.url)
if '/dp/' in purl.path and site in purl.netloc:
url = result.cached_url
if url is None:
url = se.wayback_machine_cached_url(
result.url, br, timeout=timeout)
if url is None:
log('Failed to find cached page for:', result.url)
continue
if url not in matches:
matches.append(url)
if len(matches) >= 3:
break
else:
log('Skipping non-book result:', result)
if not matches:
log('No search engine results for terms:', ' '.join(terms))
return matches, terms, domain, se.wayback_url_processor
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
identifiers={}, timeout=60):
'''
Note this method will retry without identifiers automatically if no
match is found with identifiers.
@ -1165,23 +1244,38 @@ class Amazon(Source):
udata = self._get_book_url(identifiers)
br = self.browser
log('User-agent:', br.current_user_agent())
if testing:
print('User-agent:', br.current_user_agent())
if udata is not None:
# Try to directly get details page instead of running a search
domain, idtype, asin, durl = udata
preparsed_root = parse_details_page(durl, log, timeout, br, domain)
cover_url_processor = None
if USE_SEARCH_ENGINE:
se = search_engines_module()
durl = se.wayback_machine_cached_url(
durl, br, timeout=timeout, log=log)
cover_url_processor = se.wayback_url_processor
if durl is None:
log('Failed to get cached URL for asin:', asin)
else:
preparsed_root = parse_details_page(
durl, log, timeout, br, domain)
if preparsed_root is not None:
qasin = parse_asin(preparsed_root[1], log, durl)
if qasin == asin:
w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
w = Worker(durl, result_queue, br, log, 0, domain,
self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
try:
w.get_details()
return
except Exception:
log.exception('get_details failed for url: %r'%durl)
log.exception(
'get_details failed for url: %r' % durl)
func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
try:
matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
matches, query, domain, cover_url_processor = func(
br, testing, log, abort, title, authors, identifiers, timeout)
except SearchFailed:
return
@ -1198,8 +1292,8 @@ class Amazon(Source):
log.error('No matches found with query: %r' % query)
return
workers = [Worker(url, result_queue, br, log, i, domain, self,
testing=testing) for i, url in enumerate(matches)]
workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]
for w in workers:
# Don't send all requests at the same time
@ -1223,7 +1317,7 @@ class Amazon(Source):
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
@ -1255,7 +1349,8 @@ class Amazon(Source):
log('Downloading cover from:', cached_url)
try:
time.sleep(1)
cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
cdata = self.browser.open_novisit(
cached_url, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
@ -1263,29 +1358,34 @@ class Amazon(Source):
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
# To run these test use: calibre-debug
# src/calibre/ebooks/metadata/sources/amazon.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{
( # Paperback with series
{'identifiers': {'amazon': '1423146786'}},
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
exact=True), series_test('Heroes of Olympus', 5)]
),
( # Kindle edition with series
{'identifiers': {'amazon': 'B0085UEQDO'}},
[title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
[title_test('Three Parts Dead', exact=True),
series_test('Craft Sequence', 1)]
),
( # A kindle edition that does not appear in the search results when searching by ASIN
{'identifiers': {'amazon': 'B004JHY6OG'}},
[title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)]
[title_test(
'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
),
( # + in title and uses id="main-image" for cover
{'identifiers': {'amazon': '1933988770'}},
[title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)]
[title_test(
'C++ Concurrency in Action: Practical Multithreading', exact=True)]
),
@ -1426,7 +1526,8 @@ if __name__ == '__main__': # tests {{{
cn_tests = [ # {{{
(
{'identifiers': {'isbn': '9787115369512'}},
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])]
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
authors_test(['[美]sam Williams', '邓楠,李凡希'])]
),
(
{'title': '爱上Raspberry Pi'},
@ -1440,12 +1541,14 @@ if __name__ == '__main__': # tests {{{
ca_tests = [ # {{{
( # Paperback with series
{'identifiers': {'isbn': '9781623808747'}},
[title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])]
[title_test('Parting Shot', exact=True),
authors_test(['Mary Calmes'])]
),
( # # in title
{'title': 'Expert C# 2008 Business Objects',
'authors': ['Lhotka']},
[title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])]
[title_test('Expert C# 2008 Business Objects'),
authors_test(['Rockford Lhotka'])]
),
( # noscript description
{'identifiers': {'amazon_ca': '162380874X'}},
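One wrinkle with parsing detail pages served from the Wayback Machine: cover images on a cached page are referenced by root-relative paths on web.archive.org, so the Worker now takes a cover_url_processor callback and applies it to any cover URL that starts with '/'. A minimal sketch of that re-rooting step, mirroring wayback_url_processor from the search_engines changes below (the example URL is hypothetical):

def wayback_url_processor(url):
    # Cached snapshots reference assets as /web/<timestamp>/<original-url>;
    # prefix the archive host so the cover can actually be downloaded.
    if url.startswith('/'):
        url = 'https://web.archive.org' + url
    return url

# e.g. wayback_url_processor('/web/20170302/https://images-na.ssl-images-amazon.com/cover.jpg')
#  ->  'https://web.archive.org/web/20170302/https://images-na.ssl-images-amazon.com/cover.jpg'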

src/calibre/ebooks/metadata/sources/search_engines.py

@ -46,12 +46,12 @@ def parse_html(raw):
return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
delta = monotonic() - last_visited[key]
if delta < limit and delta > 0:
time.sleep(delta)
try:
raw = br.open_novisit(url).read()
raw = br.open_novisit(url, timeout=timeout).read()
finally:
last_visited[key] = monotonic()
if dump_raw is not None:
@ -80,20 +80,29 @@ def ddg_href(url):
return url
def wayback_machine_cached_url(url, br=None):
def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
q = quote_term(url)
br = br or browser()
data = query(br, 'https://archive.org/wayback/available?url=' +
q, 'wayback', parser=json.loads, limit=0.25)
q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
try:
closest = data['archived_snapshots']['closest']
except KeyError:
return
pass
else:
if closest['available']:
return closest['url']
from pprint import pformat
log('Response from wayback machine:', pformat(data))
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
def wayback_url_processor(url):
if url.startswith('/'):
url = 'https://web.archive.org' + url
return url
def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
# https://duck.co/help/results/syntax
terms = map(ddg_term, terms)
terms = [quote_term(t) for t in terms]
@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
q=q, kp=1 if safe_search else -1)
log('Making ddg query: ' + url)
br = br or browser()
root = query(br, url, 'ddg', dump_raw)
root = query(br, url, 'ddg', dump_raw, timeout=timeout)
ans = []
for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
ans.append(Result(ddg_href(a.get('href')), etree.tostring(
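wayback_machine_cached_url() above is a thin wrapper over the Wayback Machine availability API: it asks archive.org for the snapshot closest to the given URL and returns its location when one is available. A standalone sketch of that lookup using only the standard library (same endpoint and JSON shape as the query above; the shared browser object, rate limiting and logging are omitted):

import json
try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2, which this plugin targets
    from urllib import quote
    from urllib2 import urlopen

def cached_snapshot(url, timeout=60):
    q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    data = json.loads(urlopen(q, timeout=timeout).read().decode('utf-8'))
    closest = data.get('archived_snapshots', {}).get('closest')
    if closest and closest.get('available'):
        return closest['url']  # e.g. http://web.archive.org/web/<timestamp>/<url>
    return None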

src/calibre/ebooks/metadata/sources/update.py

@ -14,7 +14,6 @@ from threading import Thread
import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
from calibre import as_unicode, prints
from calibre.constants import DEBUG, numeric_version
from calibre.customize.ui import patch_metadata_plugins
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.config import JSONConfig
from calibre.utils.https import get_https_resource_securely
@ -59,6 +58,7 @@ def patch_search_engines(src):
def patch_plugins():
from calibre.customize.ui import patch_metadata_plugins
patches = {}
for name, val in cache.iteritems():
if name == 'hashes':