Implement searching Amazon via the Wayback Machine

Disabled, as the Wayback Machine is really slow/flaky.
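The new code path (guarded by a module-level USE_SEARCH_ENGINE flag, off by default) avoids querying Amazon directly: it builds the search terms, runs a DuckDuckGo query restricted to the relevant Amazon site, keeps only /dp/ product URLs, and fetches each page through the Wayback Machine's cached copy. The heart of that is the Wayback Machine "availability" lookup; the following is a minimal sketch of it using only the standard library rather than calibre's browser helpers (the ASIN in the example is taken from the tests below and is purely illustrative).

import json

try:
    from urllib import quote      # Python 2, matching the rest of this code
    from urllib2 import urlopen
except ImportError:
    from urllib.parse import quote
    from urllib.request import urlopen


def wayback_cached_copy(url, timeout=60):
    # Ask the Wayback Machine "availability" API for the closest snapshot
    # of the given URL; return the snapshot URL or None if nothing is archived.
    api = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    data = json.loads(urlopen(api, timeout=timeout).read().decode('utf-8'))
    closest = data.get('archived_snapshots', {}).get('closest')
    if closest and closest.get('available'):
        return closest['url']
    return None


if __name__ == '__main__':
    print(wayback_cached_copy('https://www.amazon.com/dp/1423146786'))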
Kovid Goyal, 2017-03-02 09:19:51 +05:30
commit d1ad4955a8 (parent 6c4c14ceca)
3 changed files with 447 additions and 335 deletions

Changed file: src/calibre/ebooks/metadata/sources/amazon.py

@@ -1,24 +1,22 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
-
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import socket, time, re
-from threading import Thread
-from Queue import Queue, Empty
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+import socket
+import time
+from Queue import Empty, Queue
+from threading import Thread
+from urlparse import urlparse
 
 from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
-        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
+from calibre.ebooks.metadata.sources.update import search_engines_module
 from calibre.utils.localization import canonicalize_lang
-from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, all_user_agents
 
 
 class CaptchaError(Exception):
@@ -30,6 +28,7 @@ class SearchFailed(ValueError):
 ua_index = -1
+USE_SEARCH_ENGINE = False
 
 
 def parse_details_page(url, log, timeout, browser, domain):
@@ -37,12 +36,13 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.ebooks.chardet import xml_to_unicode
     import html5lib
     from lxml.html import tostring
+    log('Getting details from:', url)
     try:
         raw = browser.open_novisit(url, timeout=timeout).read().strip()
     except Exception as e:
         if callable(getattr(e, 'getcode', None)) and \
                 e.getcode() == 404:
-            log.error('URL malformed: %r'%url)
+            log.error('URL malformed: %r' % url)
             return
         attr = getattr(e, 'args', [None])
         attr = attr if attr else [None]
@ -50,35 +50,37 @@ def parse_details_page(url, log, timeout, browser, domain):
msg = 'Amazon timed out. Try again later.' msg = 'Amazon timed out. Try again later.'
log.error(msg) log.error(msg)
else: else:
msg = 'Failed to make details query: %r'%url msg = 'Failed to make details query: %r' % url
log.exception(msg) log.exception(msg)
return return
oraw = raw oraw = raw
if 'amazon.com.br' in url: if 'amazon.com.br' in url:
raw = raw.decode('utf-8') # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] raw = raw.decode('utf-8')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
log.error('URL malformed: %r'%url) log.error('URL malformed: %r' % url)
return return
try: try:
root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
namespaceHTMLElements=False) namespaceHTMLElements=False)
except: except:
msg = 'Failed to parse amazon details page: %r'%url msg = 'Failed to parse amazon details page: %r' % url
log.exception(msg) log.exception(msg)
return return
if domain == 'jp': if domain == 'jp':
for a in root.xpath('//a[@href]'): for a in root.xpath('//a[@href]'):
if 'black-curtain-redirect.html' in a.get('href'): if 'black-curtain-redirect.html' in a.get('href'):
url = 'https://amazon.co.jp'+a.get('href') url = 'https://amazon.co.jp' + a.get('href')
log('Black curtain redirect found, following') log('Black curtain redirect found, following')
return parse_details_page(url, log, timeout, browser, domain) return parse_details_page(url, log, timeout, browser, domain)
errmsg = root.xpath('//*[@id="errorMessage"]') errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg: if errmsg:
msg = 'Failed to parse amazon details page: %r'%url msg = 'Failed to parse amazon details page: %r' % url
msg += tostring(errmsg, method='text', encoding=unicode).strip() msg += tostring(errmsg, method='text', encoding=unicode).strip()
log.error(msg) log.error(msg)
return return
@ -94,7 +96,7 @@ def parse_asin(root, log, url):
for l in link: for l in link:
return l.get('href').rpartition('/')[-1] return l.get('href').rpartition('/')[-1]
except Exception: except Exception:
log.exception('Error parsing ASIN for url: %r'%url) log.exception('Error parsing ASIN for url: %r' % url)
class Worker(Thread): # Get details {{{ class Worker(Thread): # Get details {{{
@@ -104,8 +106,9 @@ class Worker(Thread): # Get details {{{
     '''
 
     def __init__(self, url, result_queue, browser, log, relevance, domain,
-            plugin, timeout=20, testing=False, preparsed_root=None):
+            plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
         Thread.__init__(self)
+        self.cover_url_processor = cover_url_processor
         self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing
@ -230,7 +233,8 @@ class Worker(Thread): # Get details {{{
starts-with(text(), "Uitgever:") or \ starts-with(text(), "Uitgever:") or \
starts-with(text(), "出版社:")] starts-with(text(), "出版社:")]
''' '''
self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'} self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
self.language_xpath = ''' self.language_xpath = '''
descendant::*[ descendant::*[
@ -244,7 +248,8 @@ class Worker(Thread): # Get details {{{
or starts-with(text(), "语种") or starts-with(text(), "语种")
] ]
''' '''
self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'} self.language_names = {'Language', 'Sprache',
'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
self.tags_xpath = ''' self.tags_xpath = '''
descendant::h2[ descendant::h2[
@@ -290,7 +295,7 @@ class Worker(Thread): # Get details {{{
 
     def delocalize_datestr(self, raw):
         if self.domain == 'cn':
-            return raw.replace('年','-').replace('月','-').replace('日','')
+            return raw.replace('年', '-').replace('月', '-').replace('日', '')
         if not self.months:
             return raw
         ans = raw.lower()
@ -304,11 +309,12 @@ class Worker(Thread): # Get details {{{
try: try:
self.get_details() self.get_details()
except: except:
self.log.exception('get_details failed for url: %r'%self.url) self.log.exception('get_details failed for url: %r' % self.url)
def get_details(self): def get_details(self):
if self.preparsed_root is None: if self.preparsed_root is None:
raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain) raw, root, selector = parse_details_page(
self.url, self.log, self.timeout, self.browser, self.domain)
else: else:
raw, root, selector = self.preparsed_root raw, root, selector = self.preparsed_root
@ -319,10 +325,12 @@ class Worker(Thread): # Get details {{{
def parse_details(self, raw, root): def parse_details(self, raw, root):
asin = parse_asin(root, self.log, self.url) asin = parse_asin(root, self.log, self.url)
if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'): if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.') raise CaptchaError(
'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
if self.testing: if self.testing:
import tempfile, uuid import tempfile
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_', import uuid
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
suffix='.html', delete=False) as f: suffix='.html', delete=False) as f:
f.write(raw) f.write(raw)
print ('Downloaded html for', asin, 'saved in', f.name) print ('Downloaded html for', asin, 'saved in', f.name)
@ -330,35 +338,36 @@ class Worker(Thread): # Get details {{{
try: try:
title = self.parse_title(root) title = self.parse_title(root)
except: except:
self.log.exception('Error parsing title for url: %r'%self.url) self.log.exception('Error parsing title for url: %r' % self.url)
title = None title = None
try: try:
authors = self.parse_authors(root) authors = self.parse_authors(root)
except: except:
self.log.exception('Error parsing authors for url: %r'%self.url) self.log.exception('Error parsing authors for url: %r' % self.url)
authors = [] authors = []
if not title or not authors or not asin: if not title or not authors or not asin:
self.log.error('Could not find title/authors/asin for %r'%self.url) self.log.error(
self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, 'Could not find title/authors/asin for %r' % self.url)
self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
authors)) authors))
return return
mi = Metadata(title, authors) mi = Metadata(title, authors)
idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
mi.set_identifier(idtype, asin) mi.set_identifier(idtype, asin)
self.amazon_id = asin self.amazon_id = asin
try: try:
mi.rating = self.parse_rating(root) mi.rating = self.parse_rating(root)
except: except:
self.log.exception('Error parsing ratings for url: %r'%self.url) self.log.exception('Error parsing ratings for url: %r' % self.url)
try: try:
mi.comments = self.parse_comments(root, raw) mi.comments = self.parse_comments(root, raw)
except: except:
self.log.exception('Error parsing comments for url: %r'%self.url) self.log.exception('Error parsing comments for url: %r' % self.url)
try: try:
series, series_index = self.parse_series(root) series, series_index = self.parse_series(root)
@@ -367,26 +376,30 @@ class Worker(Thread): # Get details {{{
             elif self.testing:
                 mi.series, mi.series_index = 'Dummy series for testing', 1
         except:
-            self.log.exception('Error parsing series for url: %r'%self.url)
+            self.log.exception('Error parsing series for url: %r' % self.url)
 
         try:
             mi.tags = self.parse_tags(root)
         except:
-            self.log.exception('Error parsing tags for url: %r'%self.url)
+            self.log.exception('Error parsing tags for url: %r' % self.url)
 
         try:
             self.cover_url = self.parse_cover(root, raw)
         except:
-            self.log.exception('Error parsing cover for url: %r'%self.url)
+            self.log.exception('Error parsing cover for url: %r' % self.url)
+        if self.cover_url_processor is not None and self.cover_url.startswith('/'):
+            self.cover_url = self.cover_url_processor(self.cover_url)
         mi.has_cover = bool(self.cover_url)
 
-        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
+        non_hero = tuple(self.selector(
+            'div#bookDetails_container_div div#nonHeroSection'))
         if non_hero:
             # New style markup
             try:
                 self.parse_new_details(root, mi, non_hero[0])
             except:
-                self.log.exception('Failed to parse new-style book details section')
+                self.log.exception(
+                    'Failed to parse new-style book details section')
         else:
             pd = root.xpath(self.pd_xpath)
             if pd:
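The cover_url_processor hook used above exists because pages served from the Wayback Machine reference images with root-relative paths, which have to be resolved against the archive host. A small sketch of how such a processor behaves; it mirrors the wayback_url_processor helper added to search_engines.py later in this commit, and the example image path is made up:

def wayback_url_processor(url):
    # Root-relative URLs taken from a cached page are resolved against
    # web.archive.org; absolute URLs are returned unchanged.
    if url.startswith('/'):
        url = 'https://web.archive.org' + url
    return url


# Hypothetical cover URL scraped from a cached Amazon product page
print(wayback_url_processor('/web/20170302000000/https://www.amazon.com/images/I/cover.jpg'))
# -> https://web.archive.org/web/20170302000000/https://www.amazon.com/images/I/cover.jpg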
@ -397,27 +410,32 @@ class Worker(Thread): # Get details {{{
if isbn: if isbn:
self.isbn = mi.isbn = isbn self.isbn = mi.isbn = isbn
except: except:
self.log.exception('Error parsing ISBN for url: %r'%self.url) self.log.exception(
'Error parsing ISBN for url: %r' % self.url)
try: try:
mi.publisher = self.parse_publisher(pd) mi.publisher = self.parse_publisher(pd)
except: except:
self.log.exception('Error parsing publisher for url: %r'%self.url) self.log.exception(
'Error parsing publisher for url: %r' % self.url)
try: try:
mi.pubdate = self.parse_pubdate(pd) mi.pubdate = self.parse_pubdate(pd)
except: except:
self.log.exception('Error parsing publish date for url: %r'%self.url) self.log.exception(
'Error parsing publish date for url: %r' % self.url)
try: try:
lang = self.parse_language(pd) lang = self.parse_language(pd)
if lang: if lang:
mi.language = lang mi.language = lang
except: except:
self.log.exception('Error parsing language for url: %r'%self.url) self.log.exception(
'Error parsing language for url: %r' % self.url)
else: else:
self.log.warning('Failed to find product description for url: %r'%self.url) self.log.warning(
'Failed to find product description for url: %r' % self.url)
mi.source_relevance = self.relevance mi.source_relevance = self.relevance
@ -448,7 +466,8 @@ class Worker(Thread): # Get details {{{
title = self.tostring(actual_title[0], encoding=unicode, title = self.tostring(actual_title[0], encoding=unicode,
method='text').strip() method='text').strip()
else: else:
title = self.tostring(tdiv, encoding=unicode, method='text').strip() title = self.tostring(tdiv, encoding=unicode,
method='text').strip()
ans = re.sub(r'[(\[].*[)\]]', '', title).strip() ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
if not ans: if not ans:
ans = title.rpartition('[')[0].strip() ans = title.rpartition('[')[0].strip()
@ -500,7 +519,7 @@ class Worker(Thread): # Get details {{{
else: else:
m = self.ratings_pat.match(t) m = self.ratings_pat.match(t)
if m is not None: if m is not None:
return float(m.group(1))/float(m.group(3)) * 5 return float(m.group(1)) / float(m.group(3)) * 5
def _render_comments(self, desc): def _render_comments(self, desc):
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
@ -540,7 +559,8 @@ class Worker(Thread): # Get details {{{
if len(ns) == 0 and ns.text: if len(ns) == 0 and ns.text:
import html5lib import html5lib
# html5lib parsed noscript as CDATA # html5lib parsed noscript as CDATA
ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0] ns = html5lib.parseFragment(
'<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
else: else:
ns.tag = 'div' ns.tag = 'div'
ans = self._render_comments(ns) ans = self._render_comments(ns)
@ -549,7 +569,8 @@ class Worker(Thread): # Get details {{{
if desc: if desc:
ans = self._render_comments(desc[0]) ans = self._render_comments(desc[0])
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') desc = root.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
else: else:
@ -559,12 +580,15 @@ class Worker(Thread): # Get details {{{
if m is not None: if m is not None:
try: try:
text = unquote(m.group(1)).decode('utf-8') text = unquote(m.group(1)).decode('utf-8')
nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False) nr = html5lib.parse(
desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]') text, treebuilder='lxml', namespaceHTMLElements=False)
desc = nr.xpath(
'//div[@id="productDescription"]/*[@class="content"]')
if desc: if desc:
ans += self._render_comments(desc[0]) ans += self._render_comments(desc[0])
except Exception as e: except Exception as e:
self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e)) self.log.warn(
'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
return ans return ans
@ -577,13 +601,15 @@ class Worker(Thread): # Get details {{{
series = series[0] series = series[0]
spans = series.xpath('./span') spans = series.xpath('./span')
if spans: if spans:
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip() raw = self.tostring(
spans[0], encoding=unicode, method='text', with_tail=False).strip()
m = re.search('\s+([0-9.]+)$', raw.strip()) m = re.search('\s+([0-9.]+)$', raw.strip())
if m is not None: if m is not None:
series_index = float(m.group(1)) series_index = float(m.group(1))
s = series.xpath('./a[@id="series-page-link"]') s = series.xpath('./a[@id="series-page-link"]')
if s: if s:
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip() series = self.tostring(
s[0], encoding=unicode, method='text', with_tail=False).strip()
if series: if series:
ans = (series, series_index) ans = (series, series_index)
# This is found on Kindle edition pages on amazon.com # This is found on Kindle edition pages on amazon.com
@ -595,7 +621,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1)) series_index = float(m.group(1))
a = span.xpath('./a[@href]') a = span.xpath('./a[@href]')
if a: if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip() series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).strip()
if series: if series:
ans = (series, series_index) ans = (series, series_index)
# This is found on newer Kindle edition pages on amazon.com # This is found on newer Kindle edition pages on amazon.com
@ -607,7 +634,8 @@ class Worker(Thread): # Get details {{{
series_index = float(m.group(1)) series_index = float(m.group(1))
a = b.getparent().xpath('./a[@href]') a = b.getparent().xpath('./a[@href]')
if a: if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip() series = self.tostring(
a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
if series: if series:
ans = series, series_index ans = series, series_index
@ -629,12 +657,14 @@ class Worker(Thread): # Get details {{{
def parse_tags(self, root): def parse_tags(self, root):
ans = [] ans = []
exclude_tokens = {'kindle', 'a-z'} exclude_tokens = {'kindle', 'a-z'}
exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'} exclude = {'special features', 'by authors',
'authors & illustrators', 'books', 'new; used & rental textbooks'}
seen = set() seen = set()
for li in root.xpath(self.tags_xpath): for li in root.xpath(self.tags_xpath):
for i, a in enumerate(li.iterdescendants('a')): for i, a in enumerate(li.iterdescendants('a')):
if i > 0: if i > 0:
# we ignore the first category since it is almost always too broad # we ignore the first category since it is almost always
# too broad
raw = (a.text or '').strip().replace(',', ';') raw = (a.text or '').strip().replace(',', ';')
lraw = icu_lower(raw) lraw = icu_lower(raw)
tokens = frozenset(lraw.split()) tokens = frozenset(lraw.split())
@ -663,7 +693,7 @@ class Worker(Thread): # Get details {{{
sparts = bn.split('_') sparts = bn.split('_')
if len(sparts) > 2: if len(sparts) > 2:
bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1])) bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
return ('/'.join(parts[:-1]))+'/'+bn return ('/'.join(parts[:-1])) + '/' + bn
imgpat2 = re.compile(r'var imageSrc = "([^"]+)"') imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
for script in root.xpath('//script'): for script in root.xpath('//script'):
@ -674,12 +704,14 @@ class Worker(Thread): # Get details {{{
if url: if url:
return url return url
imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]') imgs = root.xpath(
'//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
if not imgs: if not imgs:
imgs = ( imgs = (
root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]') root.xpath(
'//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
) )
for img in imgs: for img in imgs:
try: try:
@ -703,7 +735,7 @@ class Worker(Thread): # Get details {{{
if 'data:' in src: if 'data:' in src:
continue continue
if 'loading-' in src: if 'loading-' in src:
js_img = re.search(br'"largeImage":"(https?://[^"]+)",',raw) js_img = re.search(br'"largeImage":"(https?://[^"]+)",', raw)
if js_img: if js_img:
src = js_img.group(1).decode('utf-8') src = js_img.group(1).decode('utf-8')
if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src): if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
@ -884,17 +916,18 @@ class Amazon(Source):
return 'https://www.amazon.com.br/' return 'https://www.amazon.com.br/'
if domain == 'au': if domain == 'au':
return 'https://www.amazon.com.au/' return 'https://www.amazon.com.au/'
return 'https://www.amazon.%s/'%domain return 'https://www.amazon.%s/' % domain
def _get_book_url(self, identifiers): # {{{ def _get_book_url(self, identifiers): # {{{
domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca')) domain, asin = self.get_domain_and_asin(
identifiers, extra_domains=('in', 'au', 'ca'))
if domain and asin: if domain and asin:
url = None url = None
r = self.referrer_for_domain(domain) r = self.referrer_for_domain(domain)
if r is not None: if r is not None:
url = r + 'dp/' + asin url = r + 'dp/' + asin
if url: if url:
idtype = 'amazon' if domain == 'com' else 'amazon_'+domain idtype = 'amazon' if domain == 'com' else 'amazon_' + domain
return domain, idtype, asin, url return domain, idtype, asin, url
def get_book_url(self, identifiers): def get_book_url(self, identifiers):
@@ -955,7 +988,7 @@ class Amazon(Source):
         return udomain
 
     def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
-            domain=None):
+            domain=None, for_amazon=True):
         from urllib import urlencode
         if domain is None:
             domain = self.domain
@@ -965,6 +998,7 @@ class Amazon(Source):
             domain = idomain
 
         # See the amazon detailed search page to get all options
+        terms = []
         q = {'search-alias': 'aps',
              'unfiltered': '1',
              }
@@ -978,26 +1012,34 @@ class Amazon(Source):
 
         if asin is not None:
             q['field-keywords'] = asin
+            terms.append(asin)
         elif isbn is not None:
             q['field-isbn'] = isbn
+            terms.append(isbn)
         else:
             # Only return book results
-            q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
+            q['search-alias'] = {'br': 'digital-text',
+                                 'nl': 'aps'}.get(domain, 'stripbooks')
             if title:
                 title_tokens = list(self.get_title_tokens(title))
                 if title_tokens:
                     q['field-title'] = ' '.join(title_tokens)
+                    terms.extend(title_tokens)
             if authors:
                 author_tokens = self.get_author_tokens(authors,
                         only_first_author=True)
                 if author_tokens:
                     q['field-author'] = ' '.join(author_tokens)
+                    terms.extend(author_tokens)
 
         if not ('field-keywords' in q or 'field-isbn' in q or
                 ('field-title' in q)):
             # Insufficient metadata to make an identify query
             return None, None
 
+        if not for_amazon:
+            return terms, domain
+
         # magic parameter to enable Japanese Shift_JIS encoding.
         if domain == 'jp':
             q['__mk_ja_JP'] = u'カタカナ'
@ -1012,13 +1054,14 @@ class Amazon(Source):
if domain == 'jp': if domain == 'jp':
encode_to = 'Shift_JIS' encode_to = 'Shift_JIS'
elif domain == 'nl' or domain == 'cn': elif domain == 'nl' or domain == 'cn':
encode_to='utf-8' encode_to = 'utf-8'
else: else:
encode_to = 'latin1' encode_to = 'latin1'
encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to, encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
'ignore')) for x, y in 'ignore')) for x, y in
q.iteritems()]) q.iteritems()])
url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q) url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
domain) + urlencode(encoded_q)
return url, domain return url, domain
# }}} # }}}
@ -1043,7 +1086,8 @@ class Amazon(Source):
def title_ok(title): def title_ok(title):
title = title.lower() title = title.lower()
bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler'] bad = ['bulk pack', '[audiobook]', '[audio cd]',
'(a book companion)', '( slipcase with door )', ': free sampler']
if self.domain == 'com': if self.domain == 'com':
bad.extend(['(%s edition)' % x for x in ('spanish', 'german')]) bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
for x in bad: for x in bad:
@ -1059,7 +1103,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
if not matches: if not matches:
@ -1074,7 +1119,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
break break
@ -1088,7 +1134,8 @@ class Amazon(Source):
if title_ok(title): if title_ok(title):
url = a.get('href') url = a.get('href')
if url.startswith('/'): if url.startswith('/'):
url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url) url = 'https://www.amazon.%s%s' % (
self.get_website_domain(domain), url)
matches.append(url) matches.append(url)
break break
if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'): if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
@@ -1101,7 +1148,7 @@ class Amazon(Source):
         return matches[:3]
     # }}}
 
-    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
         import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
@@ -1116,7 +1163,7 @@ class Amazon(Source):
         except Exception as e:
             if callable(getattr(e, 'getcode', None)) and \
                     e.getcode() == 404:
-                log.error('Query malformed: %r'%query)
+                log.error('Query malformed: %r' % query)
                 raise SearchFailed()
             attr = getattr(e, 'args', [None])
             attr = attr if attr else [None]
@@ -1124,7 +1171,7 @@ class Amazon(Source):
                 msg = _('Amazon timed out. Try again later.')
                 log.error(msg)
             else:
-                msg = 'Failed to make identify query: %r'%query
+                msg = 'Failed to make identify query: %r' % query
                 log.exception(msg)
                 raise SearchFailed()
@@ -1146,16 +1193,48 @@ class Amazon(Source):
             root = html5lib.parse(raw, treebuilder='lxml',
                                   namespaceHTMLElements=False)
         except Exception:
-            msg = 'Failed to parse amazon page for query: %r'%query
+            msg = 'Failed to parse amazon page for query: %r' % query
             log.exception(msg)
             raise SearchFailed()
 
         matches = self.parse_results_page(root, domain)
 
-        return matches, query, domain
+        return matches, query, domain, None
+    # }}}
+
+    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
+        terms, domain = self.create_query(log, title=title, authors=authors,
+                                          identifiers=identifiers, for_amazon=False)
+        site = self.referrer_for_domain(
+            domain)[len('https://'):].partition('/')[0]
+        se = search_engines_module()
+        matches = []
+        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+            if abort.is_set():
+                return matches, terms, domain, None
+            purl = urlparse(result.url)
+            if '/dp/' in purl.path and site in purl.netloc:
+                url = result.cached_url
+                if url is None:
+                    url = se.wayback_machine_cached_url(
+                        result.url, br, timeout=timeout)
+                if url is None:
+                    log('Failed to find cached page for:', result.url)
+                    continue
+                if url not in matches:
+                    matches.append(url)
+                if len(matches) >= 3:
+                    break
+            else:
+                log('Skipping non-book result:', result)
+        if not matches:
+            log('No search engine results for terms:', ' '.join(terms))
+        return matches, terms, domain, se.wayback_url_processor
+    # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-            identifiers={}, timeout=30):
+            identifiers={}, timeout=60):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
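The search_search_engine() method above only keeps results whose URL looks like an Amazon product page before it bothers with the Wayback Machine. That check is a plain parse of the result URL; a small sketch with made-up example URLs:

try:
    from urlparse import urlparse        # Python 2, as in amazon.py
except ImportError:
    from urllib.parse import urlparse    # Python 3


def looks_like_product_page(url, site='www.amazon.com'):
    # Amazon product (detail) pages carry '/dp/<ASIN>' in their path;
    # anything else returned by the search engine is skipped.
    purl = urlparse(url)
    return '/dp/' in purl.path and site in purl.netloc


print(looks_like_product_page('https://www.amazon.com/dp/1423146786'))    # True
print(looks_like_product_page('https://www.amazon.com/gp/bestsellers/'))  # False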
@@ -1165,23 +1244,38 @@ class Amazon(Source):
         udata = self._get_book_url(identifiers)
         br = self.browser
+        log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
-            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            cover_url_processor = None
+            if USE_SEARCH_ENGINE:
+                se = search_engines_module()
+                durl = se.wayback_machine_cached_url(
+                    durl, br, timeout=timeout, log=log)
+                cover_url_processor = se.wayback_url_processor
+            if durl is None:
+                log('Failed to get cached URL for asin:', asin)
+            else:
+                preparsed_root = parse_details_page(
+                    durl, log, timeout, br, domain)
                 if preparsed_root is not None:
                     qasin = parse_asin(preparsed_root[1], log, durl)
                     if qasin == asin:
-                        w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                        w = Worker(durl, result_queue, br, log, 0, domain,
+                                   self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
                         try:
                             w.get_details()
                             return
                         except Exception:
-                            log.exception('get_details failed for url: %r'%durl)
+                            log.exception(
+                                'get_details failed for url: %r' % durl)
+        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
         try:
-            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+            matches, query, domain, cover_url_processor = func(
+                br, testing, log, abort, title, authors, identifiers, timeout)
         except SearchFailed:
             return
@@ -1191,15 +1285,15 @@ class Amazon(Source):
         if not matches:
             if identifiers and title and authors:
                 log('No matches found with identifiers, retrying using only'
-                        ' title and authors. Query: %r'%query)
+                        ' title and authors. Query: %r' % query)
                 time.sleep(1)
                 return self.identify(log, result_queue, abort, title=title,
                                      authors=authors, timeout=timeout)
-            log.error('No matches found with query: %r'%query)
+            log.error('No matches found with query: %r' % query)
             return
 
-        workers = [Worker(url, result_queue, br, log, i, domain, self,
-                          testing=testing) for i, url in enumerate(matches)]
+        workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
+                          cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]
 
         for w in workers:
             # Don't send all requests at the same time
@@ -1223,7 +1317,7 @@ class Amazon(Source):
     # }}}
 
     def download_cover(self, log, result_queue, abort,  # {{{
-            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+            title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
@@ -1255,7 +1349,8 @@ class Amazon(Source):
         log('Downloading cover from:', cached_url)
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            cdata = self.browser.open_novisit(
+                cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@ -1263,34 +1358,39 @@ class Amazon(Source):
if __name__ == '__main__': # tests {{{ if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py # To run these test use: calibre-debug
# src/calibre/ebooks/metadata/sources/amazon.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test, comments_test, series_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ com_tests = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers':{'amazon':'1423146786'}}, {'identifiers': {'amazon': '1423146786'}},
[title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)] [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
exact=True), series_test('Heroes of Olympus', 5)]
), ),
( # Kindle edition with series ( # Kindle edition with series
{'identifiers':{'amazon':'B0085UEQDO'}}, {'identifiers': {'amazon': 'B0085UEQDO'}},
[title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)] [title_test('Three Parts Dead', exact=True),
series_test('Craft Sequence', 1)]
), ),
( # A kindle edition that does not appear in the search results when searching by ASIN ( # A kindle edition that does not appear in the search results when searching by ASIN
{'identifiers':{'amazon':'B004JHY6OG'}}, {'identifiers': {'amazon': 'B004JHY6OG'}},
[title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)] [title_test(
'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
), ),
( # + in title and uses id="main-image" for cover ( # + in title and uses id="main-image" for cover
{'identifiers':{'amazon':'1933988770'}}, {'identifiers': {'amazon': '1933988770'}},
[title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)] [title_test(
'C++ Concurrency in Action: Practical Multithreading', exact=True)]
), ),
( # noscript description ( # noscript description
{'identifiers':{'amazon':'0756407117'}}, {'identifiers': {'amazon': '0756407117'}},
[title_test( [title_test(
"Throne of the Crescent Moon"), "Throne of the Crescent Moon"),
comments_test('Makhslood'), comments_test('Dhamsawaat'), comments_test('Makhslood'), comments_test('Dhamsawaat'),
@ -1298,7 +1398,7 @@ if __name__ == '__main__': # tests {{{
), ),
( # Different comments markup, using Book Description section ( # Different comments markup, using Book Description section
{'identifiers':{'amazon':'0982514506'}}, {'identifiers': {'amazon': '0982514506'}},
[title_test( [title_test(
"Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy", "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
exact=True), exact=True),
@ -1307,15 +1407,15 @@ if __name__ == '__main__': # tests {{{
), ),
( # # in title ( # # in title
{'title':'Expert C# 2008 Business Objects', {'title': 'Expert C# 2008 Business Objects',
'authors':['Lhotka']}, 'authors': ['Lhotka']},
[title_test('Expert C# 2008 Business Objects'), [title_test('Expert C# 2008 Business Objects'),
authors_test(['Rockford Lhotka']) authors_test(['Rockford Lhotka'])
] ]
), ),
( # Description has links ( # Description has links
{'identifiers':{'isbn': '9780671578275'}}, {'identifiers': {'isbn': '9780671578275'}},
[title_test('A Civil Campaign: A Comedy of Biology and Manners', [title_test('A Civil Campaign: A Comedy of Biology and Manners',
exact=True), authors_test(['Lois McMaster Bujold']) exact=True), authors_test(['Lois McMaster Bujold'])
] ]
@ -1323,13 +1423,13 @@ if __name__ == '__main__': # tests {{{
), ),
( # Sophisticated comment formatting ( # Sophisticated comment formatting
{'identifiers':{'isbn': '9781416580829'}}, {'identifiers': {'isbn': '9781416580829'}},
[title_test('Angels & Demons - Movie Tie-In: A Novel', [title_test('Angels & Demons - Movie Tie-In: A Novel',
exact=True), authors_test(['Dan Brown'])] exact=True), authors_test(['Dan Brown'])]
), ),
( # No specific problems ( # No specific problems
{'identifiers':{'isbn': '0743273567'}}, {'identifiers': {'isbn': '0743273567'}},
[title_test('The great gatsby', exact=True), [title_test('The great gatsby', exact=True),
authors_test(['F. Scott Fitzgerald'])] authors_test(['F. Scott Fitzgerald'])]
), ),
@ -1338,7 +1438,7 @@ if __name__ == '__main__': # tests {{{
de_tests = [ # {{{ de_tests = [ # {{{
( (
{'identifiers':{'isbn': '9783453314979'}}, {'identifiers': {'isbn': '9783453314979'}},
[title_test('Die letzten Wächter: Roman', [title_test('Die letzten Wächter: Roman',
exact=False), authors_test(['Sergej Lukianenko', 'Christiane Pöhlmann']) exact=False), authors_test(['Sergej Lukianenko', 'Christiane Pöhlmann'])
] ]
@ -1346,7 +1446,7 @@ if __name__ == '__main__': # tests {{{
), ),
( (
{'identifiers':{'isbn': '3548283519'}}, {'identifiers': {'isbn': '3548283519'}},
[title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff', [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
exact=False), authors_test(['Nele Neuhaus']) exact=False), authors_test(['Nele Neuhaus'])
] ]
@ -1356,7 +1456,7 @@ if __name__ == '__main__': # tests {{{
it_tests = [ # {{{ it_tests = [ # {{{
( (
{'identifiers':{'isbn': '8838922195'}}, {'identifiers': {'isbn': '8838922195'}},
[title_test('La briscola in cinque', [title_test('La briscola in cinque',
exact=True), authors_test(['Marco Malvaldi']) exact=True), authors_test(['Marco Malvaldi'])
] ]
@ -1366,7 +1466,7 @@ if __name__ == '__main__': # tests {{{
fr_tests = [ # {{{ fr_tests = [ # {{{
( (
{'identifiers':{'isbn': '2221116798'}}, {'identifiers': {'isbn': '2221116798'}},
[title_test('L\'étrange voyage de Monsieur Daldry', [title_test('L\'étrange voyage de Monsieur Daldry',
exact=True), authors_test(['Marc Levy']) exact=True), authors_test(['Marc Levy'])
] ]
@ -1376,7 +1476,7 @@ if __name__ == '__main__': # tests {{{
es_tests = [ # {{{ es_tests = [ # {{{
( (
{'identifiers':{'isbn': '8483460831'}}, {'identifiers': {'isbn': '8483460831'}},
[title_test('Tiempos Interesantes', [title_test('Tiempos Interesantes',
exact=True), authors_test(['Terry Pratchett']) exact=True), authors_test(['Terry Pratchett'])
] ]
@ -1386,12 +1486,12 @@ if __name__ == '__main__': # tests {{{
jp_tests = [ # {{{ jp_tests = [ # {{{
( # Adult filtering test ( # Adult filtering test
{'identifiers':{'isbn':'4799500066'}}, {'identifiers': {'isbn': '4799500066'}},
[title_test(u' '),] [title_test(u' '), ]
), ),
( # isbn -> title, authors ( # isbn -> title, authors
{'identifiers':{'isbn': '9784101302720'}}, {'identifiers': {'isbn': '9784101302720'}},
[title_test(u'精霊の守り人', [title_test(u'精霊の守り人',
exact=True), authors_test([u'上橋 菜穂子']) exact=True), authors_test([u'上橋 菜穂子'])
] ]
@ -1405,7 +1505,7 @@ if __name__ == '__main__': # tests {{{
br_tests = [ # {{{ br_tests = [ # {{{
( (
{'title':'Guerra dos Tronos'}, {'title': 'Guerra dos Tronos'},
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo', [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
exact=True), authors_test(['George R. R. Martin']) exact=True), authors_test(['George R. R. Martin'])
] ]
@ -1415,7 +1515,7 @@ if __name__ == '__main__': # tests {{{
nl_tests = [ # {{{ nl_tests = [ # {{{
( (
{'title':'Freakonomics'}, {'title': 'Freakonomics'},
[title_test('Freakonomics', [title_test('Freakonomics',
exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg']) exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])
] ]
@ -1425,11 +1525,12 @@ if __name__ == '__main__': # tests {{{
cn_tests = [ # {{{ cn_tests = [ # {{{
( (
{'identifiers':{'isbn':'9787115369512'}}, {'identifiers': {'isbn': '9787115369512'}},
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])] [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
authors_test(['[美]sam Williams', '邓楠,李凡希'])]
), ),
( (
{'title':'爱上Raspberry Pi'}, {'title': '爱上Raspberry Pi'},
[title_test('爱上Raspberry Pi', [title_test('爱上Raspberry Pi',
exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希']) exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])
] ]
@ -1439,28 +1540,30 @@ if __name__ == '__main__': # tests {{{
ca_tests = [ # {{{ ca_tests = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers':{'isbn':'9781623808747'}}, {'identifiers': {'isbn': '9781623808747'}},
[title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])] [title_test('Parting Shot', exact=True),
authors_test(['Mary Calmes'])]
), ),
( # # in title ( # # in title
{'title':'Expert C# 2008 Business Objects', {'title': 'Expert C# 2008 Business Objects',
'authors':['Lhotka']}, 'authors': ['Lhotka']},
[title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])] [title_test('Expert C# 2008 Business Objects'),
authors_test(['Rockford Lhotka'])]
), ),
( # noscript description ( # noscript description
{'identifiers':{'amazon_ca':'162380874X'}}, {'identifiers': {'amazon_ca': '162380874X'}},
[title_test('Parting Shot', exact=True), authors_test(['Mary Calmes']) [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])
] ]
), ),
] # }}} ] # }}}
def do_test(domain, start=0, stop=None): def do_test(domain, start=0, stop=None):
tests = globals().get(domain+'_tests') tests = globals().get(domain + '_tests')
if stop is None: if stop is None:
stop = len(tests) stop = len(tests)
tests = tests[start:stop] tests = tests[start:stop]
test_identify_plugin(Amazon.name, tests, modify_plugin=lambda test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
p:(setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'}))) p: (setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'})))
do_test('com') do_test('com')
# do_test('de') # do_test('de')

Changed file: src/calibre/ebooks/metadata/sources/search_engines.py

@@ -46,12 +46,12 @@ def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url).read()
+        raw = br.open_novisit(url, timeout=timeout).read()
     finally:
         last_visited[key] = monotonic()
     if dump_raw is not None:
@@ -80,20 +80,29 @@ def ddg_href(url):
     return url
 
 
-def wayback_machine_cached_url(url, br=None):
+def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
     q = quote_term(url)
     br = br or browser()
     data = query(br, 'https://archive.org/wayback/available?url=' +
-                 q, 'wayback', parser=json.loads, limit=0.25)
+                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
     try:
         closest = data['archived_snapshots']['closest']
     except KeyError:
-        return
-    if closest['available']:
-        return closest['url']
+        pass
+    else:
+        if closest['available']:
+            return closest['url']
+    from pprint import pformat
+    log('Response from wayback machine:', pformat(data))
 
 
-def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+def wayback_url_processor(url):
+    if url.startswith('/'):
+        url = 'https://web.archive.org' + url
+    return url
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
     terms = [quote_term(t) for t in terms]
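For reference, the availability endpoint queried in wayback_machine_cached_url above returns a small JSON document, which the try/except/else navigates. An illustrative, abridged response for a URL that has a snapshot; the values are examples, not real data:

sample = {
    "url": "https://www.amazon.com/dp/1423146786",
    "archived_snapshots": {
        "closest": {
            "status": "200",
            "available": True,
            "url": "http://web.archive.org/web/20170301000000/https://www.amazon.com/dp/1423146786",
            "timestamp": "20170301000000",
        }
    },
}

# When nothing is archived, 'archived_snapshots' is empty and the lookup
# raises KeyError, which the code above turns into a logged failure.
closest = sample['archived_snapshots']['closest']
assert closest['available'] and 'web.archive.org' in closest['url']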
@@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
         q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
-    root = query(br, url, 'ddg', dump_raw)
+    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
         ans.append(Result(ddg_href(a.get('href')), etree.tostring(
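ddg_search() restricts results to the relevant Amazon domain via its site parameter; DuckDuckGo's query syntax (see the URL in the comment above) supports this with a site: operator. A sketch of how such a query string can be assembled from the metadata terms; quote_plus stands in for the module's own quote_term helper:

try:
    from urllib import quote_plus        # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3


def build_site_query(terms, site=None):
    # Quote each term and, when a site is given, prefix DuckDuckGo's
    # "site:" operator so only that domain is searched.
    q = ' '.join(quote_plus(t) for t in terms)
    if site is not None:
        q = 'site:{} {}'.format(site, q)
    return q


# Example terms as produced by create_query(..., for_amazon=False)
print(build_site_query(['three', 'parts', 'dead'], site='www.amazon.com'))
# -> site:www.amazon.com three parts dead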

Changed file: src/calibre/ebooks/metadata/sources/update.py

@@ -14,7 +14,6 @@ from threading import Thread
 import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
 from calibre import as_unicode, prints
 from calibre.constants import DEBUG, numeric_version
-from calibre.customize.ui import patch_metadata_plugins
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.config import JSONConfig
 from calibre.utils.https import get_https_resource_securely
@@ -59,6 +58,7 @@ def patch_search_engines(src):
 
 
 def patch_plugins():
+    from calibre.customize.ui import patch_metadata_plugins
     patches = {}
     for name, val in cache.iteritems():
         if name == 'hashes':