Implement searching amazon via wayback machine

Disabled, as wayback machine is really slow/flaky
Kovid Goyal 2017-03-02 09:19:51 +05:30
parent 6c4c14ceca
commit d1ad4955a8
3 changed files with 447 additions and 335 deletions
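
The core idea of the change is to avoid hitting Amazon directly (which rate-limits and serves CAPTCHAs) by fetching a cached copy of the product page from the Internet Archive's Wayback Machine. Below is a minimal standalone sketch of the availability lookup that the new wayback_machine_cached_url() performs; the endpoint and JSON shape are taken from the diff, but the helper name and the use of plain urllib are illustrative rather than calibre code.

    #!/usr/bin/env python2
    # Illustrative sketch only -- not part of this commit. It mirrors the
    # archive.org "availability" query made by wayback_machine_cached_url().
    from __future__ import print_function
    import json
    try:  # Python 2, as used by calibre at the time
        from urllib import quote
        from urllib2 import urlopen
    except ImportError:  # Python 3 fallback
        from urllib.parse import quote
        from urllib.request import urlopen


    def cached_snapshot_url(url, timeout=60):
        # Ask the Wayback Machine for the closest archived snapshot of `url`.
        q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
        data = json.loads(urlopen(q, timeout=timeout).read())
        closest = data.get('archived_snapshots', {}).get('closest')
        if closest and closest.get('available'):
            return closest['url']
        return None  # no snapshot found; the plugin logs the raw response instead


    if __name__ == '__main__':
        print(cached_snapshot_url('https://www.amazon.com/dp/1423146786'))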

src/calibre/ebooks/metadata/sources/amazon.py

@@ -1,24 +1,22 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
-
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import socket, time, re
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+import socket
+import time
+from Queue import Empty, Queue
 from threading import Thread
-from Queue import Queue, Empty
+from urlparse import urlparse

 from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
-        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
+from calibre.ebooks.metadata.sources.update import search_engines_module
 from calibre.utils.localization import canonicalize_lang
-from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, all_user_agents


 class CaptchaError(Exception):
@@ -30,6 +28,7 @@ class SearchFailed(ValueError):

 ua_index = -1
+USE_SEARCH_ENGINE = False


 def parse_details_page(url, log, timeout, browser, domain):
@@ -37,6 +36,7 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.ebooks.chardet import xml_to_unicode
     import html5lib
     from lxml.html import tostring
+    log('Getting details from:', url)
     try:
         raw = browser.open_novisit(url, timeout=timeout).read().strip()
     except Exception as e:
@@ -56,8 +56,10 @@ def parse_details_page(url, log, timeout, browser, domain):
     oraw = raw
     if 'amazon.com.br' in url:
-        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
-    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
+        raw = raw.decode('utf-8')
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                         resolve_entities=True)[0]
     if '<title>404 - ' in raw:
         log.error('URL malformed: %r' % url)
         return
@@ -104,8 +106,9 @@ class Worker(Thread):  # Get details {{{
     '''

     def __init__(self, url, result_queue, browser, log, relevance, domain,
-                 plugin, timeout=20, testing=False, preparsed_root=None):
+                 plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
         Thread.__init__(self)
+        self.cover_url_processor = cover_url_processor
         self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing
@@ -230,7 +233,8 @@ class Worker(Thread):  # Get details {{{
                 starts-with(text(), "Uitgever:") or \
                 starts-with(text(), "出版社:")]
         '''
-        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
+        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
+                                'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}

         self.language_xpath = '''
             descendant::*[
@@ -244,7 +248,8 @@ class Worker(Thread):  # Get details {{{
                 or starts-with(text(), "语种")
                 ]
             '''
-        self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
+        self.language_names = {'Language', 'Sprache',
+                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}

         self.tags_xpath = '''
             descendant::h2[
@@ -308,7 +313,8 @@ class Worker(Thread):  # Get details {{{

     def get_details(self):
         if self.preparsed_root is None:
-            raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
+            raw, root, selector = parse_details_page(
+                self.url, self.log, self.timeout, self.browser, self.domain)
         else:
             raw, root, selector = self.preparsed_root
@@ -319,9 +325,11 @@ class Worker(Thread):  # Get details {{{
     def parse_details(self, raw, root):
         asin = parse_asin(root, self.log, self.url)
         if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
-            raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
+            raise CaptchaError(
+                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
         if self.testing:
-            import tempfile, uuid
+            import tempfile
+            import uuid
             with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                              suffix='.html', delete=False) as f:
                 f.write(raw)
@@ -340,7 +348,8 @@ class Worker(Thread):  # Get details {{{
             authors = []

         if not title or not authors or not asin:
-            self.log.error('Could not find title/authors/asin for %r'%self.url)
+            self.log.error(
+                'Could not find title/authors/asin for %r' % self.url)
             self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
                                                                authors))
             return
@@ -378,15 +387,19 @@ class Worker(Thread):  # Get details {{{
             self.cover_url = self.parse_cover(root, raw)
         except:
             self.log.exception('Error parsing cover for url: %r' % self.url)
+        if self.cover_url_processor is not None and self.cover_url.startswith('/'):
+            self.cover_url = self.cover_url_processor(self.cover_url)
         mi.has_cover = bool(self.cover_url)

-        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
+        non_hero = tuple(self.selector(
+            'div#bookDetails_container_div div#nonHeroSection'))
         if non_hero:
             # New style markup
             try:
                 self.parse_new_details(root, mi, non_hero[0])
             except:
-                self.log.exception('Failed to parse new-style book details section')
+                self.log.exception(
+                    'Failed to parse new-style book details section')
         else:
             pd = root.xpath(self.pd_xpath)
             if pd:
@@ -397,27 +410,32 @@ class Worker(Thread):  # Get details {{{
                         if isbn:
                             self.isbn = mi.isbn = isbn
                 except:
-                    self.log.exception('Error parsing ISBN for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing ISBN for url: %r' % self.url)

                 try:
                     mi.publisher = self.parse_publisher(pd)
                 except:
-                    self.log.exception('Error parsing publisher for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publisher for url: %r' % self.url)

                 try:
                     mi.pubdate = self.parse_pubdate(pd)
                 except:
-                    self.log.exception('Error parsing publish date for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publish date for url: %r' % self.url)

                 try:
                     lang = self.parse_language(pd)
                     if lang:
                         mi.language = lang
                 except:
-                    self.log.exception('Error parsing language for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing language for url: %r' % self.url)

             else:
-                self.log.warning('Failed to find product description for url: %r'%self.url)
+                self.log.warning(
+                    'Failed to find product description for url: %r' % self.url)

         mi.source_relevance = self.relevance
@@ -448,7 +466,8 @@ class Worker(Thread):  # Get details {{{
             title = self.tostring(actual_title[0], encoding=unicode,
                                   method='text').strip()
         else:
-            title = self.tostring(tdiv, encoding=unicode, method='text').strip()
+            title = self.tostring(tdiv, encoding=unicode,
+                                  method='text').strip()
         ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
         if not ans:
             ans = title.rpartition('[')[0].strip()
@@ -540,7 +559,8 @@ class Worker(Thread):  # Get details {{{
             if len(ns) == 0 and ns.text:
                 import html5lib
                 # html5lib parsed noscript as CDATA
-                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
+                ns = html5lib.parseFragment(
+                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
             else:
                 ns.tag = 'div'
             ans = self._render_comments(ns)
@@ -549,7 +569,8 @@ class Worker(Thread):  # Get details {{{
             if desc:
                 ans = self._render_comments(desc[0])

-        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
+        desc = root.xpath(
+            '//div[@id="productDescription"]/*[@class="content"]')
         if desc:
             ans += self._render_comments(desc[0])
         else:
@@ -559,12 +580,15 @@ class Worker(Thread):  # Get details {{{
             if m is not None:
                 try:
                     text = unquote(m.group(1)).decode('utf-8')
-                    nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
-                    desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
+                    nr = html5lib.parse(
+                        text, treebuilder='lxml', namespaceHTMLElements=False)
+                    desc = nr.xpath(
+                        '//div[@id="productDescription"]/*[@class="content"]')
                     if desc:
                         ans += self._render_comments(desc[0])
                 except Exception as e:
-                    self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
+                    self.log.warn(
+                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))

         return ans
@@ -577,13 +601,15 @@ class Worker(Thread):  # Get details {{{
             series = series[0]
             spans = series.xpath('./span')
             if spans:
-                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
+                raw = self.tostring(
+                    spans[0], encoding=unicode, method='text', with_tail=False).strip()
                 m = re.search('\s+([0-9.]+)$', raw.strip())
                 if m is not None:
                     series_index = float(m.group(1))
                     s = series.xpath('./a[@id="series-page-link"]')
                     if s:
-                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
+                        series = self.tostring(
+                            s[0], encoding=unicode, method='text', with_tail=False).strip()
                         if series:
                             ans = (series, series_index)
         # This is found on Kindle edition pages on amazon.com
@@ -595,7 +621,8 @@ class Worker(Thread):  # Get details {{{
                     series_index = float(m.group(1))
                     a = span.xpath('./a[@href]')
                     if a:
-                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
+                        series = self.tostring(
+                            a[0], encoding=unicode, method='text', with_tail=False).strip()
                         if series:
                             ans = (series, series_index)
         # This is found on newer Kindle edition pages on amazon.com
@@ -607,7 +634,8 @@ class Worker(Thread):  # Get details {{{
                     series_index = float(m.group(1))
                     a = b.getparent().xpath('./a[@href]')
                     if a:
-                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
+                        series = self.tostring(
+                            a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
                         if series:
                             ans = series, series_index
@@ -629,12 +657,14 @@ class Worker(Thread):  # Get details {{{
     def parse_tags(self, root):
         ans = []
         exclude_tokens = {'kindle', 'a-z'}
-        exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'}
+        exclude = {'special features', 'by authors',
+                   'authors & illustrators', 'books', 'new; used & rental textbooks'}
         seen = set()
         for li in root.xpath(self.tags_xpath):
             for i, a in enumerate(li.iterdescendants('a')):
                 if i > 0:
-                    # we ignore the first category since it is almost always too broad
+                    # we ignore the first category since it is almost always
+                    # too broad
                     raw = (a.text or '').strip().replace(',', ';')
                     lraw = icu_lower(raw)
                     tokens = frozenset(lraw.split())
@@ -674,12 +704,14 @@ class Worker(Thread):  # Get details {{{
             if url:
                 return url

-        imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
+        imgs = root.xpath(
+            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
         if not imgs:
             imgs = (
                 root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
                 root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
-                root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
+                root.xpath(
+                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
             )
         for img in imgs:
             try:
@@ -887,7 +919,8 @@ class Amazon(Source):
         return 'https://www.amazon.%s/' % domain

     def _get_book_url(self, identifiers):  # {{{
-        domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
+        domain, asin = self.get_domain_and_asin(
+            identifiers, extra_domains=('in', 'au', 'ca'))
         if domain and asin:
             url = None
             r = self.referrer_for_domain(domain)
@@ -955,7 +988,7 @@ class Amazon(Source):
         return udomain

     def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
-                     domain=None):
+                     domain=None, for_amazon=True):
         from urllib import urlencode
         if domain is None:
             domain = self.domain
@@ -965,6 +998,7 @@ class Amazon(Source):
                 domain = idomain

         # See the amazon detailed search page to get all options
+        terms = []
         q = {'search-alias': 'aps',
              'unfiltered': '1',
              }
@@ -978,26 +1012,34 @@ class Amazon(Source):

         if asin is not None:
             q['field-keywords'] = asin
+            terms.append(asin)
         elif isbn is not None:
             q['field-isbn'] = isbn
+            terms.append(isbn)
         else:
             # Only return book results
-            q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
+            q['search-alias'] = {'br': 'digital-text',
+                                 'nl': 'aps'}.get(domain, 'stripbooks')
             if title:
                 title_tokens = list(self.get_title_tokens(title))
                 if title_tokens:
                     q['field-title'] = ' '.join(title_tokens)
+                    terms.extend(title_tokens)
             if authors:
                 author_tokens = self.get_author_tokens(authors,
                                                        only_first_author=True)
                 if author_tokens:
                     q['field-author'] = ' '.join(author_tokens)
+                    terms.extend(author_tokens)

             if not ('field-keywords' in q or 'field-isbn' in q or
                     ('field-title' in q)):
                 # Insufficient metadata to make an identify query
                 return None, None

+        if not for_amazon:
+            return terms, domain
+
         # magic parameter to enable Japanese Shift_JIS encoding.
         if domain == 'jp':
             q['__mk_ja_JP'] = u'カタカナ'
@@ -1018,7 +1060,8 @@ class Amazon(Source):
         encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
                                                                    'ignore')) for x, y in
                           q.iteritems()])
-        url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q)
+        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
+            domain) + urlencode(encoded_q)
         return url, domain

     # }}}
@@ -1043,7 +1086,8 @@ class Amazon(Source):
         def title_ok(title):
             title = title.lower()
-            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
+            bad = ['bulk pack', '[audiobook]', '[audio cd]',
+                   '(a book companion)', '( slipcase with door )', ': free sampler']
             if self.domain == 'com':
                 bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
             for x in bad:
@@ -1059,7 +1103,8 @@ class Amazon(Source):
                 if title_ok(title):
                     url = a.get('href')
                     if url.startswith('/'):
-                        url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                        url = 'https://www.amazon.%s%s' % (
+                            self.get_website_domain(domain), url)
                     matches.append(url)

         if not matches:
@@ -1074,7 +1119,8 @@ class Amazon(Source):
                     if title_ok(title):
                         url = a.get('href')
                         if url.startswith('/'):
-                            url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                            url = 'https://www.amazon.%s%s' % (
+                                self.get_website_domain(domain), url)
                         matches.append(url)
                         break
@@ -1088,7 +1134,8 @@ class Amazon(Source):
                     if title_ok(title):
                         url = a.get('href')
                         if url.startswith('/'):
-                            url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                            url = 'https://www.amazon.%s%s' % (
+                                self.get_website_domain(domain), url)
                         matches.append(url)
                         break

         if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
@@ -1101,7 +1148,7 @@ class Amazon(Source):
         return matches[:3]
     # }}}

-    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
         import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
@@ -1152,10 +1199,42 @@ class Amazon(Source):

         matches = self.parse_results_page(root, domain)

-        return matches, query, domain
+        return matches, query, domain, None
+    # }}}
+
+    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
+        terms, domain = self.create_query(log, title=title, authors=authors,
+                                          identifiers=identifiers, for_amazon=False)
+        site = self.referrer_for_domain(
+            domain)[len('https://'):].partition('/')[0]
+        se = search_engines_module()
+        matches = []
+        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+            if abort.is_set():
+                return matches, terms, domain, None
+            purl = urlparse(result.url)
+            if '/dp/' in purl.path and site in purl.netloc:
+                url = result.cached_url
+                if url is None:
+                    url = se.wayback_machine_cached_url(
+                        result.url, br, timeout=timeout)
+                if url is None:
+                    log('Failed to find cached page for:', result.url)
+                    continue
+                if url not in matches:
+                    matches.append(url)
+                if len(matches) >= 3:
+                    break
+            else:
+                log('Skipping non-book result:', result)
+        if not matches:
+            log('No search engine results for terms:', ' '.join(terms))
+        return matches, terms, domain, se.wayback_url_processor
+    # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-                 identifiers={}, timeout=30):
+                 identifiers={}, timeout=60):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
@@ -1165,23 +1244,38 @@ class Amazon(Source):
         udata = self._get_book_url(identifiers)
         br = self.browser
+        log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
-            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            cover_url_processor = None
+            if USE_SEARCH_ENGINE:
+                se = search_engines_module()
+                durl = se.wayback_machine_cached_url(
+                    durl, br, timeout=timeout, log=log)
+                cover_url_processor = se.wayback_url_processor
+            if durl is None:
+                log('Failed to get cached URL for asin:', asin)
+            else:
+                preparsed_root = parse_details_page(
+                    durl, log, timeout, br, domain)
             if preparsed_root is not None:
                 qasin = parse_asin(preparsed_root[1], log, durl)
                 if qasin == asin:
-                    w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                    w = Worker(durl, result_queue, br, log, 0, domain,
+                               self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
                     try:
                         w.get_details()
                         return
                     except Exception:
-                        log.exception('get_details failed for url: %r'%durl)
+                        log.exception(
+                            'get_details failed for url: %r' % durl)

+        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
         try:
-            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+            matches, query, domain, cover_url_processor = func(
+                br, testing, log, abort, title, authors, identifiers, timeout)
         except SearchFailed:
             return
@@ -1198,8 +1292,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r' % query)
             return

-        workers = [Worker(url, result_queue, br, log, i, domain, self,
-                          testing=testing) for i, url in enumerate(matches)]
+        workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
+                          cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]

         for w in workers:
             # Don't send all requests at the same time
@@ -1223,7 +1317,7 @@ class Amazon(Source):
     # }}}

     def download_cover(self, log, result_queue, abort,  # {{{
-                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')
@@ -1255,7 +1349,8 @@ class Amazon(Source):
         log('Downloading cover from:', cached_url)
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            cdata = self.browser.open_novisit(
+                cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@@ -1263,29 +1358,34 @@ class Amazon(Source):

 if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
+    # To run these test use: calibre-debug
+    # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            isbn_test, title_test, authors_test, comments_test, series_test)
     com_tests = [  # {{{

         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
-            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
+            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
+                        exact=True), series_test('Heroes of Olympus', 5)]
         ),

         (  # Kindle edition with series
             {'identifiers': {'amazon': 'B0085UEQDO'}},
-            [title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
+            [title_test('Three Parts Dead', exact=True),
+             series_test('Craft Sequence', 1)]
         ),

         (  # A kindle edition that does not appear in the search results when searching by ASIN
             {'identifiers': {'amazon': 'B004JHY6OG'}},
-            [title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)]
+            [title_test(
+                'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
         ),

         (  # + in title and uses id="main-image" for cover
             {'identifiers': {'amazon': '1933988770'}},
-            [title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)]
+            [title_test(
+                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
         ),
@@ -1426,7 +1526,8 @@ if __name__ == '__main__':  # tests {{{
     cn_tests = [  # {{{
         (
             {'identifiers': {'isbn': '9787115369512'}},
-            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])]
+            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
+             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
         ),
         (
             {'title': '爱上Raspberry Pi'},
@@ -1440,12 +1541,14 @@ if __name__ == '__main__':  # tests {{{
     ca_tests = [  # {{{
         (  # Paperback with series
             {'identifiers': {'isbn': '9781623808747'}},
-            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])]
+            [title_test('Parting Shot', exact=True),
+             authors_test(['Mary Calmes'])]
         ),
         (  # # in title
             {'title': 'Expert C# 2008 Business Objects',
              'authors': ['Lhotka']},
-            [title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])]
+            [title_test('Expert C# 2008 Business Objects'),
+             authors_test(['Rockford Lhotka'])]
         ),
         (  # noscript description
             {'identifiers': {'amazon_ca': '162380874X'}},
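
For reference, the new USE_SEARCH_ENGINE code path added above (create_query(for_amazon=False) feeding search_search_engine()) boils down to the following standalone sketch. Here ddg_search and wayback_machine_cached_url stand in for the functions of the same names in calibre.ebooks.metadata.sources.search_engines; the helper name and parameters are illustrative, not calibre code.

    # Illustrative condensation of the USE_SEARCH_ENGINE path above,
    # not a drop-in replacement for it.
    from urlparse import urlparse  # Python 2, matching the plugin


    def cached_amazon_matches(terms, site, ddg_search, wayback_machine_cached_url,
                              br=None, timeout=60, limit=3):
        # Search DuckDuckGo restricted to the Amazon site, keep /dp/ product
        # pages, and fall back to a Wayback Machine snapshot when DDG has no
        # cached copy of its own.
        matches = []
        for result in ddg_search(terms, site, br=br, timeout=timeout):
            purl = urlparse(result.url)
            if '/dp/' not in purl.path or site not in purl.netloc:
                continue  # skip non-book results
            url = result.cached_url or wayback_machine_cached_url(
                result.url, br, timeout=timeout)
            if url and url not in matches:
                matches.append(url)
                if len(matches) >= limit:
                    break
        return matches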

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -46,12 +46,12 @@ def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)


-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url).read()
+        raw = br.open_novisit(url, timeout=timeout).read()
     finally:
         last_visited[key] = monotonic()
     if dump_raw is not None:
@@ -80,20 +80,29 @@ def ddg_href(url):
     return url


-def wayback_machine_cached_url(url, br=None):
+def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
     q = quote_term(url)
     br = br or browser()
     data = query(br, 'https://archive.org/wayback/available?url=' +
-                 q, 'wayback', parser=json.loads, limit=0.25)
+                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
     try:
         closest = data['archived_snapshots']['closest']
     except KeyError:
-        return
-    if closest['available']:
-        return closest['url']
+        pass
+    else:
+        if closest['available']:
+            return closest['url']
+    from pprint import pformat
+    log('Response from wayback machine:', pformat(data))


-def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+def wayback_url_processor(url):
+    if url.startswith('/'):
+        url = 'https://web.archive.org' + url
+    return url
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
     terms = [quote_term(t) for t in terms]
@@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
         q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
-    root = query(br, url, 'ddg', dump_raw)
+    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
         ans.append(Result(ddg_href(a.get('href')), etree.tostring(
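
Pages served from the Wayback Machine reference images and links with root-relative paths, which is why amazon.py threads a cover_url_processor through Worker. The wayback_url_processor() added above simply re-anchors such paths on web.archive.org; a quick usage sketch follows (the snapshot path in the example is made up).

    from calibre.ebooks.metadata.sources.search_engines import wayback_url_processor

    print(wayback_url_processor('/web/2017/https://www.amazon.com/dp/1423146786'))
    # -> https://web.archive.org/web/2017/https://www.amazon.com/dp/1423146786
    print(wayback_url_processor('https://example.com/cover.jpg'))
    # -> https://example.com/cover.jpg  (absolute URLs pass through unchanged)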

src/calibre/ebooks/metadata/sources/update.py

@@ -14,7 +14,6 @@ from threading import Thread
 import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
 from calibre import as_unicode, prints
 from calibre.constants import DEBUG, numeric_version
-from calibre.customize.ui import patch_metadata_plugins
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.utils.config import JSONConfig
 from calibre.utils.https import get_https_resource_securely
@@ -59,6 +58,7 @@ def patch_search_engines(src):


 def patch_plugins():
+    from calibre.customize.ui import patch_metadata_plugins
     patches = {}
     for name, val in cache.iteritems():
         if name == 'hashes':