Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Implement searching Amazon via the Wayback Machine
Disabled, as the Wayback Machine is really slow/flaky
This commit is contained in:
parent 6c4c14ceca
commit d1ad4955a8
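For context: the lookup this commit relies on is the archive.org "availability" endpoint, which returns the closest cached snapshot of a URL as JSON. A minimal standalone sketch of that request, using only the standard library instead of calibre's rate-limited browser object (the helper name cached_snapshot_url is illustrative, not part of the commit):

# Standalone sketch of the Wayback Machine availability lookup that
# wayback_machine_cached_url() performs in the diff below.
import json

try:
    from urllib.request import urlopen
    from urllib.parse import quote
except ImportError:  # Python 2, which this code still targets
    from urllib2 import urlopen
    from urllib import quote


def cached_snapshot_url(url, timeout=60):
    # Ask archive.org for the closest archived snapshot of `url`.
    q = 'https://archive.org/wayback/available?url=' + quote(url, safe='')
    data = json.loads(urlopen(q, timeout=timeout).read().decode('utf-8'))
    closest = data.get('archived_snapshots', {}).get('closest')
    if closest and closest.get('available'):
        # e.g. https://web.archive.org/web/<timestamp>/<original url>
        return closest['url']
    return None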
src/calibre/ebooks/metadata/sources/amazon.py

@@ -1,24 +1,22 @@
 #!/usr/bin/env python2
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-from __future__ import (unicode_literals, division, absolute_import,
-                        print_function)
+# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-__license__ = 'GPL v3'
-__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-import socket, time, re
+import re
+import socket
+import time
+from Queue import Empty, Queue
 from threading import Thread
-from Queue import Queue, Empty
-
+from urlparse import urlparse
 
 from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
-        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
+from calibre.ebooks.metadata.sources.update import search_engines_module
 from calibre.utils.localization import canonicalize_lang
-from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
+from calibre.utils.random_ua import accept_header_for_ua, all_user_agents
 
 
 class CaptchaError(Exception):

@@ -30,6 +28,7 @@ class SearchFailed(ValueError):
 
 
 ua_index = -1
+USE_SEARCH_ENGINE = False
 
 
 def parse_details_page(url, log, timeout, browser, domain):

@@ -37,6 +36,7 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.ebooks.chardet import xml_to_unicode
     import html5lib
     from lxml.html import tostring
+    log('Getting details from:', url)
     try:
         raw = browser.open_novisit(url, timeout=timeout).read().strip()
     except Exception as e:

@@ -56,8 +56,10 @@ def parse_details_page(url, log, timeout, browser, domain):
 
     oraw = raw
     if 'amazon.com.br' in url:
-        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
-    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
+        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
+        raw = raw.decode('utf-8')
+    raw = xml_to_unicode(raw, strip_encoding_pats=True,
+                         resolve_entities=True)[0]
     if '<title>404 - ' in raw:
         log.error('URL malformed: %r' % url)
         return

@@ -104,8 +106,9 @@ class Worker(Thread):  # Get details {{{
     '''
 
     def __init__(self, url, result_queue, browser, log, relevance, domain,
-                 plugin, timeout=20, testing=False, preparsed_root=None):
+                 plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
         Thread.__init__(self)
+        self.cover_url_processor = cover_url_processor
         self.preparsed_root = preparsed_root
         self.daemon = True
         self.testing = testing

@@ -230,7 +233,8 @@ class Worker(Thread):  # Get details {{{
                 starts-with(text(), "Uitgever:") or \
                 starts-with(text(), "出版社:")]
             '''
-        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
+        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag',
+                                'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
 
         self.language_xpath = '''
             descendant::*[

@@ -244,7 +248,8 @@ class Worker(Thread):  # Get details {{{
                 or starts-with(text(), "语种")
                 ]
             '''
-        self.language_names = {'Language', 'Sprache', 'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
+        self.language_names = {'Language', 'Sprache',
+                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
 
         self.tags_xpath = '''
             descendant::h2[

@@ -308,7 +313,8 @@ class Worker(Thread):  # Get details {{{
 
     def get_details(self):
         if self.preparsed_root is None:
-            raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
+            raw, root, selector = parse_details_page(
+                self.url, self.log, self.timeout, self.browser, self.domain)
         else:
             raw, root, selector = self.preparsed_root
 
@@ -319,9 +325,11 @@ class Worker(Thread):  # Get details {{{
     def parse_details(self, raw, root):
         asin = parse_asin(root, self.log, self.url)
         if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
-            raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
+            raise CaptchaError(
+                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
         if self.testing:
-            import tempfile, uuid
+            import tempfile
+            import uuid
             with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                     suffix='.html', delete=False) as f:
                 f.write(raw)

@@ -340,7 +348,8 @@ class Worker(Thread):  # Get details {{{
             authors = []
 
         if not title or not authors or not asin:
-            self.log.error('Could not find title/authors/asin for %r'%self.url)
+            self.log.error(
+                'Could not find title/authors/asin for %r' % self.url)
             self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
                 authors))
             return

@@ -378,15 +387,19 @@ class Worker(Thread):  # Get details {{{
             self.cover_url = self.parse_cover(root, raw)
         except:
             self.log.exception('Error parsing cover for url: %r' % self.url)
+        if self.cover_url_processor is not None and self.cover_url.startswith('/'):
+            self.cover_url = self.cover_url_processor(self.cover_url)
         mi.has_cover = bool(self.cover_url)
 
-        non_hero = tuple(self.selector('div#bookDetails_container_div div#nonHeroSection'))
+        non_hero = tuple(self.selector(
+            'div#bookDetails_container_div div#nonHeroSection'))
         if non_hero:
             # New style markup
             try:
                 self.parse_new_details(root, mi, non_hero[0])
             except:
-                self.log.exception('Failed to parse new-style book details section')
+                self.log.exception(
+                    'Failed to parse new-style book details section')
         else:
             pd = root.xpath(self.pd_xpath)
             if pd:

@@ -397,27 +410,32 @@ class Worker(Thread):  # Get details {{{
                     if isbn:
                         self.isbn = mi.isbn = isbn
                 except:
-                    self.log.exception('Error parsing ISBN for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing ISBN for url: %r' % self.url)
 
                 try:
                     mi.publisher = self.parse_publisher(pd)
                 except:
-                    self.log.exception('Error parsing publisher for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publisher for url: %r' % self.url)
 
                 try:
                     mi.pubdate = self.parse_pubdate(pd)
                 except:
-                    self.log.exception('Error parsing publish date for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing publish date for url: %r' % self.url)
 
                 try:
                     lang = self.parse_language(pd)
                     if lang:
                         mi.language = lang
                 except:
-                    self.log.exception('Error parsing language for url: %r'%self.url)
+                    self.log.exception(
+                        'Error parsing language for url: %r' % self.url)
 
             else:
-                self.log.warning('Failed to find product description for url: %r'%self.url)
+                self.log.warning(
+                    'Failed to find product description for url: %r' % self.url)
 
         mi.source_relevance = self.relevance
 
@@ -448,7 +466,8 @@ class Worker(Thread):  # Get details {{{
             title = self.tostring(actual_title[0], encoding=unicode,
                                   method='text').strip()
         else:
-            title = self.tostring(tdiv, encoding=unicode, method='text').strip()
+            title = self.tostring(tdiv, encoding=unicode,
+                                  method='text').strip()
         ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
         if not ans:
             ans = title.rpartition('[')[0].strip()

@@ -540,7 +559,8 @@ class Worker(Thread):  # Get details {{{
             if len(ns) == 0 and ns.text:
                 import html5lib
                 # html5lib parsed noscript as CDATA
-                ns = html5lib.parseFragment('<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
+                ns = html5lib.parseFragment(
+                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
             else:
                 ns.tag = 'div'
             ans = self._render_comments(ns)

@@ -549,7 +569,8 @@ class Worker(Thread):  # Get details {{{
             if desc:
                 ans = self._render_comments(desc[0])
 
-        desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
+        desc = root.xpath(
+            '//div[@id="productDescription"]/*[@class="content"]')
         if desc:
             ans += self._render_comments(desc[0])
         else:

@@ -559,12 +580,15 @@ class Worker(Thread):  # Get details {{{
             if m is not None:
                 try:
                     text = unquote(m.group(1)).decode('utf-8')
-                    nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
-                    desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
+                    nr = html5lib.parse(
+                        text, treebuilder='lxml', namespaceHTMLElements=False)
+                    desc = nr.xpath(
+                        '//div[@id="productDescription"]/*[@class="content"]')
                     if desc:
                         ans += self._render_comments(desc[0])
                 except Exception as e:
-                    self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
+                    self.log.warn(
+                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
 
         return ans
 

@@ -577,13 +601,15 @@ class Worker(Thread):  # Get details {{{
             series = series[0]
             spans = series.xpath('./span')
             if spans:
-                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
+                raw = self.tostring(
+                    spans[0], encoding=unicode, method='text', with_tail=False).strip()
                 m = re.search('\s+([0-9.]+)$', raw.strip())
                 if m is not None:
                     series_index = float(m.group(1))
                     s = series.xpath('./a[@id="series-page-link"]')
                     if s:
-                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
+                        series = self.tostring(
+                            s[0], encoding=unicode, method='text', with_tail=False).strip()
                         if series:
                             ans = (series, series_index)
         # This is found on Kindle edition pages on amazon.com

@@ -595,7 +621,8 @@ class Worker(Thread):  # Get details {{{
                 series_index = float(m.group(1))
                 a = span.xpath('./a[@href]')
                 if a:
-                    series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
+                    series = self.tostring(
+                        a[0], encoding=unicode, method='text', with_tail=False).strip()
                     if series:
                         ans = (series, series_index)
         # This is found on newer Kindle edition pages on amazon.com

@@ -607,7 +634,8 @@ class Worker(Thread):  # Get details {{{
                 series_index = float(m.group(1))
                 a = b.getparent().xpath('./a[@href]')
                 if a:
-                    series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
+                    series = self.tostring(
+                        a[0], encoding=unicode, method='text', with_tail=False).partition('(')[0].strip()
                     if series:
                         ans = series, series_index
 

@@ -629,12 +657,14 @@ class Worker(Thread):  # Get details {{{
     def parse_tags(self, root):
         ans = []
         exclude_tokens = {'kindle', 'a-z'}
-        exclude = {'special features', 'by authors', 'authors & illustrators', 'books', 'new; used & rental textbooks'}
+        exclude = {'special features', 'by authors',
+                   'authors & illustrators', 'books', 'new; used & rental textbooks'}
         seen = set()
         for li in root.xpath(self.tags_xpath):
             for i, a in enumerate(li.iterdescendants('a')):
                 if i > 0:
-                    # we ignore the first category since it is almost always too broad
+                    # we ignore the first category since it is almost always
+                    # too broad
                     raw = (a.text or '').strip().replace(',', ';')
                     lraw = icu_lower(raw)
                     tokens = frozenset(lraw.split())

@@ -674,12 +704,14 @@ class Worker(Thread):  # Get details {{{
             if url:
                 return url
 
-        imgs = root.xpath('//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
+        imgs = root.xpath(
+            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
         if not imgs:
             imgs = (
                 root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
                 root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
-                root.xpath('//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
+                root.xpath(
+                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
             )
         for img in imgs:
             try:
@@ -887,7 +919,8 @@ class Amazon(Source):
         return 'https://www.amazon.%s/' % domain
 
     def _get_book_url(self, identifiers):  # {{{
-        domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
+        domain, asin = self.get_domain_and_asin(
+            identifiers, extra_domains=('in', 'au', 'ca'))
         if domain and asin:
             url = None
             r = self.referrer_for_domain(domain)

@@ -955,7 +988,7 @@ class Amazon(Source):
         return udomain
 
     def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
-                     domain=None):
+                     domain=None, for_amazon=True):
         from urllib import urlencode
         if domain is None:
             domain = self.domain

@@ -965,6 +998,7 @@ class Amazon(Source):
             domain = idomain
 
         # See the amazon detailed search page to get all options
+        terms = []
         q = {'search-alias': 'aps',
              'unfiltered': '1',
              }

@@ -978,26 +1012,34 @@ class Amazon(Source):
 
         if asin is not None:
             q['field-keywords'] = asin
+            terms.append(asin)
         elif isbn is not None:
             q['field-isbn'] = isbn
+            terms.append(isbn)
         else:
             # Only return book results
-            q['search-alias'] = {'br':'digital-text', 'nl':'aps'}.get(domain, 'stripbooks')
+            q['search-alias'] = {'br': 'digital-text',
+                                 'nl': 'aps'}.get(domain, 'stripbooks')
             if title:
                 title_tokens = list(self.get_title_tokens(title))
                 if title_tokens:
                     q['field-title'] = ' '.join(title_tokens)
+                    terms.extend(title_tokens)
             if authors:
                 author_tokens = self.get_author_tokens(authors,
                         only_first_author=True)
                 if author_tokens:
                     q['field-author'] = ' '.join(author_tokens)
+                    terms.extend(author_tokens)
 
             if not ('field-keywords' in q or 'field-isbn' in q or
                     ('field-title' in q)):
                 # Insufficient metadata to make an identify query
                 return None, None
 
+        if not for_amazon:
+            return terms, domain
+
         # magic parameter to enable Japanese Shift_JIS encoding.
         if domain == 'jp':
             q['__mk_ja_JP'] = u'カタカナ'
@@ -1018,7 +1060,8 @@ class Amazon(Source):
         encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
             'ignore')) for x, y in
             q.iteritems()])
-        url = 'https://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q)
+        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
+            domain) + urlencode(encoded_q)
         return url, domain
 
     # }}}

@@ -1043,7 +1086,8 @@ class Amazon(Source):
 
         def title_ok(title):
             title = title.lower()
-            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
+            bad = ['bulk pack', '[audiobook]', '[audio cd]',
+                   '(a book companion)', '( slipcase with door )', ': free sampler']
             if self.domain == 'com':
                 bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
             for x in bad:

@@ -1059,7 +1103,8 @@ class Amazon(Source):
                 if title_ok(title):
                     url = a.get('href')
                     if url.startswith('/'):
-                        url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                        url = 'https://www.amazon.%s%s' % (
+                            self.get_website_domain(domain), url)
                     matches.append(url)
 
         if not matches:

@@ -1074,7 +1119,8 @@ class Amazon(Source):
                 if title_ok(title):
                     url = a.get('href')
                     if url.startswith('/'):
-                        url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                        url = 'https://www.amazon.%s%s' % (
+                            self.get_website_domain(domain), url)
                     matches.append(url)
                     break
 

@@ -1088,7 +1134,8 @@ class Amazon(Source):
                 if title_ok(title):
                     url = a.get('href')
                     if url.startswith('/'):
-                        url = 'https://www.amazon.%s%s' % (self.get_website_domain(domain), url)
+                        url = 'https://www.amazon.%s%s' % (
+                            self.get_website_domain(domain), url)
                     matches.append(url)
                     break
         if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):

@@ -1101,7 +1148,7 @@ class Amazon(Source):
         return matches[:3]
     # }}}
 
-    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):
+    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
         import html5lib
         from calibre.utils.cleantext import clean_ascii_chars
         from calibre.ebooks.chardet import xml_to_unicode
@@ -1152,10 +1199,42 @@ class Amazon(Source):
 
         matches = self.parse_results_page(root, domain)
 
-        return matches, query, domain
+        return matches, query, domain, None
+    # }}}
+
+    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
+        terms, domain = self.create_query(log, title=title, authors=authors,
+                                          identifiers=identifiers, for_amazon=False)
+        site = self.referrer_for_domain(
+            domain)[len('https://'):].partition('/')[0]
+        se = search_engines_module()
+        matches = []
+        for result in se.ddg_search(terms, site, log=log, br=br, timeout=timeout):
+            if abort.is_set():
+                return matches, terms, domain, None
+
+            purl = urlparse(result.url)
+            if '/dp/' in purl.path and site in purl.netloc:
+                url = result.cached_url
+                if url is None:
+                    url = se.wayback_machine_cached_url(
+                        result.url, br, timeout=timeout)
+                if url is None:
+                    log('Failed to find cached page for:', result.url)
+                    continue
+                if url not in matches:
+                    matches.append(url)
+                if len(matches) >= 3:
+                    break
+            else:
+                log('Skipping non-book result:', result)
+        if not matches:
+            log('No search engine results for terms:', ' '.join(terms))
+        return matches, terms, domain, se.wayback_url_processor
+    # }}}
 
     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
-                 identifiers={}, timeout=30):
+                 identifiers={}, timeout=60):
         '''
         Note this method will retry without identifiers automatically if no
         match is found with identifiers.
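The search_search_engine() method added above keeps only results that look like Amazon product pages: the hit must be on the Amazon site being searched and its path must contain the /dp/ product marker, and at most three matches are collected. A rough standalone sketch of that filter (the helper name and the sample URLs are illustrative, not from the commit):

# Rough sketch of the result filter used by search_search_engine() above.
try:
    from urllib.parse import urlparse
except ImportError:  # Python 2, as in the module itself
    from urlparse import urlparse


def looks_like_product_page(url, site='www.amazon.com'):
    # Keep a search hit only if it is on the queried Amazon site and its
    # path contains the /dp/ product-page marker.
    purl = urlparse(url)
    return '/dp/' in purl.path and site in purl.netloc


if __name__ == '__main__':
    candidates = [
        'https://www.amazon.com/dp/B0085UEQDO',      # kept
        'https://www.amazon.com/gp/help/customer',   # skipped: not a /dp/ page
        'https://example.com/dp/B0085UEQDO',         # skipped: wrong site
    ]
    matches = [u for u in candidates if looks_like_product_page(u)][:3]
    print(matches)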
@@ -1165,23 +1244,38 @@ class Amazon(Source):
 
         udata = self._get_book_url(identifiers)
         br = self.browser
+        log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
-            preparsed_root = parse_details_page(durl, log, timeout, br, domain)
+            cover_url_processor = None
+            if USE_SEARCH_ENGINE:
+                se = search_engines_module()
+                durl = se.wayback_machine_cached_url(
+                    durl, br, timeout=timeout, log=log)
+                cover_url_processor = se.wayback_url_processor
+                if durl is None:
+                    log('Failed to get cached URL for asin:', asin)
+            else:
+                preparsed_root = parse_details_page(
+                    durl, log, timeout, br, domain)
             if preparsed_root is not None:
                 qasin = parse_asin(preparsed_root[1], log, durl)
                 if qasin == asin:
-                    w = Worker(durl, result_queue, br, log, 0, domain, self, testing=testing, preparsed_root=preparsed_root)
+                    w = Worker(durl, result_queue, br, log, 0, domain,
+                               self, testing=testing, preparsed_root=preparsed_root, cover_url_processor=cover_url_processor)
                     try:
                         w.get_details()
                         return
                     except Exception:
-                        log.exception('get_details failed for url: %r'%durl)
+                        log.exception(
+                            'get_details failed for url: %r' % durl)
+        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
         try:
-            matches, query, domain = self.search_amazon(br, testing, log, abort, title, authors, identifiers, timeout)
+            matches, query, domain, cover_url_processor = func(
+                br, testing, log, abort, title, authors, identifiers, timeout)
         except SearchFailed:
             return
 
@@ -1198,8 +1292,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r' % query)
             return
 
-        workers = [Worker(url, result_queue, br, log, i, domain, self,
-                          testing=testing) for i, url in enumerate(matches)]
+        workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing,
+                          cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]
 
         for w in workers:
             # Don't send all requests at the same time

@@ -1223,7 +1317,7 @@ class Amazon(Source):
     # }}}
 
     def download_cover(self, log, result_queue, abort,  # {{{
-                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
+                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
         cached_url = self.get_cached_cover_url(identifiers)
         if cached_url is None:
             log.info('No cached cover found, running identify')

@@ -1255,7 +1349,8 @@ class Amazon(Source):
         log('Downloading cover from:', cached_url)
         try:
             time.sleep(1)
-            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
+            cdata = self.browser.open_novisit(
+                cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
@@ -1263,29 +1358,34 @@ class Amazon(Source):
 
 
 if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug src/calibre/ebooks/metadata/sources/amazon.py
+    # To run these test use: calibre-debug
+    # src/calibre/ebooks/metadata/sources/amazon.py
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
             isbn_test, title_test, authors_test, comments_test, series_test)
     com_tests = [  # {{{
 
         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
-            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus', exact=True), series_test('Heroes of Olympus', 5)]
+            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
+                        exact=True), series_test('Heroes of Olympus', 5)]
         ),
 
         (  # Kindle edition with series
             {'identifiers': {'amazon': 'B0085UEQDO'}},
-            [title_test('Three Parts Dead', exact=True), series_test('Craft Sequence', 1)]
+            [title_test('Three Parts Dead', exact=True),
+             series_test('Craft Sequence', 1)]
         ),
 
         (  # A kindle edition that does not appear in the search results when searching by ASIN
             {'identifiers': {'amazon': 'B004JHY6OG'}},
-            [title_test('The Heroes: A First Law Novel (First Law World 2)', exact=True)]
+            [title_test(
+                'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
         ),
 
         (  # + in title and uses id="main-image" for cover
             {'identifiers': {'amazon': '1933988770'}},
-            [title_test('C++ Concurrency in Action: Practical Multithreading', exact=True)]
+            [title_test(
+                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
         ),
 
 

@@ -1426,7 +1526,8 @@ if __name__ == '__main__':  # tests {{{
     cn_tests = [  # {{{
         (
             {'identifiers': {'isbn': '9787115369512'}},
-            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), authors_test(['[美]sam Williams', '邓楠,李凡希'])]
+            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
+             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
         ),
         (
             {'title': '爱上Raspberry Pi'},

@@ -1440,12 +1541,14 @@ if __name__ == '__main__':  # tests {{{
     ca_tests = [  # {{{
         (  # Paperback with series
             {'identifiers': {'isbn': '9781623808747'}},
-            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])]
+            [title_test('Parting Shot', exact=True),
+             authors_test(['Mary Calmes'])]
        ),
        (  # # in title
            {'title': 'Expert C# 2008 Business Objects',
             'authors': ['Lhotka']},
-           [title_test('Expert C# 2008 Business Objects'), authors_test(['Rockford Lhotka'])]
+           [title_test('Expert C# 2008 Business Objects'),
+            authors_test(['Rockford Lhotka'])]
        ),
        (  # noscript description
            {'identifiers': {'amazon_ca': '162380874X'}},
src/calibre/ebooks/metadata/sources/search_engines.py

@@ -46,12 +46,12 @@ def parse_html(raw):
     return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
 
 
-def query(br, url, key, dump_raw=None, limit=1, parser=parse_html):
+def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60):
     delta = monotonic() - last_visited[key]
     if delta < limit and delta > 0:
         time.sleep(delta)
     try:
-        raw = br.open_novisit(url).read()
+        raw = br.open_novisit(url, timeout=timeout).read()
     finally:
         last_visited[key] = monotonic()
     if dump_raw is not None:

@@ -80,20 +80,29 @@ def ddg_href(url):
     return url
 
 
-def wayback_machine_cached_url(url, br=None):
+def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
     q = quote_term(url)
     br = br or browser()
     data = query(br, 'https://archive.org/wayback/available?url=' +
-                 q, 'wayback', parser=json.loads, limit=0.25)
+                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
     try:
         closest = data['archived_snapshots']['closest']
     except KeyError:
-        return
-    if closest['available']:
-        return closest['url']
+        pass
+    else:
+        if closest['available']:
+            return closest['url']
+    from pprint import pformat
+    log('Response from wayback machine:', pformat(data))
 
 
-def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
+def wayback_url_processor(url):
+    if url.startswith('/'):
+        url = 'https://web.archive.org' + url
+    return url
+
+
+def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     # https://duck.co/help/results/syntax
     terms = map(ddg_term, terms)
     terms = [quote_term(t) for t in terms]
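The new wayback_url_processor() exists because pages served from web.archive.org link to images (including covers) with root-relative paths; the Worker applies it via cover_url_processor to rebase such paths. A small usage sketch (the snapshot timestamp and image path are invented for illustration):

# Usage sketch for wayback_url_processor(): a root-relative link scraped from
# an archived page is rebased onto web.archive.org; absolute URLs pass through.
def wayback_url_processor(url):
    if url.startswith('/'):
        url = 'https://web.archive.org' + url
    return url


print(wayback_url_processor('/web/20180815000000im_/https://images-na.ssl-images-amazon.com/images/I/cover.jpg'))
# -> https://web.archive.org/web/20180815000000im_/https://images-na.ssl-images-amazon.com/images/I/cover.jpg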
@@ -104,7 +113,7 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None):
         q=q, kp=1 if safe_search else -1)
     log('Making ddg query: ' + url)
     br = br or browser()
-    root = query(br, url, 'ddg', dump_raw)
+    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
         ans.append(Result(ddg_href(a.get('href')), etree.tostring(
|
@ -14,7 +14,6 @@ from threading import Thread
|
|||||||
import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
|
import calibre.ebooks.metadata.sources.search_engines as builtin_search_engines
|
||||||
from calibre import as_unicode, prints
|
from calibre import as_unicode, prints
|
||||||
from calibre.constants import DEBUG, numeric_version
|
from calibre.constants import DEBUG, numeric_version
|
||||||
from calibre.customize.ui import patch_metadata_plugins
|
|
||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
from calibre.utils.config import JSONConfig
|
from calibre.utils.config import JSONConfig
|
||||||
from calibre.utils.https import get_https_resource_securely
|
from calibre.utils.https import get_https_resource_securely
|
||||||
@ -59,6 +58,7 @@ def patch_search_engines(src):
|
|||||||
|
|
||||||
|
|
||||||
def patch_plugins():
|
def patch_plugins():
|
||||||
|
from calibre.customize.ui import patch_metadata_plugins
|
||||||
patches = {}
|
patches = {}
|
||||||
for name, val in cache.iteritems():
|
for name, val in cache.iteritems():
|
||||||
if name == 'hashes':
|
if name == 'hashes':
|
||||||
|