Amazon metadata download: Fallback to using DDG if Google starts returning too many request errors
commit cd1a36c5fe
parent e82a9274e7
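The shape of the change: the Amazon plugin locates cached Amazon product pages through a web search engine, and when Google answers the search with HTTP 429 (Too Many Requests) it now retries the same query once through DuckDuckGo. Below is a minimal, self-contained sketch of that pattern, with illustrative stand-ins for calibre's se.google_search and se.ddg_search (calibre itself catches mechanize's HTTPError; the sketch uses urllib's):

from urllib.error import HTTPError


def google_search(terms):
    # Stand-in for se.google_search: pretend Google is rate-limiting us.
    raise HTTPError('https://www.google.com/search', 429, 'Too Many Requests', {}, None)


def ddg_search(terms):
    # Stand-in for se.ddg_search: always succeeds.
    return ['cached product page'], 'https://duckduckgo.com/html/?q=' + terms


def search_with_fallback(terms):
    sfunc = google_search
    try:
        return sfunc(terms)
    except HTTPError as err:
        if err.code == 429 and sfunc is google_search:
            # Too many requests from Google: retry the same query once via DuckDuckGo.
            return ddg_search(terms)
        raise


results, qurl = search_with_fallback('calibre')  # -> results from ddg_search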
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -18,6 +18,8 @@ try:
 except ImportError:
     from urlparse import urlparse
 
+from mechanize import HTTPError
+
 from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
@@ -78,23 +80,32 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
     from lxml.html import tostring
-    log('Getting details from:', url)
     try:
-        raw = browser.open_novisit(url, timeout=timeout).read().strip()
-    except Exception as e:
-        if callable(getattr(e, 'getcode', None)) and \
-                e.getcode() == 404:
-            log.error('URL malformed: %r' % url)
-            return
-        attr = getattr(e, 'args', [None])
-        attr = attr if attr else [None]
-        if isinstance(attr[0], socket.timeout):
-            msg = 'Details page timed out. Try again later.'
-            log.error(msg)
-        else:
-            msg = 'Failed to make details query: %r' % url
-            log.exception(msg)
-        return
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
+    except Exception:
+        get_data_for_cached_url = lambda *a: None
+    raw = get_data_for_cached_url(url)
+    if raw:
+        log('Using cached details for url:', url)
+    else:
+        log('Downloading details from:', url)
+        try:
+            raw = browser.open_novisit(url, timeout=timeout).read().strip()
+        except Exception as e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('URL malformed: %r' % url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = 'Details page timed out. Try again later.'
+                log.error(msg)
+            else:
+                msg = 'Failed to make details query: %r' % url
+                log.exception(msg)
+            return
 
     oraw = raw
     if 'amazon.com.br' in url:
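The hunk above reworks parse_details_page to prefer a cached copy of the details page and only download when no cached data exists, degrading to a no-op cache lookup when the search-engines module cannot be imported. A simplified sketch of that flow (download here stands in for browser.open_novisit):

def fetch_details(url, download, log=print):
    try:
        from calibre.ebooks.metadata.sources.update import search_engines_module
        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
    except Exception:
        get_data_for_cached_url = lambda *a: None  # no cache outside calibre
    raw = get_data_for_cached_url(url)
    if raw:
        log('Using cached details for url:', url)
    else:
        log('Downloading details from:', url)
        raw = download(url)
    return raw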
@@ -404,7 +415,7 @@ class Worker(Thread):  # Get details {{{
             with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                              suffix='.html', delete=False) as f:
                 f.write(raw)
-            print('Downloaded html for', asin, 'saved in', f.name)
+            print('Downloaded HTML for', asin, 'saved in', f.name)
 
         try:
             title = self.parse_title(root)
@@ -992,7 +1003,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 28)
+    version = (1, 3, 0)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1027,6 +1038,7 @@ class Amazon(Source):
         'bing': _('Bing search cache'),
         'google': _('Google search cache'),
         'wayback': _('Wayback machine cache (slow)'),
+        'ddg': _('DuckDuckGo search and Google cache'),
     }
 
     options = (
@@ -1453,20 +1465,30 @@ class Amazon(Source):
 
     def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
         from calibre.ebooks.metadata.sources.update import search_engines_module
-        se = search_engines_module()
         terms, domain = self.create_query(log, title=title, authors=authors,
                                           identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
         matches = []
+        se = search_engines_module()
         server = override_server or self.server
-        if server in ('bing',):
+        urlproc, sfunc = se.google_url_processor, se.google_search
+        if server == 'bing':
             urlproc, sfunc = se.bing_url_processor, se.bing_search
-        elif server in ('auto', 'google'):
-            urlproc, sfunc = se.google_url_processor, se.google_search
         elif server == 'wayback':
             urlproc, sfunc = se.wayback_url_processor, se.ddg_search
-        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        elif server == 'ddg':
+            urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+        try:
+            results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        except HTTPError as err:
+            if err.code == 429 and sfunc is se.google_search:
+                log('Got too many requests error from Google, trying via DuckDuckGo')
+                urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+            else:
+                raise
+
         br.set_current_header('Referer', qurl)
         for result in results:
             if abort.is_set():
@@ -1476,8 +1498,7 @@ class Amazon(Source):
             if '/dp/' in purl.path and site in purl.netloc:
                 url = result.cached_url
                 if url is None:
-                    url = se.wayback_machine_cached_url(
-                        result.url, br, timeout=timeout)
+                    url = se.get_cached_url(result.url, br, timeout=timeout)
                 if url is None:
                     log('Failed to find cached page for:', result.url)
                     continue
--- a/src/calibre/ebooks/metadata/sources/search_engines.py
+++ b/src/calibre/ebooks/metadata/sources/search_engines.py
@@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 1, 0)
+current_version = (1, 1, 1)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -170,7 +170,10 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw
     root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        try:
+            ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        except KeyError:
+            log('Failed to find ddg href in:', a.get('href'))
     return ans, url
 
 
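The new KeyError guard above matters because DuckDuckGo result anchors are redirect links that carry the real target in a uddg query parameter, and an anchor missing that parameter makes the href unwrapping fail on the dictionary lookup. A rough sketch of that unwrapping (an approximation, not calibre's exact ddg_href):

from urllib.parse import parse_qs


def unwrap_ddg_href(href):
    # DDG hrefs look like '/l/?kh=-1&uddg=<percent-encoded target URL>';
    # the ['uddg'] lookup raises KeyError when the parameter is absent.
    if href.startswith('/'):
        href = parse_qs(href.partition('?')[2])['uddg'][0]
    return href


print(unwrap_ddg_href('/l/?kh=-1&uddg=https%3A%2F%2Fwww.amazon.com%2Fdp%2FB000FC0SIS'))
# -> https://www.amazon.com/dp/B000FC0SIS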
@@ -270,7 +273,7 @@ def google_get_cached_url(url, br=None, log=prints, timeout=60):
     cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
     br = google_specialize_browser(br or browser())
     try:
-        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
         log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
     else: