Amazon metadata download: Fallback to using DDG if Google starts returning too many request errors

Kovid Goyal 2022-07-31 13:57:01 +05:30
parent e82a9274e7
commit cd1a36c5fe
2 changed files with 51 additions and 27 deletions
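
The change is easiest to see as a pattern: Google is still queried first, but an HTTP 429 ("too many requests") response now reroutes the search through DuckDuckGo instead of failing the metadata download. A minimal sketch of that pattern, using mechanize's HTTPError as the diff does; the function names here are illustrative stand-ins, not calibre's API:

    from mechanize import HTTPError

    def search_with_fallback(primary, fallback, terms):
        # Try the primary engine first; on HTTP 429 ("too many requests")
        # retry once via the fallback instead of aborting the whole query.
        try:
            return primary(terms)
        except HTTPError as err:
            if err.code == 429:
                return fallback(terms)
            raise
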

src/calibre/ebooks/metadata/sources/amazon.py

@@ -18,6 +18,8 @@ try:
 except ImportError:
     from urlparse import urlparse
 
+from mechanize import HTTPError
+
 from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
@@ -78,23 +80,32 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
     from lxml.html import tostring
-    log('Getting details from:', url)
     try:
-        raw = browser.open_novisit(url, timeout=timeout).read().strip()
-    except Exception as e:
-        if callable(getattr(e, 'getcode', None)) and \
-                e.getcode() == 404:
-            log.error('URL malformed: %r' % url)
-            return
-        attr = getattr(e, 'args', [None])
-        attr = attr if attr else [None]
-        if isinstance(attr[0], socket.timeout):
-            msg = 'Details page timed out. Try again later.'
-            log.error(msg)
-        else:
-            msg = 'Failed to make details query: %r' % url
-            log.exception(msg)
-        return
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
+    except Exception:
+        get_data_for_cached_url = lambda *a: None
+    raw = get_data_for_cached_url(url)
+    if raw:
+        log('Using cached details for url:', url)
+    else:
+        log('Downloading details from:', url)
+        try:
+            raw = browser.open_novisit(url, timeout=timeout).read().strip()
+        except Exception as e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('URL malformed: %r' % url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = 'Details page timed out. Try again later.'
+                log.error(msg)
+            else:
+                msg = 'Failed to make details query: %r' % url
+                log.exception(msg)
+            return
 
     oraw = raw
     if 'amazon.com.br' in url:
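
parse_details_page() now consults the search-engine cache before touching the network; only a cache miss triggers the old download path. The shape of that logic, with a plain dict standing in for get_data_for_cached_url() (a sketch of the control flow, not calibre's implementation):

    _details_cache = {}  # hypothetical stand-in for the search-engine cache

    def fetch_details(url, browser, timeout=60):
        raw = _details_cache.get(url)
        if raw:
            return raw  # cache hit: no network traffic at all
        # Cache miss: download exactly as the old code did, then remember it.
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
        _details_cache[url] = raw
        return raw
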
@@ -404,7 +415,7 @@ class Worker(Thread):  # Get details {{{
         with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                          suffix='.html', delete=False) as f:
             f.write(raw)
-        print('Downloaded html for', asin, 'saved in', f.name)
+        print('Downloaded HTML for', asin, 'saved in', f.name)
 
         try:
             title = self.parse_title(root)
@@ -992,7 +1003,7 @@ class Worker(Thread):  # Get details {{{
 
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 28)
+    version = (1, 3, 0)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
@@ -1027,6 +1038,7 @@ class Amazon(Source):
         'bing': _('Bing search cache'),
         'google': _('Google search cache'),
         'wayback': _('Wayback machine cache (slow)'),
+        'ddg': _('DuckDuckGo search and Google cache'),
     }
 
     options = (
@@ -1453,20 +1465,30 @@ class Amazon(Source):
     def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
         from calibre.ebooks.metadata.sources.update import search_engines_module
+        se = search_engines_module()
         terms, domain = self.create_query(log, title=title, authors=authors,
                                           identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
         matches = []
-        se = search_engines_module()
         server = override_server or self.server
-        if server in ('bing',):
+        urlproc, sfunc = se.google_url_processor, se.google_search
+        if server == 'bing':
             urlproc, sfunc = se.bing_url_processor, se.bing_search
-        elif server in ('auto', 'google'):
-            urlproc, sfunc = se.google_url_processor, se.google_search
         elif server == 'wayback':
             urlproc, sfunc = se.wayback_url_processor, se.ddg_search
-        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        elif server == 'ddg':
+            urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+        try:
+            results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        except HTTPError as err:
+            if err.code == 429 and sfunc is se.google_search:
+                log('Got too many requests error from Google, trying via DuckDuckGo')
+                urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+            else:
+                raise
         br.set_current_header('Referer', qurl)
         for result in results:
             if abort.is_set():
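
Note the guard `sfunc is se.google_search`: the automatic DDG retry fires only when Google was the engine actually in use, so a 429 from an explicitly selected Bing or Wayback engine still propagates to the caller. A self-contained toy illustrating just that identity check (every name below is a stand-in, not calibre code):

    class TooManyRequests(Exception):
        pass

    def google(terms):
        raise TooManyRequests(terms)

    def bing(terms):
        return ['bing result for ' + terms]

    def ddg(terms):
        return ['ddg result for ' + terms]

    def search(terms, server='auto'):
        sfunc = {'bing': bing, 'ddg': ddg}.get(server, google)
        try:
            return sfunc(terms)
        except TooManyRequests:
            if sfunc is google:  # only the implicit Google choice falls back
                return ddg(terms)
            raise

Here search('dune') transparently returns the DDG result, while an explicit server='bing' would surface Bing's own failure unchanged.
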
@@ -1476,8 +1498,7 @@ class Amazon(Source):
             if '/dp/' in purl.path and site in purl.netloc:
                 url = result.cached_url
                 if url is None:
-                    url = se.wayback_machine_cached_url(
-                        result.url, br, timeout=timeout)
+                    url = se.get_cached_url(result.url, br, timeout=timeout)
                 if url is None:
                     log('Failed to find cached page for:', result.url)
                     continue
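
se.get_cached_url() itself is not shown in this diff; presumably it consults the available caches in turn rather than only the Wayback Machine. A guess at that shape, assuming the per-engine helpers (google_get_cached_url, wayback_machine_cached_url, both in search_engines.py) return None on a miss:

    def get_cached_url(url, br, timeout=60):
        # Hypothetical: ask each cache lookup in turn, first hit wins.
        for lookup in (google_get_cached_url, wayback_machine_cached_url):
            cached = lookup(url, br, timeout=timeout)
            if cached is not None:
                return cached
        return None
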

src/calibre/ebooks/metadata/sources/search_engines.py

@@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 1, 0)
+current_version = (1, 1, 1)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -170,7 +170,10 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
     root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        try:
+            ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        except KeyError:
+            log('Failed to find ddg href in:', a.get('href'))
     return ans, url
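
The new try/except makes sense once you know how ddg_href() (not shown in this diff) unwraps DuckDuckGo's redirect links: the real target lives in the uddg query parameter, and indexing the parsed parameters raises KeyError when a result link lacks it. Roughly, as a sketch of the idea rather than calibre's exact code:

    from urllib.parse import parse_qs, urlparse

    def ddg_href_sketch(href):
        # DDG result links carry the encoded target URL in 'uddg';
        # indexing the parsed query raises KeyError when it is missing.
        return parse_qs(urlparse(href).query)['uddg'][0]
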
@@ -270,7 +273,7 @@ def google_get_cached_url(url, br=None, log=prints, timeout=60):
     cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
     br = google_specialize_browser(br or browser())
     try:
-        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
         log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
     else:
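
The parser change here is about type consistency: encoding the cached page to UTF-8 bytes means the cache-hit path in parse_details_page() hands back the same kind of object as browser.open_novisit(url).read() does on a live download. A one-liner capturing that invariant (illustrative only, not a calibre helper):

    def as_bytes(data):
        # Normalize cached (str) and downloaded (bytes) payloads to bytes.
        return data.encode('utf-8') if isinstance(data, str) else data
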