Mirror of https://github.com/kovidgoyal/calibre.git
Amazon metadata download: Fallback to using DDG if Google starts returning too many request errors
commit cd1a36c5fe (parent e82a9274e7)
src/calibre/ebooks/metadata/sources/amazon.py

@@ -18,6 +18,8 @@ try:
 except ImportError:
     from urlparse import urlparse
 
+from mechanize import HTTPError
+
 from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.book.base import Metadata
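
Note: the new mechanize import matters because errors raised by calibre's browser objects are mechanize HTTPError instances carrying the numeric HTTP status; the 429 fallback added further down keys off that attribute. A minimal illustration (the URL is a placeholder):

    from mechanize import Browser, HTTPError

    br = Browser()
    try:
        br.open('https://example.com/missing')  # placeholder URL
    except HTTPError as err:
        # err.code is the numeric HTTP status, e.g. 404 or 429
        print('server replied with HTTP', err.code)
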
@@ -78,23 +80,32 @@ def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
     from lxml.html import tostring
-    log('Getting details from:', url)
     try:
-        raw = browser.open_novisit(url, timeout=timeout).read().strip()
-    except Exception as e:
-        if callable(getattr(e, 'getcode', None)) and \
-                e.getcode() == 404:
-            log.error('URL malformed: %r' % url)
-            return
-        attr = getattr(e, 'args', [None])
-        attr = attr if attr else [None]
-        if isinstance(attr[0], socket.timeout):
-            msg = 'Details page timed out. Try again later.'
-            log.error(msg)
-        else:
-            msg = 'Failed to make details query: %r' % url
-            log.exception(msg)
-        return
+        from calibre.ebooks.metadata.sources.update import search_engines_module
+        get_data_for_cached_url = search_engines_module().get_data_for_cached_url
+    except Exception:
+        get_data_for_cached_url = lambda *a: None
+    raw = get_data_for_cached_url(url)
+    if raw:
+        log('Using cached details for url:', url)
+    else:
+        log('Downloading details from:', url)
+        try:
+            raw = browser.open_novisit(url, timeout=timeout).read().strip()
+        except Exception as e:
+            if callable(getattr(e, 'getcode', None)) and \
+                    e.getcode() == 404:
+                log.error('URL malformed: %r' % url)
+                return
+            attr = getattr(e, 'args', [None])
+            attr = attr if attr else [None]
+            if isinstance(attr[0], socket.timeout):
+                msg = 'Details page timed out. Try again later.'
+                log.error(msg)
+            else:
+                msg = 'Failed to make details query: %r' % url
+                log.exception(msg)
+            return
 
     oraw = raw
     if 'amazon.com.br' in url:
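
Note: parse_details_page() now consults the search-engine module's local cache of page data before touching the network, and the lazy import is wrapped in try/except so a missing or broken update module degrades to a plain live fetch. A standalone sketch of that cache-first shape (fetch_details, cache and download are illustrative stand-ins, not calibre API):

    def fetch_details(url, cache, download, log=print):
        # Prefer locally cached page data; fall back to a live download.
        raw = cache.get(url)
        if raw:
            log('Using cached details for url:', url)
            return raw
        log('Downloading details from:', url)
        return download(url)
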
@@ -404,7 +415,7 @@ class Worker(Thread):  # Get details {{{
             with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                              suffix='.html', delete=False) as f:
                 f.write(raw)
-            print('Downloaded html for', asin, 'saved in', f.name)
+            print('Downloaded HTML for', asin, 'saved in', f.name)
 
         try:
             title = self.parse_title(root)
@@ -992,7 +1003,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 28)
+    version = (1, 3, 0)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1027,6 +1038,7 @@ class Amazon(Source):
         'bing': _('Bing search cache'),
         'google': _('Google search cache'),
         'wayback': _('Wayback machine cache (slow)'),
+        'ddg': _('DuckDuckGo search and Google cache'),
     }
 
     options = (
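
Note: this label map backs the plugin's server option, so the new key is what makes DuckDuckGo user-selectable; the matching 'ddg' dispatch branch appears in search_search_engine() below. A hedged sketch of how such a map can double as the option's set of valid choices (SERVER_LABELS and valid_servers are illustrative names, not calibre's):

    SERVER_LABELS = {
        'bing': 'Bing search cache',
        'google': 'Google search cache',
        'wayback': 'Wayback machine cache (slow)',
        'ddg': 'DuckDuckGo search and Google cache',
    }
    # 'auto' is accepted alongside the explicit engine keys.
    valid_servers = ('auto',) + tuple(sorted(SERVER_LABELS))
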
@@ -1453,20 +1465,30 @@ class Amazon(Source):
 
     def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
         from calibre.ebooks.metadata.sources.update import search_engines_module
+        se = search_engines_module()
         terms, domain = self.create_query(log, title=title, authors=authors,
                                           identifiers=identifiers, for_amazon=False)
         site = self.referrer_for_domain(
             domain)[len('https://'):].partition('/')[0]
         matches = []
-        se = search_engines_module()
         server = override_server or self.server
-        if server in ('bing',):
+        urlproc, sfunc = se.google_url_processor, se.google_search
+        if server == 'bing':
             urlproc, sfunc = se.bing_url_processor, se.bing_search
-        elif server in ('auto', 'google'):
-            urlproc, sfunc = se.google_url_processor, se.google_search
         elif server == 'wayback':
             urlproc, sfunc = se.wayback_url_processor, se.ddg_search
-        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        elif server == 'ddg':
+            urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+        try:
+            results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+        except HTTPError as err:
+            if err.code == 429 and sfunc is se.google_search:
+                log('Got too many requests error from Google, trying via DuckDuckGo')
+                urlproc, sfunc = se.ddg_url_processor, se.ddg_search
+                results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
+            else:
+                raise
 
         br.set_current_header('Referer', qurl)
         for result in results:
             if abort.is_set():
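
Note: this is the heart of the commit. The search call is now wrapped so that an HTTP 429 ('too many requests') from Google swaps in DuckDuckGo and retries once, while any other HTTPError, or a 429 from a non-Google engine, still propagates. The same shape in isolation (search_with_fallback and the engine callables are stand-ins, not calibre API):

    from mechanize import HTTPError

    def search_with_fallback(primary, fallback, terms, **kw):
        # Retry once via the fallback engine, but only on HTTP 429
        # from the primary; everything else propagates unchanged.
        try:
            return primary(terms, **kw)
        except HTTPError as err:
            if err.code != 429:
                raise
            return fallback(terms, **kw)
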
@@ -1476,8 +1498,7 @@ class Amazon(Source):
             if '/dp/' in purl.path and site in purl.netloc:
                 url = result.cached_url
                 if url is None:
-                    url = se.wayback_machine_cached_url(
-                        result.url, br, timeout=timeout)
+                    url = se.get_cached_url(result.url, br, timeout=timeout)
                 if url is None:
                     log('Failed to find cached page for:', result.url)
                     continue
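
Note: se.wayback_machine_cached_url(), which only knew about the Wayback Machine, is replaced by the broader se.get_cached_url() from search_engines.py; presumably it tries the available caches in turn and returns the first hit. A hedged sketch of that first-hit-wins pattern (the helper below is an assumption, not the actual implementation):

    def get_cached_url_sketch(url, br, timeout=60, lookups=()):
        # `lookups` would be cache-lookup callables, e.g. the Google
        # cache first and the Wayback Machine second (assumption).
        for lookup in lookups:
            cached = lookup(url, br, timeout=timeout)
            if cached is not None:
                return cached
        return None
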
src/calibre/ebooks/metadata/sources/search_engines.py

@@ -25,7 +25,7 @@ from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.lock import ExclusiveFile
 from calibre.utils.random_ua import accept_header_for_ua
 
-current_version = (1, 1, 0)
+current_version = (1, 1, 1)
 minimum_calibre_version = (2, 80, 0)
 webcache = {}
 webcache_lock = Lock()
@@ -170,7 +170,10 @@ def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_ra
     root = query(br, url, 'ddg', dump_raw, timeout=timeout)
     ans = []
     for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
-        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        try:
+            ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
+        except KeyError:
+            log('Failed to find ddg href in:', a.get('href'))
     return ans, url
 
 
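
Note: ddg_href() unwraps DuckDuckGo's redirect links, and the new try/except means one malformed result link is logged and skipped instead of aborting the whole parse. A hedged sketch of why a KeyError can surface there (DDG result hrefs normally carry the real target in a uddg query parameter; the actual ddg_href implementation may differ):

    from urllib.parse import parse_qs, urlparse

    def ddg_href_sketch(href):
        # DDG wraps results as //duckduckgo.com/l/?uddg=<real-url>&...
        params = parse_qs(urlparse(href).query)
        return params['uddg'][0]  # KeyError when 'uddg' is absent
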
@@ -270,7 +273,7 @@ def google_get_cached_url(url, br=None, log=prints, timeout=60):
     cached_url = 'https://webcache.googleusercontent.com/search?q=cache:' + cu
     br = google_specialize_browser(br or browser())
     try:
-        raw = query(br, cached_url, 'google-cache', parser=lambda x: x, timeout=timeout)
+        raw = query(br, cached_url, 'google-cache', parser=lambda x: x.encode('utf-8'), timeout=timeout)
     except Exception as err:
         log('Failed to get cached URL from google for URL: {} with error: {}'.format(ourl, err))
     else:
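
Note: the parser passed to query() now encodes the fetched page to UTF-8 bytes instead of returning it unchanged, presumably so the cached-page path yields the same type as a live download, where browser .read() produces bytes. In short (assuming query() hands the parser decoded text):

    text = '<html>example</html>'                # what the parser receives (assumption)
    raw = (lambda x: x.encode('utf-8'))(text)    # the new parser lambda
    assert isinstance(raw, bytes)                # matches .read() output
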