Throttle amazon requests some more

Hopefully this will avoid triggering their new bot detection.
This commit is contained in:
Kovid Goyal 2017-03-01 09:11:34 +05:30
parent f143d1095a
commit 58f17e9589
2 changed files with 51 additions and 30 deletions

View File

@ -12,18 +12,22 @@ from threading import Thread
from Queue import Queue, Empty from Queue import Queue, Empty
from calibre import as_unicode, browser, random_user_agent from calibre import as_unicode, browser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase, from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
fixauthors) fixauthors)
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
class CaptchaError(Exception): class CaptchaError(Exception):
pass pass
ua_index = -1
def parse_details_page(url, log, timeout, browser, domain): def parse_details_page(url, log, timeout, browser, domain):
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
@ -104,7 +108,7 @@ class Worker(Thread): # Get details {{{
self.url, self.result_queue = url, result_queue self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout self.log, self.timeout = log, timeout
self.relevance, self.plugin = relevance, plugin self.relevance, self.plugin = relevance, plugin
self.browser = browser.clone_browser() self.browser = browser
self.cover_url = self.amazon_id = self.isbn = None self.cover_url = self.amazon_id = self.isbn = None
self.domain = domain self.domain = domain
from lxml.html import tostring from lxml.html import tostring
@ -299,7 +303,6 @@ class Worker(Thread): # Get details {{{
self.log.exception('get_details failed for url: %r'%self.url) self.log.exception('get_details failed for url: %r'%self.url)
def get_details(self): def get_details(self):
if self.preparsed_root is None: if self.preparsed_root is None:
raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain) raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
else: else:
@ -833,14 +836,18 @@ class Amazon(Source):
@property @property
def browser(self): def browser(self):
if self._browser is None: global ua_index
self._browser = br = browser(user_agent=random_user_agent(allow_ie=False)) all_uas = all_user_agents()
br.set_handle_gzip(True) ua_index = (ua_index + 1) % len(all_uas)
br.addheaders += [ ua = all_uas[ua_index]
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'), self._browser = br = browser(user_agent=ua)
('Upgrade-Insecure-Requests', '1'), br.set_handle_gzip(True)
] br.addheaders += [
return self._browser.clone_browser() ('Accept', accept_header_for_ua(ua)),
('Upgrade-insecure-requests', '1'),
('Referer', self.referrer_for_domain()),
]
return br
def save_settings(self, *args, **kwargs): def save_settings(self, *args, **kwargs):
Source.save_settings(self, *args, **kwargs) Source.save_settings(self, *args, **kwargs)
@ -865,20 +872,23 @@ class Amazon(Source):
return domain, val return domain, val
return None, None return None, None
def referrer_for_domain(self, domain=None):
domain = domain or self.domain
if domain == 'uk':
return 'https://www.amazon.co.uk/'
if domain == 'br':
return 'https://www.amazon.com.br/'
if domain == 'au':
return 'https://www.amazon.com.au/'
return 'https://www.amazon.%s/'%domain
def _get_book_url(self, identifiers): # {{{ def _get_book_url(self, identifiers): # {{{
domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca')) domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
if domain and asin: if domain and asin:
url = None url = None
if domain == 'com': r = self.referrer_for_domain(domain)
url = 'https://amzn.com/'+asin if r is not None:
elif domain == 'uk': url = r + 'dp/' + asin
url = 'https://www.amazon.co.uk/dp/'+asin
elif domain == 'br':
url = 'https://www.amazon.com.br/dp/'+asin
elif domain == 'au':
url = 'https://www.amazon.com.au/dp/' + asin
else:
url = 'https://www.amazon.%s/dp/%s'%(domain, asin)
if url: if url:
idtype = 'amazon' if domain == 'com' else 'amazon_'+domain idtype = 'amazon' if domain == 'com' else 'amazon_'+domain
return domain, idtype, asin, url return domain, idtype, asin, url
@ -1082,9 +1092,9 @@ class Amazon(Source):
' profiling to block access to its website. As such this metadata plugin is' ' profiling to block access to its website. As such this metadata plugin is'
' unlikely to ever work reliably.') ' unlikely to ever work reliably.')
# Keep only the top 5 matches as the matches are sorted by relevance by # Keep only the top 3 matches as the matches are sorted by relevance by
# Amazon so lower matches are not likely to be very relevant # Amazon so lower matches are not likely to be very relevant
return matches[:5] return matches[:3]
# }}} # }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
@ -1099,9 +1109,11 @@ class Amazon(Source):
import html5lib import html5lib
testing = getattr(self, 'running_a_test', False) testing = getattr(self, 'running_a_test', False)
br = self.browser
udata = self._get_book_url(identifiers) udata = self._get_book_url(identifiers)
br = self.browser
if testing:
print('User-agent:', br.current_user_agent())
if udata is not None: if udata is not None:
# Try to directly get details page instead of running a search # Try to directly get details page instead of running a search
domain, idtype, asin, durl = udata domain, idtype, asin, durl = udata
@ -1121,8 +1133,6 @@ class Amazon(Source):
if query is None: if query is None:
log.error('Insufficient metadata to construct query') log.error('Insufficient metadata to construct query')
return return
if testing:
print ('Using user agent for amazon: %s'%self.user_agent)
try: try:
raw = br.open_novisit(query, timeout=timeout).read().strip() raw = br.open_novisit(query, timeout=timeout).read().strip()
except Exception as e: except Exception as e:
@ -1179,6 +1189,7 @@ class Amazon(Source):
if identifiers and title and authors: if identifiers and title and authors:
log('No matches found with identifiers, retrying using only' log('No matches found with identifiers, retrying using only'
' title and authors. Query: %r'%query) ' title and authors. Query: %r'%query)
time.sleep(1)
return self.identify(log, result_queue, abort, title=title, return self.identify(log, result_queue, abort, title=title,
authors=authors, timeout=timeout) authors=authors, timeout=timeout)
log.error('No matches found with query: %r'%query) log.error('No matches found with query: %r'%query)
@ -1188,9 +1199,11 @@ class Amazon(Source):
testing=testing) for i, url in enumerate(matches)] testing=testing) for i, url in enumerate(matches)]
for w in workers: for w in workers:
w.start()
# Don't send all requests at the same time # Don't send all requests at the same time
time.sleep(0.1) time.sleep(1)
w.start()
if abort.is_set():
return
while not abort.is_set(): while not abort.is_set():
a_worker_is_alive = False a_worker_is_alive = False
@ -1216,6 +1229,8 @@ class Amazon(Source):
identifiers=identifiers) identifiers=identifiers)
if abort.is_set(): if abort.is_set():
return return
if abort.is_set():
return
results = [] results = []
while True: while True:
try: try:
@ -1234,10 +1249,10 @@ class Amazon(Source):
if abort.is_set(): if abort.is_set():
return return
br = self.browser
log('Downloading cover from:', cached_url) log('Downloading cover from:', cached_url)
try: try:
cdata = br.open_novisit(cached_url, timeout=timeout).read() time.sleep(1)
cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
result_queue.put((self, cdata)) result_queue.put((self, cdata))
except: except:
log.exception('Failed to download cover from:', cached_url) log.exception('Failed to download cover from:', cached_url)

View File

@ -78,3 +78,9 @@ def all_user_agents():
def random_user_agent(): def random_user_agent():
return random.choice(all_user_agents()) return random.choice(all_user_agents())
def accept_header_for_ua(ua):
if 'Firefox/' in ua:
return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'