mirror of https://github.com/kovidgoyal/calibre.git

Throttle amazon requests some more

Hopefully this will avoid triggering their new bot detection.

commit 58f17e9589
parent f143d1095a
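The throttling works on two fronts: every request now goes out under a different user agent (with a matching Accept header), and requests are spaced out with one-second sleeps instead of being fired back to back. A minimal standalone sketch of that pattern follows; `fetch`, `open_url`, and the agent pool are illustrative stand-ins, not calibre APIs:

import time
import itertools

# Illustrative pool; calibre draws its list from calibre.utils.random_ua
USER_AGENTS = [
    'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
]
_ua_pool = itertools.cycle(USER_AGENTS)

def fetch(open_url, url):
    # Space requests one second apart and rotate the user agent each time
    time.sleep(1)
    return open_url(url, user_agent=next(_ua_pool))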
src/calibre/ebooks/metadata/sources/amazon.py

@@ -12,18 +12,22 @@ from threading import Thread
 from Queue import Queue, Empty


-from calibre import as_unicode, browser, random_user_agent
+from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
         fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.random_ua import all_user_agents, accept_header_for_ua


 class CaptchaError(Exception):
     pass


+ua_index = -1
+
+
 def parse_details_page(url, log, timeout, browser, domain):
     from calibre.utils.cleantext import clean_ascii_chars
     from calibre.ebooks.chardet import xml_to_unicode
@@ -104,7 +108,7 @@ class Worker(Thread):  # Get details {{{
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
         self.relevance, self.plugin = relevance, plugin
-        self.browser = browser.clone_browser()
+        self.browser = browser
         self.cover_url = self.amazon_id = self.isbn = None
         self.domain = domain
         from lxml.html import tostring
@@ -299,7 +303,6 @@ class Worker(Thread):  # Get details {{{
             self.log.exception('get_details failed for url: %r'%self.url)

     def get_details(self):
-
         if self.preparsed_root is None:
             raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
         else:
@@ -833,14 +836,18 @@ class Amazon(Source):

     @property
     def browser(self):
-        if self._browser is None:
-            self._browser = br = browser(user_agent=random_user_agent(allow_ie=False))
-            br.set_handle_gzip(True)
-            br.addheaders += [
-                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
-                ('Upgrade-Insecure-Requests', '1'),
-            ]
-        return self._browser.clone_browser()
+        global ua_index
+        all_uas = all_user_agents()
+        ua_index = (ua_index + 1) % len(all_uas)
+        ua = all_uas[ua_index]
+        self._browser = br = browser(user_agent=ua)
+        br.set_handle_gzip(True)
+        br.addheaders += [
+            ('Accept', accept_header_for_ua(ua)),
+            ('Upgrade-insecure-requests', '1'),
+            ('Referer', self.referrer_for_domain()),
+        ]
+        return br

     def save_settings(self, *args, **kwargs):
         Source.save_settings(self, *args, **kwargs)
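The rewritten property builds a fresh browser on every access, advancing a module-level index so that successive requests walk through the whole user-agent pool in round-robin order. The wrap-around is plain modular arithmetic; a self-contained sketch of just the index logic:

ua_index = -1  # module-level, as in the patch

def next_ua_index(pool_size):
    # Starting from -1, yields 0, 1, ..., pool_size - 1, then wraps to 0
    global ua_index
    ua_index = (ua_index + 1) % pool_size
    return ua_index

print([next_ua_index(3) for _ in range(5)])  # [0, 1, 2, 0, 1]

Note also that `return br` hands back the newly built browser itself, where the old code returned a clone of a single cached browser, so every access to `self.browser` now yields a distinct identity.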
@@ -865,20 +872,23 @@ class Amazon(Source):
                     return domain, val
         return None, None

+    def referrer_for_domain(self, domain=None):
+        domain = domain or self.domain
+        if domain == 'uk':
+            return 'https://www.amazon.co.uk/'
+        if domain == 'br':
+            return 'https://www.amazon.com.br/'
+        if domain == 'au':
+            return 'https://www.amazon.com.au/'
+        return 'https://www.amazon.%s/'%domain
+
     def _get_book_url(self, identifiers):  # {{{
         domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
         if domain and asin:
             url = None
-            if domain == 'com':
-                url = 'https://amzn.com/'+asin
-            elif domain == 'uk':
-                url = 'https://www.amazon.co.uk/dp/'+asin
-            elif domain == 'br':
-                url = 'https://www.amazon.com.br/dp/'+asin
-            elif domain == 'au':
-                url = 'https://www.amazon.com.au/dp/' + asin
-            else:
-                url = 'https://www.amazon.%s/dp/%s'%(domain, asin)
+            r = self.referrer_for_domain(domain)
+            if r is not None:
+                url = r + 'dp/' + asin
             if url:
                 idtype = 'amazon' if domain == 'com' else 'amazon_'+domain
                 return domain, idtype, asin, url
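With this change `_get_book_url` derives the product URL from the same per-domain referrer that the browser sends, so the two can no longer drift apart (the `com` case also moves from the `amzn.com` shortener to the full host). A quick check of the construction, using a trimmed module-level copy of the method and a placeholder ASIN:

def referrer_for_domain(domain):
    # Trimmed copy of the new method, for illustration only
    if domain == 'uk':
        return 'https://www.amazon.co.uk/'
    if domain == 'br':
        return 'https://www.amazon.com.br/'
    if domain == 'au':
        return 'https://www.amazon.com.au/'
    return 'https://www.amazon.%s/' % domain

asin = 'B000000000'  # placeholder, not a real ASIN
print(referrer_for_domain('uk') + 'dp/' + asin)   # https://www.amazon.co.uk/dp/B000000000
print(referrer_for_domain('com') + 'dp/' + asin)  # https://www.amazon.com/dp/B000000000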
@@ -1082,9 +1092,9 @@ class Amazon(Source):
             ' profiling to block access to its website. As such this metadata plugin is'
             ' unlikely to ever work reliably.')

-        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Keep only the top 3 matches as the matches are sorted by relevance by
         # Amazon so lower matches are not likely to be very relevant
-        return matches[:5]
+        return matches[:3]
     # }}}

     def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@@ -1099,9 +1109,11 @@ class Amazon(Source):
         import html5lib

         testing = getattr(self, 'running_a_test', False)
-        br = self.browser

         udata = self._get_book_url(identifiers)
+        br = self.browser
+        if testing:
+            print('User-agent:', br.current_user_agent())
         if udata is not None:
             # Try to directly get details page instead of running a search
             domain, idtype, asin, durl = udata
@@ -1121,8 +1133,6 @@ class Amazon(Source):
         if query is None:
             log.error('Insufficient metadata to construct query')
             return
-        if testing:
-            print ('Using user agent for amazon: %s'%self.user_agent)
         try:
             raw = br.open_novisit(query, timeout=timeout).read().strip()
         except Exception as e:
@@ -1179,6 +1189,7 @@ class Amazon(Source):
             if identifiers and title and authors:
                 log('No matches found with identifiers, retrying using only'
                     ' title and authors. Query: %r'%query)
+                time.sleep(1)
                 return self.identify(log, result_queue, abort, title=title,
                         authors=authors, timeout=timeout)
             log.error('No matches found with query: %r'%query)
@@ -1188,9 +1199,11 @@ class Amazon(Source):
                 testing=testing) for i, url in enumerate(matches)]

         for w in workers:
-            w.start()
             # Don't send all requests at the same time
-            time.sleep(0.1)
+            time.sleep(1)
+            w.start()
+            if abort.is_set():
+                return

         while not abort.is_set():
             a_worker_is_alive = False
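Worker threads are now started a full second apart instead of a tenth of a second, and the stagger loop bails out early if the user aborts. The same pattern, sketched outside calibre with standard-library threading:

import time
from threading import Thread, Event

def start_staggered(workers, abort, delay=1.0):
    # Launch threads with a fixed gap so requests do not land simultaneously,
    # checking the abort flag between launches
    for w in workers:
        time.sleep(delay)
        w.start()
        if abort.is_set():
            return

abort = Event()
workers = [Thread(target=time.sleep, args=(0.1,)) for _ in range(3)]
start_staggered(workers, abort, delay=0.2)  # shorter delay just for the demo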
@@ -1216,6 +1229,8 @@ class Amazon(Source):
                     identifiers=identifiers)
             if abort.is_set():
                 return
+            if abort.is_set():
+                return
             results = []
             while True:
                 try:
@@ -1234,10 +1249,10 @@ class Amazon(Source):

         if abort.is_set():
             return
-        br = self.browser
         log('Downloading cover from:', cached_url)
         try:
-            cdata = br.open_novisit(cached_url, timeout=timeout).read()
+            time.sleep(1)
+            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
             result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
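Because `browser` is now a property that constructs a new browser on each access, `self.browser.open_novisit(...)` fetches the cover under a freshly rotated user agent rather than reusing the one from the search. The per-access behaviour in miniature (illustrative class, not calibre code):

class Client(object):
    _count = 0

    @property
    def browser(self):
        # A new value on every attribute access, as with Amazon.browser
        Client._count += 1
        return 'browser-%d' % Client._count

c = Client()
print(c.browser)  # browser-1
print(c.browser)  # browser-2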
src/calibre/utils/random_ua.py

@@ -78,3 +78,9 @@ def all_user_agents():

 def random_user_agent():
     return random.choice(all_user_agents())
+
+
+def accept_header_for_ua(ua):
+    if 'Firefox/' in ua:
+        return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+    return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
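Firefox of this era omitted `image/webp` from its default Accept header while Chrome included it, so pairing the header with the user agent keeps the request fingerprint self-consistent. Exercising the new helper (the UA strings are abbreviated placeholders):

def accept_header_for_ua(ua):
    # Same logic as the new helper in random_ua.py
    if 'Firefox/' in ua:
        return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

print(accept_header_for_ua('Mozilla/5.0 ... Firefox/59.0'))  # no image/webp
print(accept_header_for_ua('Mozilla/5.0 ... Chrome/65.0'))   # includes image/webp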