Throttle Amazon requests some more

Hopefully this will avoid triggering their new bot detection.
2025-07-09 03:04:10 -04:00 · 2017-03-01 09:11:34 +05:30 · 2017-03-01 09:11:34 +05:30 · 58f17e9589
commit 58f17e9589
parent f143d1095a
2 changed files with 51 additions and 30 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -12,18 +12,22 @@ from threading import Thread
 from Queue import Queue, Empty
-from calibre import as_unicode, browser, random_user_agent
+from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
 from calibre.utils.random_ua import all_user_agents, accept_header_for_ua
 class CaptchaError(Exception):
    """Error type used by the Amazon metadata source.

    NOTE(review): presumably raised when Amazon answers with a CAPTCHA
    challenge instead of the requested page -- confirm against the raise
    sites, which are not visible here.
    """
 ua_index = -1
 def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
@ -104,7 +108,7 @@ class Worker(Thread):  # Get details {{{
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.relevance, self.plugin = relevance, plugin
-        self.browser = browser.clone_browser()
+        self.browser = browser
        self.cover_url = self.amazon_id = self.isbn = None
        self.domain = domain
        from lxml.html import tostring
@ -299,7 +303,6 @@ class Worker(Thread):  # Get details {{{
            self.log.exception('get_details failed for url: %r'%self.url)
    def get_details(self):
        if self.preparsed_root is None:
            raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
        else:
@ -833,14 +836,18 @@ class Amazon(Source):
    @property
    def browser(self):
-        if self._browser is None:
+        global ua_index
-            self._browser = br = browser(user_agent=random_user_agent(allow_ie=False))
+        all_uas = all_user_agents()
-            br.set_handle_gzip(True)
+        ua_index = (ua_index + 1) % len(all_uas)
-            br.addheaders += [
+        ua = all_uas[ua_index]
-                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
+        self._browser = br = browser(user_agent=ua)
-                ('Upgrade-Insecure-Requests', '1'),
+        br.set_handle_gzip(True)
-            ]
+        br.addheaders += [
-        return self._browser.clone_browser()
+            ('Accept', accept_header_for_ua(ua)),
            ('Upgrade-insecure-requests', '1'),
            ('Referer', self.referrer_for_domain()),
        ]
        return br
    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
@ -865,20 +872,23 @@ class Amazon(Source):
                    return domain, val
        return None, None
    def referrer_for_domain(self, domain=None):
        """Return the base URL of the Amazon site for a store code.

        :param domain: Store code such as ``'com'`` or ``'uk'``. When it is
            falsy (``None`` or empty) ``self.domain`` is used instead.
        :return: The site root URL, always ending in a slash.
        """
        store = domain if domain else self.domain
        # Stores whose host name is not simply ``www.amazon.<code>``.
        # NOTE(review): only these three codes are special-cased; confirm
        # whether other stores with multi-part TLDs need an entry too.
        special_cases = (
            ('uk', 'https://www.amazon.co.uk/'),
            ('br', 'https://www.amazon.com.br/'),
            ('au', 'https://www.amazon.com.au/'),
        )
        for code, site in special_cases:
            if store == code:
                return site
        # Every other store code is used verbatim as the TLD.
        return 'https://www.amazon.%s/'%store
    def _get_book_url(self, identifiers):  # {{{
        domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
        if domain and asin:
            url = None
-            if domain == 'com':
+            r = self.referrer_for_domain(domain)
-                url = 'https://amzn.com/'+asin
+            if r is not None:
-            elif domain == 'uk':
+                url = r + 'dp/' + asin
                url = 'https://www.amazon.co.uk/dp/'+asin
            elif domain == 'br':
                url = 'https://www.amazon.com.br/dp/'+asin
            elif domain == 'au':
                url = 'https://www.amazon.com.au/dp/' + asin
            else:
                url = 'https://www.amazon.%s/dp/%s'%(domain, asin)
            if url:
                idtype = 'amazon' if domain == 'com' else 'amazon_'+domain
                return domain, idtype, asin, url
@ -1082,9 +1092,9 @@ class Amazon(Source):
                               ' profiling to block access to its website. As such this metadata plugin is'
                               ' unlikely to ever work reliably.')
-        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Keep only the top 3 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
-        return matches[:5]
+        return matches[:3]
    # }}}
    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@ -1099,9 +1109,11 @@ class Amazon(Source):
        import html5lib
        testing = getattr(self, 'running_a_test', False)
        br = self.browser
        udata = self._get_book_url(identifiers)
        br = self.browser
        if testing:
            print('User-agent:', br.current_user_agent())
        if udata is not None:
            # Try to directly get details page instead of running a search
            domain, idtype, asin, durl = udata
@ -1121,8 +1133,6 @@ class Amazon(Source):
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        if testing:
            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
@ -1179,6 +1189,7 @@ class Amazon(Source):
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                        ' title and authors. Query: %r'%query)
                time.sleep(1)
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
@ -1188,9 +1199,11 @@ class Amazon(Source):
                            testing=testing) for i, url in enumerate(matches)]
        for w in workers:
            w.start()
            # Don't send all requests at the same time
-            time.sleep(0.1)
+            time.sleep(1)
            w.start()
            if abort.is_set():
                return
        while not abort.is_set():
            a_worker_is_alive = False
@ -1216,6 +1229,8 @@ class Amazon(Source):
                    identifiers=identifiers)
            if abort.is_set():
                return
            if abort.is_set():
                return
            results = []
            while True:
                try:
@ -1234,10 +1249,10 @@ class Amazon(Source):
        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
-            cdata = br.open_novisit(cached_url, timeout=timeout).read()
+            time.sleep(1)
            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
--- a/src/calibre/utils/random_ua.py
+++ b/src/calibre/utils/random_ua.py
@ -78,3 +78,9 @@ def all_user_agents():
 def random_user_agent():
    """Return one entry of ``all_user_agents()`` chosen at random."""
    candidates = all_user_agents()
    return random.choice(candidates)
 def accept_header_for_ua(ua):
    """Return the HTTP ``Accept`` header value to send with a user agent.

    :param ua: The user agent string that will be sent with the request.
    :return: The header value without ``image/webp`` when ``ua`` contains
        ``'Firefox/'``; otherwise the value that includes ``image/webp``.
    """
    firefox_accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    default_accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return firefox_accept if 'Firefox/' in ua else default_accept