Throttle Amazon requests some more

Hopefully this will avoid triggering Amazon's new bot detection.
2025-08-30 23:00:21 -04:00 · 2017-03-01 09:11:34 +05:30 · 2017-03-01 09:11:34 +05:30 · 58f17e9589
commit 58f17e9589
parent f143d1095a
2 changed files with 51 additions and 30 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -12,18 +12,22 @@ from threading import Thread
 from Queue import Queue, Empty


-from calibre import as_unicode, browser, random_user_agent
+from calibre import as_unicode, browser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
        fixauthors)
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.random_ua import all_user_agents, accept_header_for_ua


 class CaptchaError(Exception):
    pass


+ua_index = -1
+
+
 def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
@ -104,7 +108,7 @@ class Worker(Thread):  # Get details {{{
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.relevance, self.plugin = relevance, plugin
-        self.browser = browser.clone_browser()
+        self.browser = browser
        self.cover_url = self.amazon_id = self.isbn = None
        self.domain = domain
        from lxml.html import tostring
@ -299,7 +303,6 @@ class Worker(Thread):  # Get details {{{
            self.log.exception('get_details failed for url: %r'%self.url)

    def get_details(self):
-
        if self.preparsed_root is None:
            raw, root, selector = parse_details_page(self.url, self.log, self.timeout, self.browser, self.domain)
        else:
@ -833,14 +836,18 @@ class Amazon(Source):

    @property
    def browser(self):
-        if self._browser is None:
-            self._browser = br = browser(user_agent=random_user_agent(allow_ie=False))
-            br.set_handle_gzip(True)
-            br.addheaders += [
-                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
-                ('Upgrade-Insecure-Requests', '1'),
-            ]
-        return self._browser.clone_browser()
+        global ua_index
+        all_uas = all_user_agents()
+        ua_index = (ua_index + 1) % len(all_uas)
+        ua = all_uas[ua_index]
+        self._browser = br = browser(user_agent=ua)
+        br.set_handle_gzip(True)
+        br.addheaders += [
+            ('Accept', accept_header_for_ua(ua)),
+            ('Upgrade-insecure-requests', '1'),
+            ('Referer', self.referrer_for_domain()),
+        ]
+        return br

    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
@ -865,20 +872,23 @@ class Amazon(Source):
                    return domain, val
        return None, None

+    def referrer_for_domain(self, domain=None):
+        domain = domain or self.domain
+        if domain == 'uk':
+            return 'https://www.amazon.co.uk/'
+        if domain == 'br':
+            return 'https://www.amazon.com.br/'
+        if domain == 'au':
+            return 'https://www.amazon.com.au/'
+        return 'https://www.amazon.%s/'%domain
+
    def _get_book_url(self, identifiers):  # {{{
        domain, asin = self.get_domain_and_asin(identifiers, extra_domains=('in', 'au', 'ca'))
        if domain and asin:
            url = None
-            if domain == 'com':
-                url = 'https://amzn.com/'+asin
-            elif domain == 'uk':
-                url = 'https://www.amazon.co.uk/dp/'+asin
-            elif domain == 'br':
-                url = 'https://www.amazon.com.br/dp/'+asin
-            elif domain == 'au':
-                url = 'https://www.amazon.com.au/dp/' + asin
-            else:
-                url = 'https://www.amazon.%s/dp/%s'%(domain, asin)
+            r = self.referrer_for_domain(domain)
+            if r is not None:
+                url = r + 'dp/' + asin
            if url:
                idtype = 'amazon' if domain == 'com' else 'amazon_'+domain
                return domain, idtype, asin, url
@ -1082,9 +1092,9 @@ class Amazon(Source):
                               ' profiling to block access to its website. As such this metadata plugin is'
                               ' unlikely to ever work reliably.')

-        # Keep only the top 5 matches as the matches are sorted by relevance by
+        # Keep only the top 3 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
-        return matches[:5]
+        return matches[:3]
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
@ -1099,9 +1109,11 @@ class Amazon(Source):
        import html5lib

        testing = getattr(self, 'running_a_test', False)
-        br = self.browser

        udata = self._get_book_url(identifiers)
+        br = self.browser
+        if testing:
+            print('User-agent:', br.current_user_agent())
        if udata is not None:
            # Try to directly get details page instead of running a search
            domain, idtype, asin, durl = udata
@ -1121,8 +1133,6 @@ class Amazon(Source):
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
-        if testing:
-            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
@ -1179,6 +1189,7 @@ class Amazon(Source):
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                        ' title and authors. Query: %r'%query)
+                time.sleep(1)
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
@ -1188,9 +1199,11 @@ class Amazon(Source):
                            testing=testing) for i, url in enumerate(matches)]

        for w in workers:
-            w.start()
            # Don't send all requests at the same time
-            time.sleep(0.1)
+            time.sleep(1)
+            w.start()
+            if abort.is_set():
+                return

        while not abort.is_set():
            a_worker_is_alive = False
@ -1216,6 +1229,8 @@ class Amazon(Source):
                    identifiers=identifiers)
            if abort.is_set():
                return
+            if abort.is_set():
+                return
            results = []
            while True:
                try:
@ -1234,10 +1249,10 @@ class Amazon(Source):

        if abort.is_set():
            return
-        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
-            cdata = br.open_novisit(cached_url, timeout=timeout).read()
+            time.sleep(1)
+            cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
--- a/src/calibre/utils/random_ua.py
+++ b/src/calibre/utils/random_ua.py
@ -78,3 +78,9 @@ def all_user_agents():

 def random_user_agent():
    return random.choice(all_user_agents())
+
+
+def accept_header_for_ua(ua):
+    if 'Firefox/' in ua:
+        return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+    return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'