mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use a single user agent string for amazon
Amazon returns CAPTCHA pages based on user agent sniffing, so use a common user agent (IE 11)
This commit is contained in:
parent
68cc6ae288
commit
19c8784ad9
@ -12,13 +12,16 @@ from threading import Thread
|
|||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
|
|
||||||
|
|
||||||
from calibre import as_unicode, random_user_agent
|
from calibre import as_unicode
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn
|
||||||
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
|
from calibre.ebooks.metadata.sources.base import (Source, Option, fixcase,
|
||||||
fixauthors)
|
fixauthors)
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.utils.localization import canonicalize_lang
|
from calibre.utils.localization import canonicalize_lang
|
||||||
|
|
||||||
|
class CaptchaError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
def parse_details_page(url, log, timeout, browser, domain):
|
def parse_details_page(url, log, timeout, browser, domain):
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
@ -299,6 +302,8 @@ class Worker(Thread): # Get details {{{
|
|||||||
|
|
||||||
def parse_details(self, raw, root):
|
def parse_details(self, raw, root):
|
||||||
asin = parse_asin(root, self.log, self.url)
|
asin = parse_asin(root, self.log, self.url)
|
||||||
|
if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
|
||||||
|
raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
|
||||||
if self.testing:
|
if self.testing:
|
||||||
import tempfile, uuid
|
import tempfile, uuid
|
||||||
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
|
with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
|
||||||
@ -764,9 +769,7 @@ class Amazon(Source):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def user_agent(self):
|
def user_agent(self):
|
||||||
# Pass in an index to random_user_agent() to test with a particular
|
return 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'
|
||||||
# user agent
|
|
||||||
return random_user_agent()
|
|
||||||
|
|
||||||
def save_settings(self, *args, **kwargs):
|
def save_settings(self, *args, **kwargs):
|
||||||
Source.save_settings(self, *args, **kwargs)
|
Source.save_settings(self, *args, **kwargs)
|
||||||
@ -985,6 +988,8 @@ class Amazon(Source):
|
|||||||
url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url)
|
url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url)
|
||||||
matches.append(url)
|
matches.append(url)
|
||||||
break
|
break
|
||||||
|
if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
|
||||||
|
raise CaptchaError('Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
|
||||||
|
|
||||||
# Keep only the top 5 matches as the matches are sorted by relevance by
|
# Keep only the top 5 matches as the matches are sorted by relevance by
|
||||||
# Amazon so lower matches are not likely to be very relevant
|
# Amazon so lower matches are not likely to be very relevant
|
||||||
|
Loading…
x
Reference in New Issue
Block a user