Add a config option to allow the user to control which servers to query for amazon metadata

This commit is contained in:
Kovid Goyal 2017-03-06 10:33:52 +05:30
parent 0030c69626
commit e2bb9b1508

View File

@ -28,7 +28,6 @@ class SearchFailed(ValueError):
ua_index = -1
USE_SEARCH_ENGINE = True
def parse_details_page(url, log, timeout, browser, domain):
@ -844,10 +843,25 @@ class Amazon(Source):
'ca': _('Canada'),
}
SERVERS = {
'auto': _('Choose server automatically'),
'amazon': _('Amazon servers'),
'bing': _('Bing search cache'),
'google': _('Google search cache'),
'wayback': _('Wayback machine cache (slow)'),
}
options = (
Option('domain', 'choices', 'com', _('Amazon website to use:'),
Option('domain', 'choices', 'com', _('Amazon country website to use:'),
_('Metadata from Amazon will be fetched using this '
'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
Option('server', 'choices', 'auto', _('Server to get data from:'),
_(
'Amazon has started blocking attempts to download'
' metadata from its servers. To get around this problem,'
' calibre can fetch the Amazon data from many different'
' places where it is cached. Choose the source you prefer.'
), choices=SERVERS),
)
def __init__(self, *args, **kwargs):
@ -873,7 +887,7 @@ class Amazon(Source):
@property
def browser(self):
global ua_index
if USE_SEARCH_ENGINE:
if self.use_search_engine:
if self._browser is None:
ua = random_user_agent(allow_ie=False)
self._browser = br = browser(user_agent=ua)
@ -963,6 +977,20 @@ class Amazon(Source):
return domain
@property
def server(self):
x = getattr(self, 'testing_server', None)
if x is not None:
return x
server = self.prefs['server']
if server not in self.SERVERS:
server = 'auto'
return server
@property
def use_search_engine(self):
return self.server != 'amazon'
def clean_downloaded_metadata(self, mi):
docase = (
mi.language == 'eng' or
@ -1223,8 +1251,14 @@ class Amazon(Source):
domain)[len('https://'):].partition('/')[0]
matches = []
se = search_engines_module()
urlproc = se.bing_url_processor
results, qurl = se.bing_search(terms, site, log=log, br=br, timeout=timeout)
server = self.server
if server in ('auto', 'bing'):
urlproc, sfunc = se.bing_url_processor, se.bing_search
elif server == 'google':
urlproc, sfunc = se.google_url_processor, se.google_search
elif server == 'wayback':
urlproc, sfunc = se.wayback_url_processor, se.ddg_search
results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
br.set_current_header('Referer', qurl)
for result in results:
if abort.is_set():
@ -1264,7 +1298,7 @@ class Amazon(Source):
log('User-agent:', br.current_user_agent())
if testing:
print('User-agent:', br.current_user_agent())
if udata is not None and not USE_SEARCH_ENGINE:
if udata is not None and not self.use_search_engine:
# Try to directly get details page instead of running a search
# Cannot use search engine as the directly constructed URL is
# usually redirected to a full URL by amazon, and is therefore
@ -1284,7 +1318,7 @@ class Amazon(Source):
except Exception:
log.exception(
'get_details failed for url: %r' % durl)
func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
func = self.search_search_engine if self.use_search_engine else self.search_amazon
try:
matches, query, domain, cover_url_processor = func(
br, testing, log, abort, title, authors, identifiers, timeout)
@ -1360,7 +1394,7 @@ class Amazon(Source):
return
log('Downloading cover from:', cached_url)
br = self.browser
if USE_SEARCH_ENGINE:
if self.use_search_engine:
br = br.clone_browser()
br.set_current_header('Referer', self.referrer_for_domain(self.domain))
try:
@ -1437,13 +1471,6 @@ if __name__ == '__main__': # tests {{{
),
]
if not USE_SEARCH_ENGINE:
com_tests.append(
( # A kindle edition that does not appear in the search results when searching by ASIN
{'identifiers': {'amazon': 'B004JHY6OG'}},
[title_test(
'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
))
# }}}
@ -1568,13 +1595,16 @@ if __name__ == '__main__': # tests {{{
),
] # }}}
def do_test(domain, start=0, stop=None):
def do_test(domain, start=0, stop=None, server='auto'):
tests = globals().get(domain + '_tests')
if stop is None:
stop = len(tests)
tests = tests[start:stop]
test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
p: (setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'})))
test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (
setattr(p, 'testing_domain', domain),
setattr(p, 'touched_fields', p.touched_fields - {'tags'}),
setattr(p, 'testing_server', server),
))
do_test('com')
# do_test('de')