Add a config option to allow the user to control which servers to query for Amazon metadata

Kovid Goyal 2017-03-06 10:33:52 +05:30
parent 0030c69626
commit e2bb9b1508


@@ -28,7 +28,6 @@ class SearchFailed(ValueError):
 ua_index = -1
-USE_SEARCH_ENGINE = True

 def parse_details_page(url, log, timeout, browser, domain):
@@ -844,10 +843,25 @@ class Amazon(Source):
         'ca': _('Canada'),
     }
+    SERVERS = {
+        'auto': _('Choose server automatically'),
+        'amazon': _('Amazon servers'),
+        'bing': _('Bing search cache'),
+        'google': _('Google search cache'),
+        'wayback': _('Wayback machine cache (slow)'),
+    }

     options = (
-        Option('domain', 'choices', 'com', _('Amazon website to use:'),
+        Option('domain', 'choices', 'com', _('Amazon country website to use:'),
               _('Metadata from Amazon will be fetched using this '
                 'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
+        Option('server', 'choices', 'auto', _('Server to get data from:'),
+              _(
+                  'Amazon has started blocking attempts to download'
+                  ' metadata from its servers. To get around this problem,'
+                  ' calibre can fetch the Amazon data from many different'
+                  ' places where it is cached. Choose the source you prefer.'
+              ), choices=SERVERS),
     )

     def __init__(self, *args, **kwargs):
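Editor's note: a quick reference for how each value of the new 'server' option behaves once the rest of this commit is applied. This is an illustrative sketch, not calibre code; the mapping is taken from the dispatch added in the later hunks.

# Value      Effect (per the code added later in this commit)
# 'auto'     use a search-engine cache (currently the Bing cache)
# 'amazon'   query Amazon's own servers directly (may be blocked)
# 'bing'     Bing search cache
# 'google'   Google search cache
# 'wayback'  Wayback machine cache, located via DuckDuckGo (slow)
SERVER_EFFECTS = {
    'auto': 'bing cache',
    'amazon': 'direct',
    'bing': 'bing cache',
    'google': 'google cache',
    'wayback': 'wayback cache via DuckDuckGo',
}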
@@ -873,7 +887,7 @@ class Amazon(Source):
     @property
     def browser(self):
         global ua_index
-        if USE_SEARCH_ENGINE:
+        if self.use_search_engine:
             if self._browser is None:
                 ua = random_user_agent(allow_ie=False)
                 self._browser = br = browser(user_agent=ua)
@@ -963,6 +977,20 @@ class Amazon(Source):
         return domain

+    @property
+    def server(self):
+        x = getattr(self, 'testing_server', None)
+        if x is not None:
+            return x
+        server = self.prefs['server']
+        if server not in self.SERVERS:
+            server = 'auto'
+        return server
+
+    @property
+    def use_search_engine(self):
+        return self.server != 'amazon'
+
     def clean_downloaded_metadata(self, mi):
         docase = (
             mi.language == 'eng' or
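Editor's note: the two properties above are the heart of the change. Below is a minimal standalone sketch (not calibre code; the names resolve_server and use_search_engine here are illustrative) of how the saved preference resolves to a server choice.

# Standalone sketch of the resolution logic added above (illustrative only).
SERVERS = ('auto', 'amazon', 'bing', 'google', 'wayback')

def resolve_server(prefs, testing_server=None):
    # A value injected by the test harness wins over the saved preference.
    if testing_server is not None:
        return testing_server
    server = prefs.get('server', 'auto')
    # Unknown or stale values fall back to automatic selection.
    return server if server in SERVERS else 'auto'

def use_search_engine(prefs, testing_server=None):
    # Only the 'amazon' choice talks to Amazon directly; every other choice
    # routes the query through a search-engine cache.
    return resolve_server(prefs, testing_server) != 'amazon'

print(resolve_server({'server': 'google'}))  # -> 'google'
print(resolve_server({'server': 'msn'}))     # -> 'auto'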
@@ -1223,8 +1251,14 @@ class Amazon(Source):
             domain)[len('https://'):].partition('/')[0]
         matches = []
         se = search_engines_module()
-        urlproc = se.bing_url_processor
-        results, qurl = se.bing_search(terms, site, log=log, br=br, timeout=timeout)
+        server = self.server
+        if server in ('auto', 'bing'):
+            urlproc, sfunc = se.bing_url_processor, se.bing_search
+        elif server == 'google':
+            urlproc, sfunc = se.google_url_processor, se.google_search
+        elif server == 'wayback':
+            urlproc, sfunc = se.wayback_url_processor, se.ddg_search
+        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
         br.set_current_header('Referer', qurl)
         for result in results:
             if abort.is_set():
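Editor's note: a condensed view of the dispatch introduced in this hunk, assuming only the search_engines functions referenced in the diff above; the helper name pick_engine is illustrative.

# Sketch: map the configured server to a (URL processor, search function) pair.
def pick_engine(se, server):
    if server in ('auto', 'bing'):
        return se.bing_url_processor, se.bing_search
    if server == 'google':
        return se.google_url_processor, se.google_search
    if server == 'wayback':
        # Wayback pages are found via a DuckDuckGo query, then rewritten
        # to point at the archived copy.
        return se.wayback_url_processor, se.ddg_search
    raise ValueError('unknown server: %r' % server)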
@@ -1264,7 +1298,7 @@ class Amazon(Source):
         log('User-agent:', br.current_user_agent())
         if testing:
             print('User-agent:', br.current_user_agent())
-        if udata is not None and not USE_SEARCH_ENGINE:
+        if udata is not None and not self.use_search_engine:
             # Try to directly get details page instead of running a search
             # Cannot use search engine as the directly constructed URL is
             # usually redirected to a full URL by amazon, and is therefore
@@ -1284,7 +1318,7 @@ class Amazon(Source):
                 except Exception:
                     log.exception(
                         'get_details failed for url: %r' % durl)
-        func = self.search_search_engine if USE_SEARCH_ENGINE else self.search_amazon
+        func = self.search_search_engine if self.use_search_engine else self.search_amazon
         try:
             matches, query, domain, cover_url_processor = func(
                 br, testing, log, abort, title, authors, identifiers, timeout)
@@ -1360,7 +1394,7 @@ class Amazon(Source):
             return
         log('Downloading cover from:', cached_url)
         br = self.browser
-        if USE_SEARCH_ENGINE:
+        if self.use_search_engine:
             br = br.clone_browser()
             br.set_current_header('Referer', self.referrer_for_domain(self.domain))
         try:
@@ -1437,13 +1471,6 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]
-    if not USE_SEARCH_ENGINE:
-        com_tests.append(
-            (  # A kindle edition that does not appear in the search results when searching by ASIN
-                {'identifiers': {'amazon': 'B004JHY6OG'}},
-                [title_test(
-                    'The Heroes: A First Law Novel (First Law World 2)', exact=True)]
-            ))

     # }}}
@@ -1568,13 +1595,16 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}

-    def do_test(domain, start=0, stop=None):
+    def do_test(domain, start=0, stop=None, server='auto'):
         tests = globals().get(domain + '_tests')
         if stop is None:
             stop = len(tests)
         tests = tests[start:stop]
-        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda
-                             p: (setattr(p, 'testing_domain', domain), setattr(p, 'touched_fields', p.touched_fields - {'tags'})))
+        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (
+            setattr(p, 'testing_domain', domain),
+            setattr(p, 'touched_fields', p.touched_fields - {'tags'}),
+            setattr(p, 'testing_server', server),
+        ))

     do_test('com')
     # do_test('de')
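Editor's note: with the new server parameter the test driver can exercise a specific cache. For example (a usage sketch based on the signature added above):

do_test('com', server='google')    # run the .com tests via the Google cache
do_test('com', server='wayback')   # or via the Wayback machine cache (slow)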