mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon metadata download: Fix for change in Amazon website that broke downloading metadata. Also make debugging the plugin a little easier. Fixes #878395 (Amazon ASIN no longer searchable in the ids field)
This commit is contained in:
parent
eda4c65740
commit
594bf0b678
@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{
|
||||
Get book details from amazons book page in a separate thread
|
||||
'''
|
||||
|
||||
def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20):
|
||||
def __init__(self, url, result_queue, browser, log, relevance, domain,
|
||||
plugin, timeout=20, testing=False):
|
||||
Thread.__init__(self)
|
||||
self.daemon = True
|
||||
self.testing = testing
|
||||
self.url, self.result_queue = url, result_queue
|
||||
self.log, self.timeout = log, timeout
|
||||
self.relevance, self.plugin = relevance, plugin
|
||||
@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{
|
||||
self.log.exception(msg)
|
||||
return
|
||||
|
||||
oraw = raw
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
#open('/t/t.html', 'wb').write(raw)
|
||||
|
||||
if '<title>404 - ' in raw:
|
||||
self.log.error('URL malformed: %r'%self.url)
|
||||
return
|
||||
@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{
|
||||
self.log.error(msg)
|
||||
return
|
||||
|
||||
self.parse_details(root)
|
||||
self.parse_details(oraw, root)
|
||||
|
||||
def parse_details(self, root):
|
||||
def parse_details(self, raw, root):
|
||||
try:
|
||||
asin = self.parse_asin(root)
|
||||
except:
|
||||
self.log.exception('Error parsing asin for url: %r'%self.url)
|
||||
asin = None
|
||||
if self.testing:
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(prefix=asin + '_',
|
||||
suffix='.html', delete=False) as f:
|
||||
f.write(raw)
|
||||
print ('Downloaded html for', asin, 'saved in', f.name)
|
||||
|
||||
try:
|
||||
title = self.parse_title(root)
|
||||
@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{
|
||||
return l.get('href').rpartition('/')[-1]
|
||||
|
||||
def parse_title(self, root):
|
||||
tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
|
||||
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
|
||||
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
|
||||
if actual_title:
|
||||
title = tostring(actual_title[0], encoding=unicode,
|
||||
@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{
|
||||
return re.sub(r'[(\[].*[)\]]', '', title).strip()
|
||||
|
||||
def parse_authors(self, root):
|
||||
x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
||||
x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
||||
aname = root.xpath(x)
|
||||
if not aname:
|
||||
aname = root.xpath('''
|
||||
//h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
|
||||
//h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
|
||||
''')
|
||||
for x in aname:
|
||||
x.tail = ''
|
||||
@ -666,7 +673,8 @@ class Amazon(Source):
|
||||
log.error('No matches found with query: %r'%query)
|
||||
return
|
||||
|
||||
workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in
|
||||
workers = [Worker(url, result_queue, br, log, i, domain, self,
|
||||
testing=getattr(self, 'running_a_test', False)) for i, url in
|
||||
enumerate(matches)]
|
||||
|
||||
for w in workers:
|
||||
@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{
|
||||
|
||||
),
|
||||
|
||||
( # An e-book ISBN not on Amazon, the title/author search matches
|
||||
# the Kindle edition, which has different markup for ratings and
|
||||
# isbn
|
||||
{'identifiers':{'isbn': '9780307459671'},
|
||||
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
||||
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
||||
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
|
||||
|
||||
),
|
||||
|
||||
( # This isbn not on amazon
|
||||
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
|
||||
'authors':['Lutz']},
|
||||
|
@ -196,6 +196,7 @@ class Source(Plugin):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
Plugin.__init__(self, *args, **kwargs)
|
||||
self.running_a_test = False # Set to True when using identify_test()
|
||||
self._isbn_to_identifier_cache = {}
|
||||
self._identifier_to_cover_url_cache = {}
|
||||
self.cache_lock = threading.RLock()
|
||||
|
@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{
|
||||
rq = Queue()
|
||||
args = (log, rq, abort)
|
||||
start_time = time.time()
|
||||
plugin.running_a_test = True
|
||||
try:
|
||||
err = plugin.identify(*args, **kwargs)
|
||||
finally:
|
||||
plugin.running_a_test = False
|
||||
total_time = time.time() - start_time
|
||||
times.append(total_time)
|
||||
if err is not None:
|
||||
|
Loading…
x
Reference in New Issue
Block a user