mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon metadata download: Fix for change in Amazon website that broke downloading metadata. Also make debugging the plugin a little easier. Fixes #878395 (Amazon ASIN no longer searchable in the ids field)
This commit is contained in:
parent
eda4c65740
commit
594bf0b678
@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{
|
|||||||
Get book details from amazons book page in a separate thread
|
Get book details from amazons book page in a separate thread
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20):
|
def __init__(self, url, result_queue, browser, log, relevance, domain,
|
||||||
|
plugin, timeout=20, testing=False):
|
||||||
Thread.__init__(self)
|
Thread.__init__(self)
|
||||||
self.daemon = True
|
self.daemon = True
|
||||||
|
self.testing = testing
|
||||||
self.url, self.result_queue = url, result_queue
|
self.url, self.result_queue = url, result_queue
|
||||||
self.log, self.timeout = log, timeout
|
self.log, self.timeout = log, timeout
|
||||||
self.relevance, self.plugin = relevance, plugin
|
self.relevance, self.plugin = relevance, plugin
|
||||||
@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.log.exception(msg)
|
self.log.exception(msg)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
oraw = raw
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0]
|
resolve_entities=True)[0]
|
||||||
#open('/t/t.html', 'wb').write(raw)
|
|
||||||
|
|
||||||
if '<title>404 - ' in raw:
|
if '<title>404 - ' in raw:
|
||||||
self.log.error('URL malformed: %r'%self.url)
|
self.log.error('URL malformed: %r'%self.url)
|
||||||
return
|
return
|
||||||
@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{
|
|||||||
self.log.error(msg)
|
self.log.error(msg)
|
||||||
return
|
return
|
||||||
|
|
||||||
self.parse_details(root)
|
self.parse_details(oraw, root)
|
||||||
|
|
||||||
def parse_details(self, root):
|
def parse_details(self, raw, root):
|
||||||
try:
|
try:
|
||||||
asin = self.parse_asin(root)
|
asin = self.parse_asin(root)
|
||||||
except:
|
except:
|
||||||
self.log.exception('Error parsing asin for url: %r'%self.url)
|
self.log.exception('Error parsing asin for url: %r'%self.url)
|
||||||
asin = None
|
asin = None
|
||||||
|
if self.testing:
|
||||||
|
import tempfile
|
||||||
|
with tempfile.NamedTemporaryFile(prefix=asin + '_',
|
||||||
|
suffix='.html', delete=False) as f:
|
||||||
|
f.write(raw)
|
||||||
|
print ('Downloaded html for', asin, 'saved in', f.name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title = self.parse_title(root)
|
title = self.parse_title(root)
|
||||||
@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
return l.get('href').rpartition('/')[-1]
|
return l.get('href').rpartition('/')[-1]
|
||||||
|
|
||||||
def parse_title(self, root):
|
def parse_title(self, root):
|
||||||
tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
|
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
|
||||||
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
|
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
|
||||||
if actual_title:
|
if actual_title:
|
||||||
title = tostring(actual_title[0], encoding=unicode,
|
title = tostring(actual_title[0], encoding=unicode,
|
||||||
@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{
|
|||||||
return re.sub(r'[(\[].*[)\]]', '', title).strip()
|
return re.sub(r'[(\[].*[)\]]', '', title).strip()
|
||||||
|
|
||||||
def parse_authors(self, root):
|
def parse_authors(self, root):
|
||||||
x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
|
||||||
aname = root.xpath(x)
|
aname = root.xpath(x)
|
||||||
if not aname:
|
if not aname:
|
||||||
aname = root.xpath('''
|
aname = root.xpath('''
|
||||||
//h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
|
//h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
|
||||||
''')
|
''')
|
||||||
for x in aname:
|
for x in aname:
|
||||||
x.tail = ''
|
x.tail = ''
|
||||||
@ -666,7 +673,8 @@ class Amazon(Source):
|
|||||||
log.error('No matches found with query: %r'%query)
|
log.error('No matches found with query: %r'%query)
|
||||||
return
|
return
|
||||||
|
|
||||||
workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in
|
workers = [Worker(url, result_queue, br, log, i, domain, self,
|
||||||
|
testing=getattr(self, 'running_a_test', False)) for i, url in
|
||||||
enumerate(matches)]
|
enumerate(matches)]
|
||||||
|
|
||||||
for w in workers:
|
for w in workers:
|
||||||
@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
( # An e-book ISBN not on Amazon, the title/author search matches
|
|
||||||
# the Kindle edition, which has different markup for ratings and
|
|
||||||
# isbn
|
|
||||||
{'identifiers':{'isbn': '9780307459671'},
|
|
||||||
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
|
|
||||||
[title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
|
|
||||||
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
|
|
||||||
|
|
||||||
),
|
|
||||||
|
|
||||||
( # This isbn not on amazon
|
( # This isbn not on amazon
|
||||||
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
|
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
|
||||||
'authors':['Lutz']},
|
'authors':['Lutz']},
|
||||||
|
@ -196,6 +196,7 @@ class Source(Plugin):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
Plugin.__init__(self, *args, **kwargs)
|
Plugin.__init__(self, *args, **kwargs)
|
||||||
|
self.running_a_test = False # Set to True when using identify_test()
|
||||||
self._isbn_to_identifier_cache = {}
|
self._isbn_to_identifier_cache = {}
|
||||||
self._identifier_to_cover_url_cache = {}
|
self._identifier_to_cover_url_cache = {}
|
||||||
self.cache_lock = threading.RLock()
|
self.cache_lock = threading.RLock()
|
||||||
|
@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{
|
|||||||
rq = Queue()
|
rq = Queue()
|
||||||
args = (log, rq, abort)
|
args = (log, rq, abort)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
err = plugin.identify(*args, **kwargs)
|
plugin.running_a_test = True
|
||||||
|
try:
|
||||||
|
err = plugin.identify(*args, **kwargs)
|
||||||
|
finally:
|
||||||
|
plugin.running_a_test = False
|
||||||
total_time = time.time() - start_time
|
total_time = time.time() - start_time
|
||||||
times.append(total_time)
|
times.append(total_time)
|
||||||
if err is not None:
|
if err is not None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user