diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 122e3ac19b..21bc15d4a4 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{
Get book details from amazons book page in a separate thread
'''
- def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20):
+ def __init__(self, url, result_queue, browser, log, relevance, domain,
+ plugin, timeout=20, testing=False):
Thread.__init__(self)
self.daemon = True
+ self.testing = testing
self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout
self.relevance, self.plugin = relevance, plugin
@@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{
self.log.exception(msg)
return
+ oraw = raw
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
- #open('/t/t.html', 'wb').write(raw)
-
if '
404 - ' in raw:
self.log.error('URL malformed: %r'%self.url)
return
@@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{
self.log.error(msg)
return
- self.parse_details(root)
+ self.parse_details(oraw, root)
- def parse_details(self, root):
+ def parse_details(self, raw, root):
try:
asin = self.parse_asin(root)
except:
self.log.exception('Error parsing asin for url: %r'%self.url)
asin = None
+ if self.testing:
+ import tempfile
+ with tempfile.NamedTemporaryFile(prefix=asin + '_',
+ suffix='.html', delete=False) as f:
+ f.write(raw)
+ print ('Downloaded html for', asin, 'saved in', f.name)
try:
title = self.parse_title(root)
@@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{
return l.get('href').rpartition('/')[-1]
def parse_title(self, root):
- tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
+ tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
if actual_title:
title = tostring(actual_title[0], encoding=unicode,
@@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{
return re.sub(r'[(\[].*[)\]]', '', title).strip()
def parse_authors(self, root):
- x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
+ x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
aname = root.xpath(x)
if not aname:
aname = root.xpath('''
- //h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
+ //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
''')
for x in aname:
x.tail = ''
@@ -666,7 +673,8 @@ class Amazon(Source):
log.error('No matches found with query: %r'%query)
return
- workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in
+ workers = [Worker(url, result_queue, br, log, i, domain, self,
+ testing=getattr(self, 'running_a_test', False)) for i, url in
enumerate(matches)]
for w in workers:
@@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{
),
- ( # An e-book ISBN not on Amazon, the title/author search matches
- # the Kindle edition, which has different markup for ratings and
- # isbn
- {'identifiers':{'isbn': '9780307459671'},
- 'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
- [title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us',
- exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
-
- ),
-
( # This isbn not on amazon
{'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
'authors':['Lutz']},
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 701394e1a5..c79983d928 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -196,6 +196,7 @@ class Source(Plugin):
def __init__(self, *args, **kwargs):
Plugin.__init__(self, *args, **kwargs)
+ self.running_a_test = False # Set to True when using identify_test()
self._isbn_to_identifier_cache = {}
self._identifier_to_cover_url_cache = {}
self.cache_lock = threading.RLock()
diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py
index c55f963003..bccce3dba2 100644
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{
rq = Queue()
args = (log, rq, abort)
start_time = time.time()
- err = plugin.identify(*args, **kwargs)
+ plugin.running_a_test = True
+ try:
+ err = plugin.identify(*args, **kwargs)
+ finally:
+ plugin.running_a_test = False
total_time = time.time() - start_time
times.append(total_time)
if err is not None: