diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index 122e3ac19b..21bc15d4a4 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -30,9 +30,11 @@ class Worker(Thread): # Get details {{{ Get book details from amazons book page in a separate thread ''' - def __init__(self, url, result_queue, browser, log, relevance, domain, plugin, timeout=20): + def __init__(self, url, result_queue, browser, log, relevance, domain, + plugin, timeout=20, testing=False): Thread.__init__(self) self.daemon = True + self.testing = testing self.url, self.result_queue = url, result_queue self.log, self.timeout = log, timeout self.relevance, self.plugin = relevance, plugin @@ -189,10 +191,9 @@ class Worker(Thread): # Get details {{{ self.log.exception(msg) return + oraw = raw raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] - #open('/t/t.html', 'wb').write(raw) - if '404 - ' in raw: self.log.error('URL malformed: %r'%self.url) return @@ -211,14 +212,20 @@ class Worker(Thread): # Get details {{{ self.log.error(msg) return - self.parse_details(root) + self.parse_details(oraw, root) - def parse_details(self, root): + def parse_details(self, raw, root): try: asin = self.parse_asin(root) except: self.log.exception('Error parsing asin for url: %r'%self.url) asin = None + if self.testing: + import tempfile + with tempfile.NamedTemporaryFile(prefix=asin + '_', + suffix='.html', delete=False) as f: + f.write(raw) + print ('Downloaded html for', asin, 'saved in', f.name) try: title = self.parse_title(root) @@ -310,7 +317,7 @@ class Worker(Thread): # Get details {{{ return l.get('href').rpartition('/')[-1] def parse_title(self, root): - tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0] + tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0] actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]') if actual_title: title = tostring(actual_title[0], encoding=unicode, @@ -320,11 +327,11 @@ class Worker(Thread): # Get details {{{ return re.sub(r'[(\[].*[)\]]', '', title).strip() def parse_authors(self, root): - x = '//h1[@class="parseasinTitle"]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]' + x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]' aname = root.xpath(x) if not aname: aname = root.xpath(''' - //h1[@class="parseasinTitle"]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")] + //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")] ''') for x in aname: x.tail = '' @@ -666,7 +673,8 @@ class Amazon(Source): log.error('No matches found with query: %r'%query) return - workers = [Worker(url, result_queue, br, log, i, domain, self) for i, url in + workers = [Worker(url, result_queue, br, log, i, domain, self, + testing=getattr(self, 'running_a_test', False)) for i, url in enumerate(matches)] for w in workers: @@ -740,16 +748,6 @@ if __name__ == '__main__': # tests {{{ ), - ( # An e-book ISBN not on Amazon, the title/author search matches - # the Kindle edition, which has different markup for ratings and - # isbn - {'identifiers':{'isbn': '9780307459671'}, - 'title':'Invisible Gorilla', 'authors':['Christopher Chabris']}, - [title_test('The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us', - exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])] - - ), - ( # This isbn not on amazon {'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python', 'authors':['Lutz']}, diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 701394e1a5..c79983d928 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -196,6 +196,7 @@ class Source(Plugin): def __init__(self, *args, **kwargs): Plugin.__init__(self, *args, **kwargs) + self.running_a_test = False # Set to True when using identify_test() self._isbn_to_identifier_cache = {} self._identifier_to_cover_url_cache = {} self.cache_lock = threading.RLock() diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py index c55f963003..bccce3dba2 100644 --- a/src/calibre/ebooks/metadata/sources/test.py +++ b/src/calibre/ebooks/metadata/sources/test.py @@ -183,7 +183,11 @@ def test_identify_plugin(name, tests): # {{{ rq = Queue() args = (log, rq, abort) start_time = time.time() - err = plugin.identify(*args, **kwargs) + plugin.running_a_test = True + try: + err = plugin.identify(*args, **kwargs) + finally: + plugin.running_a_test = False total_time = time.time() - start_time times.append(total_time) if err is not None: