From 18fd3ae063794fc1c6f7932ab166dd137a06aeef Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 27 Apr 2019 12:05:24 +0530
Subject: [PATCH] Amazon metadata: Make title processing more robust

Every title returned by Worker.parse_title() now goes through a shared
sanitize_title() helper, which removes a parenthesised/bracketed span
and, if that leaves nothing, keeps the text before the last '['.

When the page has no h1 with class "parseasinTitle", fall back to the
element whose id attribute is "ebooksTitle", and raise
ValueError('No title block found') if that is absent as well, rather
than indexing an empty XPath result.

Also make it easier to run tests selectively from the command line: the
tests move from the __main__ block into manual_tests(domain, **kw), with
the per-domain test lists stored in an all_tests dict keyed by domain.
---
 src/calibre/ebooks/metadata/sources/amazon.py | 62 ++++++++++++-------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index 44c6f19420..5e2a6ce822 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -475,13 +475,26 @@ class Worker(Thread):  # Get details {{{
         return self.tostring(elem, encoding='unicode', method='text').strip()
 
     def parse_title(self, root):
+
+        def sanitize_title(title):
+            ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
+            if not ans:
+                ans = title.rpartition('[')[0].strip()
+            return ans
+
         h1 = root.xpath('//h1[@id="title"]')
         if h1:
             h1 = h1[0]
             for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
                 h1.remove(child)
-            return self.totext(h1)
-        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
+            return sanitize_title(self.totext(h1))
+        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
+        if not tdiv:
+            span = root.xpath('//*[@id="ebooksTitle"]')
+            if span:
+                return sanitize_title(self.totext(span[0]))
+            raise ValueError('No title block found')
+        tdiv = tdiv[0]
         actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
         if actual_title:
             title = self.tostring(actual_title[0], encoding='unicode',
@@ -489,10 +502,7 @@ class Worker(Thread):  # Get details {{{
         else:
             title = self.tostring(tdiv, encoding='unicode',
                                   method='text').strip()
-        ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
-        if not ans:
-            ans = title.rpartition('[')[0].strip()
-        return ans
+        return sanitize_title(title)
 
     def parse_authors(self, root):
         for sel in (
@@ -851,7 +861,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):
 
     name = 'Amazon.com'
-    version = (1, 2, 6)
+    version = (1, 2, 7)
     minimum_calibre_version = (2, 82, 0)
     description = _('Downloads metadata and covers from Amazon')
 
@@ -1480,12 +1490,13 @@ class Amazon(Source):
     # }}}
 
 
-if __name__ == '__main__':  # tests {{{
-    # To run these test use: calibre-debug
-    # src/calibre/ebooks/metadata/sources/amazon.py
+def manual_tests(domain, **kw):  # {{{
+    # To run these tests use:
+    # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
     from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
                                                       isbn_test, title_test, authors_test, comments_test, series_test)
-    com_tests = [  # {{{
+    all_tests = {}
+    all_tests['com'] = [  # {{{
 
         (  # Paperback with series
             {'identifiers': {'amazon': '1423146786'}},
@@ -1533,7 +1544,7 @@ if __name__ == '__main__':  # tests {{{
 
     # }}}
 
-    de_tests = [  # {{{
+    all_tests['de'] = [  # {{{
         (
             {'identifiers': {'isbn': '9783453314979'}},
             [title_test('Die letzten Wächter: Roman',
@@ -1551,7 +1562,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    it_tests = [  # {{{
+    all_tests['it'] = [  # {{{
         (
             {'identifiers': {'isbn': '8838922195'}},
             [title_test('La briscola in cinque',
@@ -1561,7 +1572,13 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    fr_tests = [  # {{{
+    all_tests['fr'] = [  # {{{
+        (
+            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
+            [title_test('Le secret de Lola', exact=True),
+             authors_test(['Amélie BRIZIO'])
+             ]
+        ),
         (
             {'identifiers': {'isbn': '2221116798'}},
             [title_test('L\'étrange voyage de Monsieur Daldry',
@@ -1571,7 +1588,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    es_tests = [  # {{{
+    all_tests['es'] = [  # {{{
         (
             {'identifiers': {'isbn': '8483460831'}},
             [title_test('Tiempos Interesantes',
@@ -1581,7 +1598,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    jp_tests = [  # {{{
+    all_tests['jp'] = [  # {{{
         (  # Adult filtering test
             {'identifiers': {'isbn': '4799500066'}},
             [title_test(u'Bitch Trap'), ]
@@ -1600,7 +1617,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    br_tests = [  # {{{
+    all_tests['br'] = [  # {{{
         (
             {'title': 'Guerra dos Tronos'},
             [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
@@ -1610,7 +1627,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    nl_tests = [  # {{{
+    all_tests['nl'] = [  # {{{
         (
             {'title': 'Freakonomics'},
             [title_test('Freakonomics',
@@ -1620,7 +1637,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    cn_tests = [  # {{{
+    all_tests['cn'] = [  # {{{
         (
             {'identifiers': {'isbn': '9787115369512'}},
             [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
@@ -1635,7 +1652,7 @@ if __name__ == '__main__':  # tests {{{
         ),
     ]  # }}}
 
-    ca_tests = [  # {{{
+    all_tests['ca'] = [  # {{{
         (  # Paperback with series
             {'identifiers': {'isbn': '9781623808747'}},
             [title_test('Parting Shot', exact=True),
@@ -1655,7 +1672,7 @@ if __name__ == '__main__':  # tests {{{
     ]  # }}}
 
     def do_test(domain, start=0, stop=None, server='auto'):
-        tests = globals().get(domain + '_tests')
+        tests = all_tests[domain]
         if stop is None:
             stop = len(tests)
         tests = tests[start:stop]
@@ -1665,6 +1682,5 @@ if __name__ == '__main__':  # tests {{{
             setattr(p, 'testing_server', server),
         ))
 
-    do_test('com')
-    # do_test('de')
+    do_test(domain, **kw)
 # }}}