Amazon metadata: Make title processing more robust

Also make it easier to run tests selectively from the command line
This commit is contained in:
Kovid Goyal 2019-04-27 12:05:24 +05:30
parent c485690a23
commit 18fd3ae063
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -475,13 +475,26 @@ class Worker(Thread): # Get details {{{
return self.tostring(elem, encoding='unicode', method='text').strip() return self.tostring(elem, encoding='unicode', method='text').strip()
def parse_title(self, root): def parse_title(self, root):
def sanitize_title(title):
    """Strip bracketed annotations such as "(French Edition)" from a title.

    :param title: the raw title text extracted from the page
    :return: the cleaned title with surrounding whitespace removed
    """
    # The match is greedy: everything from the first opening bracket to
    # the last closing bracket is dropped in a single pass.
    ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
    if not ans:
        # The greedy match consumed the whole title; keep whatever
        # precedes the last '[' instead.
        ans = title.rpartition('[')[0].strip()
    if not ans:
        # Nothing precedes a '[' either (e.g. "(Untitled)" or "[Foo]"):
        # a bracketed title is more useful than an empty one.
        ans = title.strip()
    return ans
h1 = root.xpath('//h1[@id="title"]') h1 = root.xpath('//h1[@id="title"]')
if h1: if h1:
h1 = h1[0] h1 = h1[0]
for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'): for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
h1.remove(child) h1.remove(child)
return self.totext(h1) return sanitize_title(self.totext(h1))
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0] tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
if not tdiv:
span = root.xpath('//*[id="ebooksTitle"]')
if span:
return sanitize_title(self.totext(span[0]))
raise ValueError('No title block found')
tdiv = tdiv[0]
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]') actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
if actual_title: if actual_title:
title = self.tostring(actual_title[0], encoding='unicode', title = self.tostring(actual_title[0], encoding='unicode',
@ -489,10 +502,7 @@ class Worker(Thread): # Get details {{{
else: else:
title = self.tostring(tdiv, encoding='unicode', title = self.tostring(tdiv, encoding='unicode',
method='text').strip() method='text').strip()
ans = re.sub(r'[(\[].*[)\]]', '', title).strip() return sanitize_title(title)
if not ans:
ans = title.rpartition('[')[0].strip()
return ans
def parse_authors(self, root): def parse_authors(self, root):
for sel in ( for sel in (
@ -851,7 +861,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 2, 6) version = (1, 2, 7)
minimum_calibre_version = (2, 82, 0) minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')
@ -1480,12 +1490,13 @@ class Amazon(Source):
# }}} # }}}
if __name__ == '__main__': # tests {{{ def manual_tests(domain, **kw): # {{{
# To run these tests use: calibre-debug # To run these tests use:
# src/calibre/ebooks/metadata/sources/amazon.py # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
from calibre.ebooks.metadata.sources.test import (test_identify_plugin, from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
isbn_test, title_test, authors_test, comments_test, series_test) isbn_test, title_test, authors_test, comments_test, series_test)
com_tests = [ # {{{ all_tests = {}
all_tests['com'] = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers': {'amazon': '1423146786'}}, {'identifiers': {'amazon': '1423146786'}},
@ -1533,7 +1544,7 @@ if __name__ == '__main__': # tests {{{
# }}} # }}}
de_tests = [ # {{{ all_tests['de'] = [ # {{{
( (
{'identifiers': {'isbn': '9783453314979'}}, {'identifiers': {'isbn': '9783453314979'}},
[title_test('Die letzten Wächter: Roman', [title_test('Die letzten Wächter: Roman',
@ -1551,7 +1562,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
it_tests = [ # {{{ all_tests['it'] = [ # {{{
( (
{'identifiers': {'isbn': '8838922195'}}, {'identifiers': {'isbn': '8838922195'}},
[title_test('La briscola in cinque', [title_test('La briscola in cinque',
@ -1561,7 +1572,13 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
fr_tests = [ # {{{ all_tests['fr'] = [ # {{{
(
{'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
[title_test('Le secret de Lola', exact=True),
authors_test(['Amélie BRIZIO'])
]
),
( (
{'identifiers': {'isbn': '2221116798'}}, {'identifiers': {'isbn': '2221116798'}},
[title_test('L\'étrange voyage de Monsieur Daldry', [title_test('L\'étrange voyage de Monsieur Daldry',
@ -1571,7 +1588,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
es_tests = [ # {{{ all_tests['es'] = [ # {{{
( (
{'identifiers': {'isbn': '8483460831'}}, {'identifiers': {'isbn': '8483460831'}},
[title_test('Tiempos Interesantes', [title_test('Tiempos Interesantes',
@ -1581,7 +1598,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
jp_tests = [ # {{{ all_tests['jp'] = [ # {{{
( # Adult filtering test ( # Adult filtering test
{'identifiers': {'isbn': '4799500066'}}, {'identifiers': {'isbn': '4799500066'}},
[title_test(u' '), ] [title_test(u' '), ]
@ -1600,7 +1617,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
br_tests = [ # {{{ all_tests['br'] = [ # {{{
( (
{'title': 'Guerra dos Tronos'}, {'title': 'Guerra dos Tronos'},
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo', [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
@ -1610,7 +1627,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
nl_tests = [ # {{{ all_tests['nl'] = [ # {{{
( (
{'title': 'Freakonomics'}, {'title': 'Freakonomics'},
[title_test('Freakonomics', [title_test('Freakonomics',
@ -1620,7 +1637,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
cn_tests = [ # {{{ all_tests['cn'] = [ # {{{
( (
{'identifiers': {'isbn': '9787115369512'}}, {'identifiers': {'isbn': '9787115369512'}},
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True), [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
@ -1635,7 +1652,7 @@ if __name__ == '__main__': # tests {{{
), ),
] # }}} ] # }}}
ca_tests = [ # {{{ all_tests['ca'] = [ # {{{
( # Paperback with series ( # Paperback with series
{'identifiers': {'isbn': '9781623808747'}}, {'identifiers': {'isbn': '9781623808747'}},
[title_test('Parting Shot', exact=True), [title_test('Parting Shot', exact=True),
@ -1655,7 +1672,7 @@ if __name__ == '__main__': # tests {{{
] # }}} ] # }}}
def do_test(domain, start=0, stop=None, server='auto'): def do_test(domain, start=0, stop=None, server='auto'):
tests = globals().get(domain + '_tests') tests = all_tests[domain]
if stop is None: if stop is None:
stop = len(tests) stop = len(tests)
tests = tests[start:stop] tests = tests[start:stop]
@ -1665,6 +1682,5 @@ if __name__ == '__main__': # tests {{{
setattr(p, 'testing_server', server), setattr(p, 'testing_server', server),
)) ))
do_test('com') do_test(domain, **kw)
# do_test('de')
# }}} # }}}