mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Amazon metadata: Make title processing more robust
Also make it easier to run tests selectively from the command line
This commit is contained in:
parent
c485690a23
commit
18fd3ae063
@ -475,13 +475,26 @@ class Worker(Thread): # Get details {{{
|
|||||||
return self.tostring(elem, encoding='unicode', method='text').strip()
|
return self.tostring(elem, encoding='unicode', method='text').strip()
|
||||||
|
|
||||||
def parse_title(self, root):
|
def parse_title(self, root):
|
||||||
|
|
||||||
|
def sanitize_title(title):
|
||||||
|
ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
|
||||||
|
if not ans:
|
||||||
|
ans = title.rpartition('[')[0].strip()
|
||||||
|
return ans
|
||||||
|
|
||||||
h1 = root.xpath('//h1[@id="title"]')
|
h1 = root.xpath('//h1[@id="title"]')
|
||||||
if h1:
|
if h1:
|
||||||
h1 = h1[0]
|
h1 = h1[0]
|
||||||
for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
|
for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
|
||||||
h1.remove(child)
|
h1.remove(child)
|
||||||
return self.totext(h1)
|
return sanitize_title(self.totext(h1))
|
||||||
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')[0]
|
tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
|
||||||
|
if not tdiv:
|
||||||
|
span = root.xpath('//*[id="ebooksTitle"]')
|
||||||
|
if span:
|
||||||
|
return sanitize_title(self.totext(span[0]))
|
||||||
|
raise ValueError('No title block found')
|
||||||
|
tdiv = tdiv[0]
|
||||||
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
|
actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
|
||||||
if actual_title:
|
if actual_title:
|
||||||
title = self.tostring(actual_title[0], encoding='unicode',
|
title = self.tostring(actual_title[0], encoding='unicode',
|
||||||
@ -489,10 +502,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
else:
|
else:
|
||||||
title = self.tostring(tdiv, encoding='unicode',
|
title = self.tostring(tdiv, encoding='unicode',
|
||||||
method='text').strip()
|
method='text').strip()
|
||||||
ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
|
return sanitize_title(title)
|
||||||
if not ans:
|
|
||||||
ans = title.rpartition('[')[0].strip()
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def parse_authors(self, root):
|
def parse_authors(self, root):
|
||||||
for sel in (
|
for sel in (
|
||||||
@ -851,7 +861,7 @@ class Worker(Thread): # Get details {{{
|
|||||||
class Amazon(Source):
|
class Amazon(Source):
|
||||||
|
|
||||||
name = 'Amazon.com'
|
name = 'Amazon.com'
|
||||||
version = (1, 2, 6)
|
version = (1, 2, 7)
|
||||||
minimum_calibre_version = (2, 82, 0)
|
minimum_calibre_version = (2, 82, 0)
|
||||||
description = _('Downloads metadata and covers from Amazon')
|
description = _('Downloads metadata and covers from Amazon')
|
||||||
|
|
||||||
@ -1480,12 +1490,13 @@ class Amazon(Source):
|
|||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': # tests {{{
|
def manual_tests(domain, **kw): # {{{
|
||||||
# To run these tests use: calibre-debug
|
# To run these tests use:
|
||||||
# src/calibre/ebooks/metadata/sources/amazon.py
|
# calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
|
||||||
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
|
||||||
isbn_test, title_test, authors_test, comments_test, series_test)
|
isbn_test, title_test, authors_test, comments_test, series_test)
|
||||||
com_tests = [ # {{{
|
all_tests = {}
|
||||||
|
all_tests['com'] = [ # {{{
|
||||||
|
|
||||||
( # Paperback with series
|
( # Paperback with series
|
||||||
{'identifiers': {'amazon': '1423146786'}},
|
{'identifiers': {'amazon': '1423146786'}},
|
||||||
@ -1533,7 +1544,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
de_tests = [ # {{{
|
all_tests['de'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers': {'isbn': '9783453314979'}},
|
{'identifiers': {'isbn': '9783453314979'}},
|
||||||
[title_test('Die letzten Wächter: Roman',
|
[title_test('Die letzten Wächter: Roman',
|
||||||
@ -1551,7 +1562,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
it_tests = [ # {{{
|
all_tests['it'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers': {'isbn': '8838922195'}},
|
{'identifiers': {'isbn': '8838922195'}},
|
||||||
[title_test('La briscola in cinque',
|
[title_test('La briscola in cinque',
|
||||||
@ -1561,7 +1572,13 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
fr_tests = [ # {{{
|
all_tests['fr'] = [ # {{{
|
||||||
|
(
|
||||||
|
{'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
|
||||||
|
[title_test('Le secret de Lola', exact=True),
|
||||||
|
authors_test(['Amélie BRIZIO'])
|
||||||
|
]
|
||||||
|
),
|
||||||
(
|
(
|
||||||
{'identifiers': {'isbn': '2221116798'}},
|
{'identifiers': {'isbn': '2221116798'}},
|
||||||
[title_test('L\'étrange voyage de Monsieur Daldry',
|
[title_test('L\'étrange voyage de Monsieur Daldry',
|
||||||
@ -1571,7 +1588,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
es_tests = [ # {{{
|
all_tests['es'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers': {'isbn': '8483460831'}},
|
{'identifiers': {'isbn': '8483460831'}},
|
||||||
[title_test('Tiempos Interesantes',
|
[title_test('Tiempos Interesantes',
|
||||||
@ -1581,7 +1598,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
jp_tests = [ # {{{
|
all_tests['jp'] = [ # {{{
|
||||||
( # Adult filtering test
|
( # Adult filtering test
|
||||||
{'identifiers': {'isbn': '4799500066'}},
|
{'identifiers': {'isbn': '4799500066'}},
|
||||||
[title_test(u'Bitch Trap'), ]
|
[title_test(u'Bitch Trap'), ]
|
||||||
@ -1600,7 +1617,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
br_tests = [ # {{{
|
all_tests['br'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'title': 'Guerra dos Tronos'},
|
{'title': 'Guerra dos Tronos'},
|
||||||
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
|
[title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
|
||||||
@ -1610,7 +1627,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
nl_tests = [ # {{{
|
all_tests['nl'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'title': 'Freakonomics'},
|
{'title': 'Freakonomics'},
|
||||||
[title_test('Freakonomics',
|
[title_test('Freakonomics',
|
||||||
@ -1620,7 +1637,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
cn_tests = [ # {{{
|
all_tests['cn'] = [ # {{{
|
||||||
(
|
(
|
||||||
{'identifiers': {'isbn': '9787115369512'}},
|
{'identifiers': {'isbn': '9787115369512'}},
|
||||||
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
|
[title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
|
||||||
@ -1635,7 +1652,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
),
|
),
|
||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
ca_tests = [ # {{{
|
all_tests['ca'] = [ # {{{
|
||||||
( # Paperback with series
|
( # Paperback with series
|
||||||
{'identifiers': {'isbn': '9781623808747'}},
|
{'identifiers': {'isbn': '9781623808747'}},
|
||||||
[title_test('Parting Shot', exact=True),
|
[title_test('Parting Shot', exact=True),
|
||||||
@ -1655,7 +1672,7 @@ if __name__ == '__main__': # tests {{{
|
|||||||
] # }}}
|
] # }}}
|
||||||
|
|
||||||
def do_test(domain, start=0, stop=None, server='auto'):
|
def do_test(domain, start=0, stop=None, server='auto'):
|
||||||
tests = globals().get(domain + '_tests')
|
tests = all_tests[domain]
|
||||||
if stop is None:
|
if stop is None:
|
||||||
stop = len(tests)
|
stop = len(tests)
|
||||||
tests = tests[start:stop]
|
tests = tests[start:stop]
|
||||||
@ -1665,6 +1682,5 @@ if __name__ == '__main__': # tests {{{
|
|||||||
setattr(p, 'testing_server', server),
|
setattr(p, 'testing_server', server),
|
||||||
))
|
))
|
||||||
|
|
||||||
do_test('com')
|
do_test(domain, **kw)
|
||||||
# do_test('de')
|
|
||||||
# }}}
|
# }}}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user