Test and debug the new identify() function

Kovid Goyal 2011-04-05 11:36:55 -06:00
parent 6e98d78dd7
commit 8dd435ecdb
4 changed files with 43 additions and 31 deletions

View File

@@ -20,7 +20,7 @@ from calibre.ebooks.metadata import check_isbn
 msprefs = JSONConfig('metadata_sources.json')
 msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
-msprefs.defaults['max_tags'] = 10
+msprefs.defaults['max_tags'] = 20
 msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds

 def create_log(ostream=None):
@@ -95,7 +95,7 @@ class InternalMetadataCompareKeyGen(object):

 def get_cached_cover_urls(mi):
     from calibre.customize.ui import metadata_plugins
-    plugins = list(metadata_plugins['identify'])
+    plugins = list(metadata_plugins(['identify']))
     for p in plugins:
         url = p.get_cached_cover_url(mi.identifiers)
         if url:
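Both one-line fixes above are the same correction: the old code subscripted metadata_plugins as if it were a dict, while the rest of this commit calls it as a function taking a list of capabilities. A minimal illustrative sketch of that calling convention (dummy names and registry, not calibre's actual implementation):

# Illustrative only: a registry function that yields the plugins whose
# declared capabilities intersect the requested ones, mirroring how
# metadata_plugins(['identify']) is used in the hunks above.
class DummyPlugin(object):
    def __init__(self, name, capabilities):
        self.name = name
        self.capabilities = frozenset(capabilities)

_registry = [
    DummyPlugin('SourceA', ['identify', 'cover']),
    DummyPlugin('SourceB', ['cover']),
]

def metadata_plugins(capabilities):
    capabilities = frozenset(capabilities)
    for plugin in _registry:
        if plugin.capabilities.intersection(capabilities):
            yield plugin

plugins = list(metadata_plugins(['identify']))  # only SourceA qualifies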

View File

@@ -57,13 +57,13 @@ class ISBNMerge(object):

     def isbn_in_pool(self, isbn):
         if isbn:
-            for p in self.pools:
-                if isbn in p:
-                    return p
+            for isbns, pool in self.pools.iteritems():
+                if isbn in isbns:
+                    return pool
         return None

     def pool_has_result_from_same_source(self, pool, result):
-        results = self.pools[pool][1]
+        results = pool[1]
         for r in results:
             if r.identify_plugin is result.identify_plugin:
                 return True
@@ -77,7 +77,7 @@ class ISBNMerge(object):
                 isbns, min_year = xisbn.get_isbn_pool(isbn)
                 if not isbns:
                     isbns = frozenset([isbn])
-                self.pool[isbns] = pool = (min_year, [])
+                self.pools[isbns] = pool = (min_year, [])

             if not self.pool_has_result_from_same_source(pool, result):
                 pool[1].append(result)
@@ -102,7 +102,7 @@ class ISBNMerge(object):

     def merge_isbn_results(self):
         self.results = []
-        for min_year, results in self.pool.itervalues():
+        for min_year, results in self.pools.itervalues():
             if results:
                 self.results.append(self.merge(results, min_year))
@@ -169,11 +169,11 @@ class ISBNMerge(object):
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
-            min_date = datetime(10000, 1, 1, tzinfo=utc_tz)
+            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None and r.pubdate < min_date:
                    min_date = r.pubdate
-            if min_date.year < 10000:
+            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers
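The new sentinel avoids a crash rather than being cosmetic: Python's datetime cannot represent the year 10000 (datetime.MAXYEAR is 9999), so the old fallback raised ValueError whenever no minimum year was known. A quick check:

from datetime import MAXYEAR, datetime

print(MAXYEAR)             # 9999
datetime(3001, 1, 1)       # a safe "far future" sentinel
try:
    datetime(10000, 1, 1)  # what the old sentinel effectively did
except ValueError as exc:
    print(exc)             # year is out of range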
@@ -183,7 +183,7 @@ class ISBNMerge(object):
        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
-            touched_fields |= r.plugin.touched_fields
+            touched_fields |= r.identify_plugin.touched_fields

        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
@@ -210,7 +210,7 @@ def merge_identify_results(result_map, log):

 def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     start_time = time.time()
-    plugins = list(metadata_plugins['identify'])
+    plugins = list(metadata_plugins(['identify']))

     kwargs = {
         'title': title,
@@ -229,7 +229,10 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
         w.start()

     first_result_at = None
-    results = dict.fromkeys(plugins, [])
+    results = {}
+    for p in plugins:
+        results[p] = []
+    logs = dict([(w.plugin, w.buf) for w in workers])

     def get_results():
         found = False
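Replacing dict.fromkeys(plugins, []) also removes a real trap: fromkeys reuses the same list object for every key, so all plugins would have shared a single results list. For example:

shared = dict.fromkeys(['a', 'b'], [])    # one list object behind both keys
shared['a'].append(1)
print(shared['b'])                         # [1] -- the other key sees it too

separate = {k: [] for k in ['a', 'b']}     # a fresh list per key
separate['a'].append(1)
print(separate['b'])                       # []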
@@ -253,28 +256,31 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
         if not is_worker_alive(workers):
             break

-        if (first_result_at is not None and time.time() - first_result_at <
+        if (first_result_at is not None and time.time() - first_result_at >
                 wait_time):
             log('Not waiting any longer for more results')
             abort.set()
             break

-    get_results()
+    while not abort.is_set() and get_results():
+        pass

     sort_kwargs = dict(kwargs)
     for k in list(sort_kwargs.iterkeys()):
         if k not in ('title', 'authors', 'identifiers'):
             sort_kwargs.pop(k)

-    for plugin, results in results.iteritems():
-        results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
-        plog = plugin.buf.getvalue().strip()
+    for plugin, presults in results.iteritems():
+        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
+        plog = logs[plugin].getvalue().strip()
+        log('\n'+'*'*35, plugin.name, '*'*35)
+        log('Request extra headers:', plugin.browser.addheaders)
+        log('Found %d results'%len(presults))
         if plog:
-            log('\n'+'*'*35, plugin.name, '*'*35)
-            log('Found %d results'%len(results))
             log(plog)
         log('\n'+'*'*80)

-        for i, result in enumerate(results):
+        for i, result in enumerate(presults):
             result.relevance_in_source = i
             result.has_cached_cover_url = \
                 plugin.get_cached_cover_url(result.identifiers) is not None
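The flipped comparison is the substantive fix here: with <, the condition became true the instant the first result arrived, so identify() aborted immediately instead of waiting up to wait_after_first_identify_result seconds for slower sources; the new trailing while loop then drains whatever is still queued, unless the caller aborted. A self-contained Python 3 sketch of that wait-then-drain pattern, with illustrative names rather than calibre's API:

import queue
import threading
import time

result_queue = queue.Queue()
abort = threading.Event()

def worker(name, delay):
    # Stand-in for a metadata source that answers after `delay` seconds.
    time.sleep(delay)
    result_queue.put(name)

workers = [threading.Thread(target=worker, args=(n, d), daemon=True)
           for n, d in [('fast', 0.1), ('slow', 0.5), ('too-slow', 5.0)]]
for w in workers:
    w.start()

results = []

def get_results():
    # Pull everything currently queued; report whether anything new arrived.
    found = False
    while True:
        try:
            results.append(result_queue.get_nowait())
            found = True
        except queue.Empty:
            return found

wait_after_first = 1.0   # plays the role of msprefs['wait_after_first_identify_result']
first_result_at = None
while True:
    time.sleep(0.1)
    if get_results() and first_result_at is None:
        first_result_at = time.time()
    if not any(w.is_alive() for w in workers):
        break
    if (first_result_at is not None and
            time.time() - first_result_at > wait_after_first):
        abort.set()      # stop waiting for stragglers
        break

# Drain anything still queued, unless we decided to abort.
while not abort.is_set() and get_results():
    pass

print(results)           # ['fast', 'slow']; 'too-slow' is cut off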
@@ -295,10 +301,10 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):

     dummy = Metadata(_('Unknown'))
     max_tags = msprefs['max_tags']
-    for f in msprefs['ignore_fields']:
-        for r in results:
+    for r in results:
+        for f in msprefs['ignore_fields']:
             setattr(r, f, getattr(dummy, f))
         r.tags = r.tags[:max_tags]

     return results
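Swapping the loop nesting makes the per-result clean-up unconditional: with the old shape, the shipped default of an empty ignore_fields list meant the tag truncation never ran. A toy illustration:

class Result(object):
    def __init__(self):
        self.tags = ['tag%d' % i for i in range(30)]

results = [Result(), Result()]
ignore_fields = []           # the shipped default
max_tags = 20

# Old nesting: the body never runs when ignore_fields is empty,
# so no result ever has its tags truncated.
for f in ignore_fields:
    for r in results:
        r.tags = r.tags[:max_tags]

# New nesting: truncation is applied to every result regardless.
for r in results:
    for f in ignore_fields:
        pass                 # setattr(r, f, ...) would go here
    r.tags = r.tags[:max_tags]

print(len(results[0].tags))  # 20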
@@ -307,8 +313,7 @@ if __name__ == '__main__': # tests {{{
     # src/calibre/ebooks/metadata/sources/identify.py
     from calibre.ebooks.metadata.sources.test import (test_identify,
             title_test, authors_test)
-    test_identify(
-            [
+    tests = [

         ( # An e-book ISBN not on Amazon, one of the authors is
           # unknown to Amazon
@@ -330,14 +335,14 @@ if __name__ == '__main__': # tests {{{

         ( # Sophisticated comment formatting
             {'identifiers':{'isbn': '9781416580829'}},
-            [title_test('Angels & Demons - Movie Tie-In: A Novel',
+            [title_test('Angels & Demons',
                 exact=True), authors_test(['Dan Brown'])]
         ),

         ( # No specific problems
             {'identifiers':{'isbn': '0743273567'}},
             [title_test('The great gatsby', exact=True),
-                authors_test(['F. Scott Fitzgerald'])]
+                authors_test(['Francis Scott Fitzgerald'])]
         ),

         ( # A newer book
@@ -347,6 +352,7 @@ if __name__ == '__main__': # tests {{{
         ),
-    ])
+    ]
+    test_identify(tests[4:5])
 # }}}

View File

@@ -64,6 +64,7 @@ def test_identify(tests): # {{{
     from calibre.ebooks.metadata.sources.identify import identify

     tdir, lf, log, abort = init_test('Full Identify')
+    prints('Log saved to', lf)

     times = []
@@ -129,6 +130,7 @@ def test_identify_plugin(name, tests): # {{{
             plugin = x
             break
     prints('Testing the identify function of', plugin.name)
+    prints('Using extra headers:', plugin.browser.addheaders)

     tdir, lf, log, abort = init_test(plugin.name)
     prints('Log saved to', lf)

View File

@@ -73,7 +73,11 @@ class xISBN(object):

     def get_isbn_pool(self, isbn):
         data = self.get_data(isbn)
-        isbns = frozenset([x.get('isbn') for x in data if 'isbn' in x])
+        raw = tuple(x.get('isbn') for x in data if 'isbn' in x)
+        isbns = []
+        for x in raw:
+            isbns += x
+        isbns = frozenset(isbns)
         min_year = 100000
         for x in data:
             try:
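This change assumes each xisbn record stores its ISBNs as a list, so the old frozenset over the raw values would fail (lists are unhashable) rather than yield a flat pool; the new code flattens first. A sketch with made-up records:

# Made-up xisbn-style records; each edition may list several ISBNs.
data = [
    {'isbn': ['0743273567', '9780743273565'], 'year': '2004'},
    {'isbn': ['1416580824'], 'year': '2000'},
    {'year': '1925'},                     # record without an 'isbn' key
]

# Old code: frozenset([...]) over the list values -> TypeError (unhashable list).
# New code: flatten to individual ISBN strings first.
raw = tuple(x.get('isbn') for x in data if 'isbn' in x)
isbns = []
for x in raw:
    isbns += x
isbns = frozenset(isbns)
print(sorted(isbns))   # the three ISBN strings from the first two records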