mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Test and debug the new identify() function
This commit is contained in:
parent
6e98d78dd7
commit
8dd435ecdb
@ -20,7 +20,7 @@ from calibre.ebooks.metadata import check_isbn
|
||||
msprefs = JSONConfig('metadata_sources.json')
|
||||
msprefs.defaults['txt_comments'] = False
|
||||
msprefs.defaults['ignore_fields'] = []
|
||||
msprefs.defaults['max_tags'] = 10
|
||||
msprefs.defaults['max_tags'] = 20
|
||||
msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
|
||||
|
||||
def create_log(ostream=None):
|
||||
@ -95,7 +95,7 @@ class InternalMetadataCompareKeyGen(object):
|
||||
|
||||
def get_cached_cover_urls(mi):
|
||||
from calibre.customize.ui import metadata_plugins
|
||||
plugins = list(metadata_plugins['identify'])
|
||||
plugins = list(metadata_plugins(['identify']))
|
||||
for p in plugins:
|
||||
url = p.get_cached_cover_url(mi.identifiers)
|
||||
if url:
|
||||
|
@ -57,13 +57,13 @@ class ISBNMerge(object):
|
||||
|
||||
def isbn_in_pool(self, isbn):
|
||||
if isbn:
|
||||
for p in self.pools:
|
||||
if isbn in p:
|
||||
return p
|
||||
for isbns, pool in self.pools.iteritems():
|
||||
if isbn in isbns:
|
||||
return pool
|
||||
return None
|
||||
|
||||
def pool_has_result_from_same_source(self, pool, result):
|
||||
results = self.pools[pool][1]
|
||||
results = pool[1]
|
||||
for r in results:
|
||||
if r.identify_plugin is result.identify_plugin:
|
||||
return True
|
||||
@ -77,7 +77,7 @@ class ISBNMerge(object):
|
||||
isbns, min_year = xisbn.get_isbn_pool(isbn)
|
||||
if not isbns:
|
||||
isbns = frozenset([isbn])
|
||||
self.pool[isbns] = pool = (min_year, [])
|
||||
self.pools[isbns] = pool = (min_year, [])
|
||||
|
||||
if not self.pool_has_result_from_same_source(pool, result):
|
||||
pool[1].append(result)
|
||||
@ -102,7 +102,7 @@ class ISBNMerge(object):
|
||||
|
||||
def merge_isbn_results(self):
|
||||
self.results = []
|
||||
for min_year, results in self.pool.itervalues():
|
||||
for min_year, results in self.pools.itervalues():
|
||||
if results:
|
||||
self.results.append(self.merge(results, min_year))
|
||||
|
||||
@ -169,11 +169,11 @@ class ISBNMerge(object):
|
||||
min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
|
||||
ans.pubdate = min_date
|
||||
else:
|
||||
min_date = datetime(10000, 1, 1, tzinfo=utc_tz)
|
||||
min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
|
||||
for r in results:
|
||||
if r.pubdate is not None and r.pubdate < min_date:
|
||||
min_date = r.pubdate
|
||||
if min_date.year < 10000:
|
||||
if min_date.year < 3000:
|
||||
ans.pubdate = min_date
|
||||
|
||||
# Identifiers
|
||||
@ -183,7 +183,7 @@ class ISBNMerge(object):
|
||||
# Merge any other fields with no special handling (random merge)
|
||||
touched_fields = set()
|
||||
for r in results:
|
||||
touched_fields |= r.plugin.touched_fields
|
||||
touched_fields |= r.identify_plugin.touched_fields
|
||||
|
||||
for f in touched_fields:
|
||||
if f.startswith('identifier:') or not ans.is_null(f):
|
||||
@ -210,7 +210,7 @@ def merge_identify_results(result_map, log):
|
||||
|
||||
def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
|
||||
start_time = time.time()
|
||||
plugins = list(metadata_plugins['identify'])
|
||||
plugins = list(metadata_plugins(['identify']))
|
||||
|
||||
kwargs = {
|
||||
'title': title,
|
||||
@ -229,7 +229,10 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
|
||||
w.start()
|
||||
|
||||
first_result_at = None
|
||||
results = dict.fromkeys(plugins, [])
|
||||
results = {}
|
||||
for p in plugins:
|
||||
results[p] = []
|
||||
logs = dict([(w.plugin, w.buf) for w in workers])
|
||||
|
||||
def get_results():
|
||||
found = False
|
||||
@ -253,28 +256,31 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
|
||||
if not is_worker_alive(workers):
|
||||
break
|
||||
|
||||
if (first_result_at is not None and time.time() - first_result_at <
|
||||
if (first_result_at is not None and time.time() - first_result_at >
|
||||
wait_time):
|
||||
log('Not waiting any longer for more results')
|
||||
abort.set()
|
||||
break
|
||||
|
||||
get_results()
|
||||
while not abort.is_set() and get_results():
|
||||
pass
|
||||
|
||||
sort_kwargs = dict(kwargs)
|
||||
for k in list(sort_kwargs.iterkeys()):
|
||||
if k not in ('title', 'authors', 'identifiers'):
|
||||
sort_kwargs.pop(k)
|
||||
|
||||
for plugin, results in results.iteritems():
|
||||
results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
|
||||
plog = plugin.buf.getvalue().strip()
|
||||
for plugin, presults in results.iteritems():
|
||||
presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
|
||||
plog = logs[plugin].getvalue().strip()
|
||||
log('\n'+'*'*35, plugin.name, '*'*35)
|
||||
log('Request extra headers:', plugin.browser.addheaders)
|
||||
log('Found %d results'%len(presults))
|
||||
if plog:
|
||||
log('\n'+'*'*35, plugin.name, '*'*35)
|
||||
log('Found %d results'%len(results))
|
||||
log(plog)
|
||||
log('\n'+'*'*80)
|
||||
log('\n'+'*'*80)
|
||||
|
||||
for i, result in enumerate(results):
|
||||
for i, result in enumerate(presults):
|
||||
result.relevance_in_source = i
|
||||
result.has_cached_cover_url = \
|
||||
plugin.get_cached_cover_url(result.identifiers) is not None
|
||||
@ -295,10 +301,10 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
|
||||
|
||||
dummy = Metadata(_('Unknown'))
|
||||
max_tags = msprefs['max_tags']
|
||||
for f in msprefs['ignore_fields']:
|
||||
for r in results:
|
||||
for r in results:
|
||||
for f in msprefs['ignore_fields']:
|
||||
setattr(r, f, getattr(dummy, f))
|
||||
r.tags = r.tags[:max_tags]
|
||||
r.tags = r.tags[:max_tags]
|
||||
|
||||
return results
|
||||
|
||||
@ -307,8 +313,7 @@ if __name__ == '__main__': # tests {{{
|
||||
# src/calibre/ebooks/metadata/sources/identify.py
|
||||
from calibre.ebooks.metadata.sources.test import (test_identify,
|
||||
title_test, authors_test)
|
||||
test_identify(
|
||||
[
|
||||
tests = [
|
||||
|
||||
( # An e-book ISBN not on Amazon, one of the authors is
|
||||
# unknown to Amazon
|
||||
@ -330,14 +335,14 @@ if __name__ == '__main__': # tests {{{
|
||||
|
||||
( # Sophisticated comment formatting
|
||||
{'identifiers':{'isbn': '9781416580829'}},
|
||||
[title_test('Angels & Demons - Movie Tie-In: A Novel',
|
||||
[title_test('Angels & Demons',
|
||||
exact=True), authors_test(['Dan Brown'])]
|
||||
),
|
||||
|
||||
( # No specific problems
|
||||
{'identifiers':{'isbn': '0743273567'}},
|
||||
[title_test('The great gatsby', exact=True),
|
||||
authors_test(['F. Scott Fitzgerald'])]
|
||||
authors_test(['Francis Scott Fitzgerald'])]
|
||||
),
|
||||
|
||||
( # A newer book
|
||||
@ -347,6 +352,7 @@ if __name__ == '__main__': # tests {{{
|
||||
|
||||
),
|
||||
|
||||
])
|
||||
]
|
||||
test_identify(tests[4:5])
|
||||
# }}}
|
||||
|
||||
|
@ -64,6 +64,7 @@ def test_identify(tests): # {{{
|
||||
from calibre.ebooks.metadata.sources.identify import identify
|
||||
|
||||
tdir, lf, log, abort = init_test('Full Identify')
|
||||
prints('Log saved to', lf)
|
||||
|
||||
times = []
|
||||
|
||||
@ -129,6 +130,7 @@ def test_identify_plugin(name, tests): # {{{
|
||||
plugin = x
|
||||
break
|
||||
prints('Testing the identify function of', plugin.name)
|
||||
prints('Using extra headers:', plugin.browser.addheaders)
|
||||
|
||||
tdir, lf, log, abort = init_test(plugin.name)
|
||||
prints('Log saved to', lf)
|
||||
|
@ -73,7 +73,11 @@ class xISBN(object):
|
||||
|
||||
def get_isbn_pool(self, isbn):
|
||||
data = self.get_data(isbn)
|
||||
isbns = frozenset([x.get('isbn') for x in data if 'isbn' in x])
|
||||
raw = tuple(x.get('isbn') for x in data if 'isbn' in x)
|
||||
isbns = []
|
||||
for x in raw:
|
||||
isbns += x
|
||||
isbns = frozenset(isbns)
|
||||
min_year = 100000
|
||||
for x in data:
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user