Kovid Goyal 2011-04-04 11:14:12 -06:00
parent 3e1a43e86a
commit d1859b0f78
2 changed files with 100 additions and 95 deletions

View File

@@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json')
 msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 10
+msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
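
The hard-coded 30 second wait (removed from the second file below) is now a configurable preference. A minimal sketch of reading and overriding it; the import path is an assumption, since the hunk only shows that msprefs is a JSONConfig backed by metadata_sources.json:

    # Sketch, not part of the commit; the module path is assumed.
    from calibre.ebooks.metadata.sources.base import msprefs

    # Returns the stored value, or the new default of 30 (seconds)
    wait = msprefs['wait_after_first_identify_result']

    # Assignment persists the override, as with any JSONConfig key
    msprefs['wait_after_first_identify_result'] = 10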

View File

@@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import utc_tz
 from calibre.utils.html2text import html2text

-# How long to wait for more results after first result is found
-WAIT_AFTER_FIRST_RESULT = 30 # seconds
+# Download worker {{{

 class Worker(Thread):

     def __init__(self, plugin, kwargs, abort):
@@ -47,99 +45,9 @@ def is_worker_alive(workers):
             return True
     return False

-def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
-    start_time = time.time()
-    plugins = list(metadata_plugins['identify'])
-
-    kwargs = {
-        'title': title,
-        'authors': authors,
-        'identifiers': identifiers,
-        'timeout': timeout,
-    }
-
-    log('Running identify query with parameters:')
-    log(kwargs)
-    log('Using plugins:', ', '.join([p.name for p in plugins]))
-    log('The log (if any) from individual plugins is below')
-
-    workers = [Worker(p, kwargs, abort) for p in plugins]
-    for w in workers:
-        w.start()
-
-    first_result_at = None
-    results = dict((p, []) for p in plugins)
-
-    def get_results():
-        found = False
-        for w in workers:
-            try:
-                result = w.rq.get_nowait()
-            except Empty:
-                pass
-            else:
-                results[w.plugin].append(result)
-                found = True
-        return found
-
-    while True:
-        time.sleep(0.2)
-
-        if get_results() and first_result_at is None:
-            first_result_at = time.time()
-
-        if not is_worker_alive(workers):
-            break
-
-        if (first_result_at is not None and time.time() - first_result_at >
-                WAIT_AFTER_FIRST_RESULT):
-            log('Not waiting any longer for more results')
-            abort.set()
-            break
-
-    get_results()
-
-    sort_kwargs = dict(kwargs)
-    for k in list(sort_kwargs.iterkeys()):
-        if k not in ('title', 'authors', 'identifiers'):
-            sort_kwargs.pop(k)
-
-    for plugin, presults in results.iteritems():
-        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
-        plog = plugin.buf.getvalue().strip()
-        if plog:
-            log('\n'+'*'*35, plugin.name, '*'*35)
-            log('Found %d results'%len(presults))
-            log(plog)
-            log('\n'+'*'*80)
-        for i, result in enumerate(presults):
-            result.relevance_in_source = i
-            result.has_cached_cover_url = \
-                plugin.get_cached_cover_url(result.identifiers) is not None
-            result.identify_plugin = plugin
-
-    log('The identify phase took %.2f seconds'%(time.time() - start_time))
-    log('Merging results from different sources and finding earliest',
-            'publication dates')
-    start_time = time.time()
-    results = merge_identify_results(results, log)
-    log('We have %d merged results, merging took: %.2f seconds' %
-            (len(results), time.time() - start_time))
-
-    if msprefs['txt_comments']:
-        for r in results:
-            if r.plugin.has_html_comments and r.comments:
-                r.comments = html2text(r.comments)
-
-    dummy = Metadata(_('Unknown'))
-    max_tags = msprefs['max_tags']
-    for f in msprefs['ignore_fields']:
-        for r in results:
-            setattr(r, f, getattr(dummy, f))
-
-    for r in results:
-        r.tags = r.tags[:max_tags]
-
-    return results
+# }}}

 # Merge results from different sources {{{

 class ISBNMerge(object):
@@ -298,6 +206,102 @@ def merge_identify_results(result_map, log):
     return isbn_merge.finalize()

 # }}}

+def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
+    start_time = time.time()
+    plugins = list(metadata_plugins['identify'])
+
+    kwargs = {
+        'title': title,
+        'authors': authors,
+        'identifiers': identifiers,
+        'timeout': timeout,
+    }
+
+    log('Running identify query with parameters:')
+    log(kwargs)
+    log('Using plugins:', ', '.join([p.name for p in plugins]))
+    log('The log (if any) from individual plugins is below')
+
+    workers = [Worker(p, kwargs, abort) for p in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    results = dict((p, []) for p in plugins)
+
+    def get_results():
+        found = False
+        for w in workers:
+            try:
+                result = w.rq.get_nowait()
+            except Empty:
+                pass
+            else:
+                results[w.plugin].append(result)
+                found = True
+        return found
+
+    wait_time = msprefs['wait_after_first_identify_result']
+    while True:
+        time.sleep(0.2)
+
+        if get_results() and first_result_at is None:
+            first_result_at = time.time()
+
+        if not is_worker_alive(workers):
+            break
+
+        if (first_result_at is not None and time.time() - first_result_at >
+                wait_time):
+            log('Not waiting any longer for more results')
+            abort.set()
+            break
+
+    get_results()
+
+    sort_kwargs = dict(kwargs)
+    for k in list(sort_kwargs.iterkeys()):
+        if k not in ('title', 'authors', 'identifiers'):
+            sort_kwargs.pop(k)
+
+    for plugin, presults in results.iteritems():
+        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
+        plog = plugin.buf.getvalue().strip()
+        if plog:
+            log('\n'+'*'*35, plugin.name, '*'*35)
+            log('Found %d results'%len(presults))
+            log(plog)
+            log('\n'+'*'*80)
+        for i, result in enumerate(presults):
+            result.relevance_in_source = i
+            result.has_cached_cover_url = \
+                plugin.get_cached_cover_url(result.identifiers) is not None
+            result.identify_plugin = plugin
+
+    log('The identify phase took %.2f seconds'%(time.time() - start_time))
+    log('Merging results from different sources and finding earliest',
+            'publication dates')
+    start_time = time.time()
+    results = merge_identify_results(results, log)
+    log('We have %d merged results, merging took: %.2f seconds' %
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+
+    for r in results:
+        r.tags = r.tags[:max_tags]
+
+    return results
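
The polling loop in identify() implements a grace-period policy: drain results every 200 ms, and once the first result arrives, give the remaining sources at most wait_time further seconds before setting the shared abort event. A self-contained sketch of the same policy, with illustrative names (poll and workers_alive stand in for get_results and is_worker_alive):

    import time

    def wait_with_grace_period(poll, workers_alive, wait_time, abort):
        # poll() drains any finished results, returning True if one arrived;
        # workers_alive() reports whether a source thread is still running;
        # abort is a threading.Event shared with the worker threads.
        first_result_at = None
        while True:
            time.sleep(0.2)
            if poll() and first_result_at is None:
                first_result_at = time.time()
            if not workers_alive():
                break
            if (first_result_at is not None and
                    time.time() - first_result_at > wait_time):
                abort.set()  # signal the slow sources to stop
                break

Setting abort before breaking means the slow sources see the event and can exit cleanly instead of being orphaned.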
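For reference, a hypothetical driver for the relocated identify(); the module paths are assumptions based on the two files in this commit, and the title and author are placeholders:

    from threading import Event

    # Assumed module paths; the diff does not show file names
    from calibre.ebooks.metadata.sources.base import create_log
    from calibre.ebooks.metadata.sources.identify import identify

    abort = Event()
    log = create_log()  # create_log() appears in the first file above
    results = identify(log, abort, title='Some Title',
                       authors=['Some Author'], timeout=30)
    for mi in results:
        log(mi.title, mi.authors)

Because abort is an Event, a caller can also cancel the whole query from another thread with abort.set(), the same mechanism identify() uses once the wait window closes.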