diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 08012c3ee8..d306a02bcb 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json')
 msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 10
+msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py
index ab86e8ffa2..87d34c0bff 100644
--- a/src/calibre/ebooks/metadata/sources/identify.py
+++ b/src/calibre/ebooks/metadata/sources/identify.py
@@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import utc_tz
 from calibre.utils.html2text import html2text
 
-# How long to wait for more results after first result is found
-WAIT_AFTER_FIRST_RESULT = 30 # seconds
-
+# Download worker {{{
 class Worker(Thread):
 
     def __init__(self, plugin, kwargs, abort):
@@ -47,99 +45,9 @@ def is_worker_alive(workers):
             return True
     return False
 
-def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
-    start_time = time.time()
-    plugins = list(metadata_plugins['identify'])
-
-    kwargs = {
-        'title': title,
-        'authors': authors,
-        'identifiers': identifiers,
-        'timeout': timeout,
-    }
-
-    log('Running identify query with parameters:')
-    log(kwargs)
-    log('Using plugins:', ', '.join([p.name for p in plugins]))
-    log('The log (if any) from individual plugins is below')
-
-    workers = [Worker(p, kwargs, abort) for p in plugins]
-    for w in workers:
-        w.start()
-
-    first_result_at = None
-    results = dict.fromkeys(plugins, [])
-
-    def get_results():
-        found = False
-        for w in workers:
-            try:
-                result = w.rq.get_nowait()
-            except Empty:
-                pass
-            else:
-                results[w.plugin].append(result)
-                found = True
-        return found
-
-    while True:
-        time.sleep(0.2)
-
-        if get_results() and first_result_at is None:
-            first_result_at = time.time()
-
-        if not is_worker_alive(workers):
-            break
-
-        if (first_result_at is not None and time.time() - first_result_at <
-                WAIT_AFTER_FIRST_RESULT):
-            log('Not waiting any longer for more results')
-            abort.set()
-            break
-
-    get_results()
-    sort_kwargs = dict(kwargs)
-    for k in list(sort_kwargs.iterkeys()):
-        if k not in ('title', 'authors', 'identifiers'):
-            sort_kwargs.pop(k)
-
-    for plugin, results in results.iteritems():
-        results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
-        plog = plugin.buf.getvalue().strip()
-        if plog:
-            log('\n'+'*'*35, plugin.name, '*'*35)
-            log('Found %d results'%len(results))
-            log(plog)
-            log('\n'+'*'*80)
-
-        for i, result in enumerate(results):
-            result.relevance_in_source = i
-            result.has_cached_cover_url = \
-                plugin.get_cached_cover_url(result.identifiers) is not None
-            result.identify_plugin = plugin
-
-    log('The identify phase took %.2f seconds'%(time.time() - start_time))
-    log('Merging results from different sources and finding earliest',
-            'publication dates')
-    start_time = time.time()
-    results = merge_identify_results(results, log)
-    log('We have %d merged results, merging took: %.2f seconds' %
-            (len(results), time.time() - start_time))
-
-    if msprefs['txt_comments']:
-        for r in results:
-            if r.plugin.has_html_comments and r.comments:
-                r.comments = html2text(r.comments)
-
-    dummy = Metadata(_('Unknown'))
-    max_tags = msprefs['max_tags']
-    for f in msprefs['ignore_fields']:
-        for r in results:
-            setattr(r, f, getattr(dummy, f))
-            r.tags = r.tags[:max_tags]
-
-    return results
+# }}}
+
+# Merge results from different sources {{{
 
 class ISBNMerge(object):
@@ -298,6 +206,102 @@ def merge_identify_results(result_map, log):
 
     return isbn_merge.finalize()
 
+# }}}
+
+def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
+    start_time = time.time()
+    plugins = list(metadata_plugins['identify'])
+
+    kwargs = {
+        'title': title,
+        'authors': authors,
+        'identifiers': identifiers,
+        'timeout': timeout,
+    }
+
+    log('Running identify query with parameters:')
+    log(kwargs)
+    log('Using plugins:', ', '.join([p.name for p in plugins]))
+    log('The log (if any) from individual plugins is below')
+
+    workers = [Worker(p, kwargs, abort) for p in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    # One result list per plugin; dict.fromkeys(plugins, []) would make
+    # every plugin share a single list
+    results = dict((p, []) for p in plugins)
+
+    def get_results():
+        found = False
+        for w in workers:
+            try:
+                result = w.rq.get_nowait()
+            except Empty:
+                pass
+            else:
+                results[w.plugin].append(result)
+                found = True
+        return found
+
+    wait_time = msprefs['wait_after_first_identify_result']
+    while True:
+        time.sleep(0.2)
+
+        if get_results() and first_result_at is None:
+            first_result_at = time.time()
+
+        if not is_worker_alive(workers):
+            break
+
+        # Stop waiting once wait_time has elapsed since the first result
+        if (first_result_at is not None and time.time() - first_result_at >
+                wait_time):
+            log('Not waiting any longer for more results')
+            abort.set()
+            break
+
+    get_results()
+    sort_kwargs = dict(kwargs)
+    for k in list(sort_kwargs.iterkeys()):
+        if k not in ('title', 'authors', 'identifiers'):
+            sort_kwargs.pop(k)
+
+    for plugin, presults in results.iteritems():
+        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))
+        plog = plugin.buf.getvalue().strip()
+        if plog:
+            log('\n'+'*'*35, plugin.name, '*'*35)
+            log('Found %d results'%len(presults))
+            log(plog)
+            log('\n'+'*'*80)
+
+        for i, result in enumerate(presults):
+            result.relevance_in_source = i
+            result.has_cached_cover_url = \
+                plugin.get_cached_cover_url(result.identifiers) is not None
+            result.identify_plugin = plugin
+
+    log('The identify phase took %.2f seconds'%(time.time() - start_time))
+    log('Merging results from different sources and finding earliest',
+            'publication dates')
+    start_time = time.time()
+    results = merge_identify_results(results, log)
+    log('We have %d merged results, merging took: %.2f seconds' %
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+
+    max_tags = msprefs['max_tags']
+    for r in results:
+        r.tags = r.tags[:max_tags]
+
+    return results
+
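
Note (not part of the patch): the new knob lives in msprefs, the JSONConfig
instance created in base.py above, so the wait can now be changed without
editing code. A minimal sketch, assuming JSONConfig's usual dict-like,
persist-on-assignment behaviour, run e.g. from a calibre-debug shell:

    from calibre.ebooks.metadata.sources.base import msprefs

    # Stop waiting 10 seconds after the first source returns a result,
    # instead of the default 30. The value is persisted to
    # metadata_sources.json in the calibre config directory.
    msprefs['wait_after_first_identify_result'] = 10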
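
For reference, a hypothetical driver for the relocated identify() API,
assuming a working calibre environment; the title and authors values are
placeholders, and abort is the same Event the download workers poll:

    from threading import Event

    from calibre.ebooks.metadata.sources.base import create_log
    from calibre.ebooks.metadata.sources.identify import identify

    abort = Event()  # set from another thread to cancel all sources
    log = create_log()
    results = identify(log, abort, title='The Great Gatsby',
            authors=['F. Scott Fitzgerald'])
    for mi in results:
        print mi.title, mi.authors  # merged Metadata objects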