mirror of https://github.com/kovidgoyal/calibre.git
This commit is contained in:
parent 3e1a43e86a
commit d1859b0f78
@@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json')
 msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 10
+msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
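This hunk adds a new default, wait_after_first_identify_result, alongside the existing metadata-source preferences. Since msprefs is a JSONConfig store, the value can be read and overridden like a dict entry. A minimal sketch, assuming the usual calibre.utils.config.JSONConfig behaviour; only the preference name and its 30-second default come from this hunk, the rest is illustrative:

# Minimal sketch of how the new default is expected to be used.
from calibre.utils.config import JSONConfig

msprefs = JSONConfig('metadata_sources.json')
msprefs.defaults['wait_after_first_identify_result'] = 30  # seconds

timeout = msprefs['wait_after_first_identify_result']  # -> 30 until overridden
msprefs['wait_after_first_identify_result'] = 60       # persisted to the JSON file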
@@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata
 from calibre.utils.date import utc_tz
 from calibre.utils.html2text import html2text

-# How long to wait for more results after first result is found
-WAIT_AFTER_FIRST_RESULT = 30 # seconds
-
+# Download worker {{{
 class Worker(Thread):

     def __init__(self, plugin, kwargs, abort):
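The Worker referenced in the context lines runs a single metadata source plugin on its own thread and exposes a result queue (rq, polled with get_nowait() further down) plus a per-plugin log buffer. Its real body is not part of this diff; the following is only a rough sketch of that shape, with the search call and queue handling assumed:

# Rough sketch only -- not the calibre implementation. Worker(plugin, kwargs,
# abort) and the rq attribute are taken from this diff; everything else,
# including plugin.search(), is a hypothetical stand-in.
from Queue import Queue          # Python 2 module name, matching this file
from threading import Thread

class Worker(Thread):

    def __init__(self, plugin, kwargs, abort):
        Thread.__init__(self)
        self.daemon = True
        self.plugin, self.kwargs, self.abort = plugin, kwargs, abort
        self.rq = Queue()        # identify() drains this with get_nowait()

    def run(self):
        # Ask the source plugin for candidate metadata and hand each result
        # over as soon as it arrives, stopping early if the caller aborts.
        for result in self.plugin.search(**self.kwargs):
            if self.abort.is_set():
                break
            self.rq.put(result)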
@@ -47,99 +45,9 @@ def is_worker_alive(workers):
             return True
     return False

-
-def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
-    start_time = time.time()
-    plugins = list(metadata_plugins['identify'])
-
-    kwargs = {
-        'title': title,
-        'authors': authors,
-        'identifiers': identifiers,
-        'timeout': timeout,
-    }
-
-    log('Running identify query with parameters:')
-    log(kwargs)
-    log('Using plugins:', ', '.join([p.name for p in plugins]))
-    log('The log (if any) from individual plugins is below')
-
-    workers = [Worker(p, kwargs, abort) for p in plugins]
-    for w in workers:
-        w.start()
-
-    first_result_at = None
-    results = dict.fromkeys(plugins, [])
-
-    def get_results():
-        found = False
-        for w in workers:
-            try:
-                result = w.rq.get_nowait()
-            except Empty:
-                pass
-            else:
-                results[w.plugin].append(result)
-                found = True
-        return found
-
-    while True:
-        time.sleep(0.2)
-
-        if get_results() and first_result_at is None:
-            first_result_at = time.time()
-
-        if not is_worker_alive(workers):
-            break
-
-        if (first_result_at is not None and time.time() - first_result_at <
-                WAIT_AFTER_FIRST_RESULT):
-            log('Not waiting any longer for more results')
-            abort.set()
-            break
-
-    get_results()
-    sort_kwargs = dict(kwargs)
-    for k in list(sort_kwargs.iterkeys()):
-        if k not in ('title', 'authors', 'identifiers'):
-            sort_kwargs.pop(k)
-
-    for plugin, results in results.iteritems():
-        results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
-        plog = plugin.buf.getvalue().strip()
-        if plog:
-            log('\n'+'*'*35, plugin.name, '*'*35)
-            log('Found %d results'%len(results))
-            log(plog)
-            log('\n'+'*'*80)
-
-        for i, result in enumerate(results):
-            result.relevance_in_source = i
-            result.has_cached_cover_url = \
-                    plugin.get_cached_cover_url(result.identifiers) is not None
-            result.identify_plugin = plugin
-
-    log('The identify phase took %.2f seconds'%(time.time() - start_time))
-    log('Merging results from different sources and finding earliest',
-            'publication dates')
-    start_time = time.time()
-    results = merge_identify_results(results, log)
-    log('We have %d merged results, merging took: %.2f seconds' %
-            (len(results), time.time() - start_time))
-
-    if msprefs['txt_comments']:
-        for r in results:
-            if r.plugin.has_html_comments and r.comments:
-                r.comments = html2text(r.comments)
-
-    dummy = Metadata(_('Unknown'))
-    max_tags = msprefs['max_tags']
-    for f in msprefs['ignore_fields']:
-        for r in results:
-            setattr(r, f, getattr(dummy, f))
-            r.tags = r.tags[:max_tags]
-
-    return results
-
+# }}}
+
+# Merge results from different sources {{{
+
 class ISBNMerge(object):

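Inside the removed (and, in the next hunk, re-added) identify(), the get_results() helper drains each worker's queue without blocking by calling get_nowait() and treating Queue.Empty as "nothing new yet". A stand-alone sketch of that pattern, with every name other than get_nowait()/Empty illustrative:

# Stand-alone sketch of the non-blocking drain used by get_results().
from Queue import Queue, Empty   # Python 2 module name, matching this file

def drain(queues, collected):
    # Move everything currently available from each queue into `collected`.
    # Returns True if at least one new item was picked up on this poll.
    found = False
    for q in queues:
        while True:
            try:
                item = q.get_nowait()   # raises Empty instead of blocking
            except Empty:
                break
            collected.append(item)
            found = True
    return found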
@@ -298,6 +206,102 @@ def merge_identify_results(result_map, log):

     return isbn_merge.finalize()

+# }}}
+
+def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
+    start_time = time.time()
+    plugins = list(metadata_plugins['identify'])
+
+    kwargs = {
+        'title': title,
+        'authors': authors,
+        'identifiers': identifiers,
+        'timeout': timeout,
+    }
+
+    log('Running identify query with parameters:')
+    log(kwargs)
+    log('Using plugins:', ', '.join([p.name for p in plugins]))
+    log('The log (if any) from individual plugins is below')
+
+    workers = [Worker(p, kwargs, abort) for p in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    results = dict.fromkeys(plugins, [])
+
+    def get_results():
+        found = False
+        for w in workers:
+            try:
+                result = w.rq.get_nowait()
+            except Empty:
+                pass
+            else:
+                results[w.plugin].append(result)
+                found = True
+        return found
+
+    wait_time = msprefs['wait_after_first_identify_result']
+    while True:
+        time.sleep(0.2)
+
+        if get_results() and first_result_at is None:
+            first_result_at = time.time()
+
+        if not is_worker_alive(workers):
+            break
+
+        if (first_result_at is not None and time.time() - first_result_at <
+                wait_time):
+            log('Not waiting any longer for more results')
+            abort.set()
+            break
+
+    get_results()
+    sort_kwargs = dict(kwargs)
+    for k in list(sort_kwargs.iterkeys()):
+        if k not in ('title', 'authors', 'identifiers'):
+            sort_kwargs.pop(k)
+
+    for plugin, results in results.iteritems():
+        results.sort(key=plugin.identify_results_keygen(**sort_kwargs))
+        plog = plugin.buf.getvalue().strip()
+        if plog:
+            log('\n'+'*'*35, plugin.name, '*'*35)
+            log('Found %d results'%len(results))
+            log(plog)
+            log('\n'+'*'*80)
+
+        for i, result in enumerate(results):
+            result.relevance_in_source = i
+            result.has_cached_cover_url = \
+                    plugin.get_cached_cover_url(result.identifiers) is not None
+            result.identify_plugin = plugin
+
+    log('The identify phase took %.2f seconds'%(time.time() - start_time))
+    log('Merging results from different sources and finding earliest',
+            'publication dates')
+    start_time = time.time()
+    results = merge_identify_results(results, log)
+    log('We have %d merged results, merging took: %.2f seconds' %
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results
+
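With the merge code above it in the file, identify() is the entry point that ties the worker threads, the polling loop and merge_identify_results() together. A hypothetical driver, where only the function signatures come from this diff and the module paths plus the sample query are assumptions (the real callers live elsewhere in calibre):

# Hypothetical driver for identify().
from threading import Event

from calibre.ebooks.metadata.sources.base import create_log
from calibre.ebooks.metadata.sources.identify import identify

abort = Event()        # workers and the polling loop stop once this is set
log = create_log()     # thread-safe log shared by all source plugins

results = identify(log, abort, title='A Sample Title',
        authors=['Some Author'], timeout=30)
for mi in results:
    print('%s -- %s' % (mi.title, ', '.join(mi.authors or [])))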