Add cover downloading to the new fetch-ebook-metadata

Kovid Goyal 2011-04-05 23:16:59 -06:00
parent 2828ba5276
commit 6773cf71af
6 changed files with 224 additions and 15 deletions

@@ -279,7 +279,7 @@ class Worker(Thread): # Get details {{{

 class Amazon(Source):

-    name = 'Amazon'
+    name = 'Amazon Metadata'
     description = _('Downloads metadata from Amazon')
     capabilities = frozenset(['identify', 'cover'])

@@ -493,9 +493,10 @@ class Amazon(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
 # }}}
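
The Amazon plugin (and the Google plugin below) now follow the same shape: look up a cached cover URL, fetch it, and put a (self, cover_data) tuple on the queue so the aggregator can tell which plugin produced each cover. A minimal sketch of what any other Source subclass would do under the new convention (ExampleSource is hypothetical; the helper methods are the ones the real plugins in this commit use):

    class ExampleSource(Source):  # hypothetical third-party plugin

        name = 'Example'
        capabilities = frozenset(['identify', 'cover'])

        def download_cover(self, log, result_queue, abort, title=None,
                authors=None, identifiers={}, timeout=30):
            cached_url = self.get_cached_cover_url(identifiers)
            if cached_url is None or abort.is_set():
                return
            log('Downloading cover from:', cached_url)
            try:
                cdata = self.browser.open_novisit(cached_url,
                        timeout=timeout).read()
                # New in this commit: queue (plugin, data), not bare data
                result_queue.put((self, cdata))
            except:
                log.exception('Failed to download cover from:', cached_url)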

@@ -22,6 +22,12 @@ msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 20
 msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
+msprefs.defaults['wait_after_first_cover_result'] = 60 # seconds
+
+# Google covers are often poor quality (scans/errors) but they have high
+# resolution, so they trump covers from better sources. So make sure they
+# are only used if no other covers are found.
+msprefs.defaults['cover_priorities'] = {'Google':2}

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)

@@ -340,7 +346,8 @@ class Source(Plugin):
             title=None, authors=None, identifiers={}, timeout=30):
         '''
         Download a cover and put it into result_queue. The parameters all have
-        the same meaning as for :meth:`identify`.
+        the same meaning as for :meth:`identify`. Put (self, cover_data) into
+        result_queue.

         This method should use cached cover URLs for efficiency whenever
         possible. When cached data is not present, most plugins simply call
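
The effect of cover_priorities is easiest to see against the sort key that covers.py (added below) builds from it: a lower priority number always wins, and image resolution only breaks ties within the same priority level. A small illustrative sketch (the plugin names and image sizes are made up):

    cp = {'Google': 2}  # msprefs['cover_priorities']

    def keygen(result):
        name, width, height = result
        # Primary key: priority (default 1); secondary: inverse pixel
        # count, so bigger covers sort first within a priority level.
        return (cp.get(name, 1), 1.0/(width*height))

    results = [('Google', 1200, 1800), ('Amazon Metadata', 500, 700)]
    results.sort(key=keygen)
    print(results[0][0])  # 'Amazon Metadata', despite the smaller image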

@@ -13,10 +13,13 @@ from threading import Event

 from calibre import prints
 from calibre.utils.config import OptionParser
+from calibre.utils.magick.draw import save_cover_data_to
 from calibre.ebooks.metadata import string_to_authors
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ebooks.metadata.sources.base import create_log
 from calibre.ebooks.metadata.sources.identify import identify
+from calibre.ebooks.metadata.sources.covers import download_cover

 def option_parser():
     parser = OptionParser(textwrap.dedent(

@@ -33,6 +36,8 @@ def option_parser():
     parser.add_option('-v', '--verbose', default=False, action='store_true',
             help='Print the log to the console (stderr)')
     parser.add_option('-o', '--opf', help='Output the metadata in OPF format')
+    parser.add_option('-c', '--cover',
+            help='Specify a filename. The cover, if available, will be saved to it')
     parser.add_option('-d', '--timeout', default='30',
             help='Timeout in seconds. Default is 30')

@@ -57,14 +62,26 @@ def main(args=sys.argv):
     results = identify(log, abort, title=opts.title, authors=authors,
             identifiers=identifiers, timeout=int(opts.timeout))
-    log = buf.getvalue()

     if not results:
         print (log, file=sys.stderr)
         prints('No results found', file=sys.stderr)
         raise SystemExit(1)

     result = results[0]

+    cf = None
+    if opts.cover and results:
+        cover = download_cover(log, title=opts.title, authors=authors,
+                identifiers=result.identifiers, timeout=int(opts.timeout))
+        if cover is None:
+            prints('No cover found', file=sys.stderr)
+        else:
+            save_cover_data_to(cover[-1], opts.cover)
+            result.cover = cf = opts.cover
+
+    log = buf.getvalue()
+
     result = (metadata_to_opf(result) if opts.opf else
             unicode(result).encode('utf-8'))

@@ -72,6 +89,8 @@ def main(args=sys.argv):
     print (log, file=sys.stderr)
     print (result)
+    if not opts.opf:
+        prints('Cover :', cf)

     return 0
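
With the new -c/--cover option wired in, a typical run might look like this (the --title and --authors option names are assumed from the opts.title/authors usage above; only --cover is added by this commit):

    fetch-ebook-metadata --title 'Pride and Prejudice' \
        --authors 'Jane Austen' --cover cover.jpg

If a cover is found it is saved to the given file via save_cover_data_to(), and when --opf is not used a 'Cover :' line reports the saved path (or None).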

@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from Queue import Queue, Empty
+from threading import Thread, Event
+from io import BytesIO
+
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.base import msprefs, create_log
+from calibre.utils.magick.draw import Image, save_cover_data_to
+
+class Worker(Thread):
+
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+        Thread.__init__(self)
+        self.daemon = True
+
+        self.plugin = plugin
+        self.abort = abort
+        self.buf = BytesIO()
+        self.log = create_log(self.buf)
+        self.title, self.authors, self.identifiers = (title, authors,
+                identifiers)
+        self.timeout, self.rq = timeout, rq
+        self.time_spent = None
+
+    def run(self):
+        start_time = time.time()
+        if not self.abort.is_set():
+            try:
+                self.plugin.download_cover(self.log, self.rq, self.abort,
+                        title=self.title, authors=self.authors,
+                        identifiers=self.identifiers, timeout=self.timeout)
+            except:
+                self.log.exception('Failed to download cover from',
+                        self.plugin.name)
+        self.time_spent = time.time() - start_time
+
+def is_worker_alive(workers):
+    for w in workers:
+        if w.is_alive():
+            return True
+    return False
+
+def process_result(log, result):
+    plugin, data = result
+    try:
+        im = Image()
+        im.load(data)
+        im.trim(10)
+        width, height = im.size
+        fmt = im.format
+
+        if width < 50 or height < 50:
+            raise ValueError('Image too small')
+        data = save_cover_data_to(im, '/cover.jpg', return_data=True)
+    except:
+        log.exception('Invalid cover from', plugin.name)
+        return None
+
+    return (plugin, width, height, fmt, data)
+
+def run_download(log, results, abort,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Run the cover download, putting results into the queue :param:`results`.
+
+    Each result is a tuple of the form:
+
+        (plugin, width, height, fmt, bytes)
+
+    '''
+    plugins = list(metadata_plugins(['cover']))
+
+    rq = Queue()
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+            in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    wait_time = msprefs['wait_after_first_cover_result']
+    found_results = {}
+
+    while True:
+        time.sleep(0.1)
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+                if first_result_at is None:
+                    first_result_at = time.time()
+        except Empty:
+            pass
+
+        if not is_worker_alive(workers):
+            break
+
+        if first_result_at is not None and time.time() - first_result_at > wait_time:
+            log('Not waiting for any more results')
+            abort.set()
+
+        if abort.is_set():
+            break
+
+    while True:
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+        except Empty:
+            break
+
+    for w in workers:
+        wlog = w.buf.getvalue().strip()
+        log('\n'+'*'*30, w.plugin.name, 'Covers', '*'*30)
+        log('Request extra headers:', w.plugin.browser.addheaders)
+        if w.plugin in found_results:
+            result = found_results[w.plugin]
+            log('Downloaded cover:', '%dx%d'%(result[1], result[2]))
+        else:
+            log('Failed to download valid cover')
+        if w.time_spent is None:
+            log('Download aborted')
+        else:
+            log('Took', w.time_spent, 'seconds')
+        if wlog:
+            log(wlog)
+        log('\n'+'*'*80)
+
+def download_cover(log,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Synchronous cover download. Returns the "best" cover as per user
+    prefs/cover resolution.
+
+    Return cover is a tuple: (plugin, width, height, fmt, data)
+
+    Returns None if no cover is found.
+    '''
+    rq = Queue()
+    abort = Event()
+
+    run_download(log, rq, abort, title=title, authors=authors,
+            identifiers=identifiers, timeout=timeout)
+
+    results = []
+
+    while True:
+        try:
+            results.append(rq.get_nowait())
+        except Empty:
+            break
+
+    cp = msprefs['cover_priorities']
+
+    def keygen(result):
+        plugin, width, height, fmt, data = result
+        return (cp.get(plugin.name, 1), 1/(width*height))
+
+    results.sort(key=keygen)
+
+    return results[0] if results else None
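
For callers outside the CLI, the synchronous entry point added here can be used directly. A minimal sketch based only on the APIs in this diff (the title/author values are placeholders):

    import sys
    from calibre.ebooks.metadata.sources.base import create_log
    from calibre.ebooks.metadata.sources.covers import download_cover

    log = create_log(sys.stdout)  # any writable stream, per base.py above
    best = download_cover(log, title='Pride and Prejudice',
            authors=['Jane Austen'])
    if best is not None:
        plugin, width, height, fmt, data = best
        log('Best cover: %dx%d from'%(width, height), plugin.name)
        with open('cover.jpg', 'wb') as f:
            # process_result() has already re-encoded the data as JPEG
            f.write(data)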

@@ -145,15 +145,18 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         log.exception('Failed to parse rating')

     # Cover
-    mi.has_google_cover = len(extra.xpath(
-        '//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
+    mi.has_google_cover = None
+    for x in extra.xpath(
+            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        mi.has_google_cover = x.get('href')
+        break

     return mi
 # }}}

 class GoogleBooks(Source):

-    name = 'Google Books'
+    name = 'Google'
     description = _('Downloads metadata from Google Books')
     capabilities = frozenset(['identify', 'cover'])

@@ -213,7 +216,7 @@ class GoogleBooks(Source):
         results.sort(key=self.identify_results_keygen(
             title=title, authors=authors, identifiers=identifiers))
         for mi in results:
-            cached_url = self.cover_url_from_identifiers(mi.identifiers)
+            cached_url = self.get_cached_cover_url(mi.identifiers)
             if cached_url is not None:
                 break
         if cached_url is None:

@@ -223,9 +226,10 @@ class GoogleBooks(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)

@@ -254,9 +258,9 @@ class GoogleBooks(Source):
                 goog = ans.identifiers['google']
                 for isbn in getattr(ans, 'all_isbns', []):
                     self.cache_isbn_to_identifier(isbn, goog)
                 if ans.has_google_cover:
                     self.cache_identifier_to_cover_url(goog,
                             self.GOOGLE_COVER%goog)
                 self.clean_downloaded_metadata(ans)
                 result_queue.put(ans)
             except:
@@ -26,7 +26,7 @@ class OpenLibrary(Source):
         br = self.browser
         try:
             ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()
-            result_queue.put(ans)
+            result_queue.put((self, ans))
         except Exception as e:
             if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                 log.error('No cover for ISBN: %r found'%isbn)