Add cover downloading to the new fetch-ebook-metadata

2026-02-07 11:33:30 -05:00 · 2011-04-05 23:16:59 -06:00 · 2011-04-05 23:16:59 -06:00 · 6773cf71af
commit 6773cf71af
parent 2828ba5276
6 changed files with 224 additions and 15 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@ -279,7 +279,7 @@ class Worker(Thread): # Get details {{{

 class Amazon(Source):

-    name = 'Amazon'
+    name = 'Amazon Metadata'
    description = _('Downloads metadata from Amazon')

    capabilities = frozenset(['identify', 'cover'])
@ -493,9 +493,10 @@ class Amazon(Source):
        if abort.is_set():
            return
        br = self.browser
+        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
    # }}}
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -22,6 +22,12 @@ msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 20
 msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
+msprefs.defaults['wait_after_first_cover_result'] = 60 # seconds
+
+# Google covers are often poor quality (scans/errors) but they have high
+# resolution, so they trump covers from better sources. So make sure they
+# are only used if no other covers are found.
+msprefs.defaults['cover_priorities'] = {'Google':2}

 def create_log(ostream=None):
    log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@ -340,7 +346,8 @@ class Source(Plugin):
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download a cover and put it into result_queue. The parameters all have
-        the same meaning as for :meth:`identify`.
+        the same meaning as for :meth:`identify`. Put (self, cover_data) into
+        result_queue.

        This method should use cached cover URLs for efficiency whenever
        possible. When cached data is not present, most plugins simply call
--- a/src/calibre/ebooks/metadata/sources/cli.py
+++ b/src/calibre/ebooks/metadata/sources/cli.py
@ -13,10 +13,13 @@ from threading import Event

 from calibre import prints
 from calibre.utils.config import OptionParser
+from calibre.utils.magick.draw import save_cover_data_to
 from calibre.ebooks.metadata import string_to_authors
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ebooks.metadata.sources.base import create_log
 from calibre.ebooks.metadata.sources.identify import identify
+from calibre.ebooks.metadata.sources.covers import download_cover
+

 def option_parser():
    parser = OptionParser(textwrap.dedent(
@ -33,6 +36,8 @@ def option_parser():
    parser.add_option('-v', '--verbose', default=False, action='store_true',
                      help='Print the log to the console (stderr)')
    parser.add_option('-o', '--opf', help='Output the metadata in OPF format')
+    parser.add_option('-c', '--cover',
+            help='Specify a filename. The cover, if available, will be saved to it')
    parser.add_option('-d', '--timeout', default='30',
            help='Timeout in seconds. Default is 30')

@ -57,14 +62,26 @@ def main(args=sys.argv):
    results = identify(log, abort, title=opts.title, authors=authors,
            identifiers=identifiers, timeout=int(opts.timeout))

-    log = buf.getvalue()
-
    if not results:
        print (log, file=sys.stderr)
        prints('No results found', file=sys.stderr)
        raise SystemExit(1)
-
    result = results[0]
+
+    cf = None
+    if opts.cover and results:
+        cover = download_cover(log, title=opts.title, authors=authors,
+                identifiers=result.identifiers, timeout=int(opts.timeout))
+        if cover is None:
+            prints('No cover found', file=sys.stderr)
+        else:
+            save_cover_data_to(cover[-1], opts.cover)
+            result.cover = cf = opts.cover
+
+
+    log = buf.getvalue()
+
+
    result = (metadata_to_opf(result) if opts.opf else
                    unicode(result).encode('utf-8'))

@ -72,6 +89,8 @@ def main(args=sys.argv):
        print (log, file=sys.stderr)

    print (result)
+    if not opts.opf:
+        prints('Cover               :', cf)

    return 0

--- a/src/calibre/ebooks/metadata/sources/covers.py
+++ b/src/calibre/ebooks/metadata/sources/covers.py
@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from Queue import Queue, Empty
+from threading import Thread, Event
+from io import BytesIO
+
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.base import msprefs, create_log
+from calibre.utils.magick.draw import Image, save_cover_data_to
+
+class Worker(Thread):
+    '''
+    Daemon thread that asks a single plugin to download a cover.
+
+    The plugin is expected to put its result into ``rq``. Everything the
+    plugin logs is captured in ``self.buf`` so the caller can report it
+    later. ``time_spent`` stays ``None`` until :meth:`run` has finished.
+    '''
+
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+        Thread.__init__(self)
+        self.daemon = True
+
+        # What to download and where to deliver it
+        self.plugin, self.rq = plugin, rq
+        self.title = title
+        self.authors = authors
+        self.identifiers = identifiers
+        self.timeout = timeout
+        self.abort = abort
+
+        # Private, in-memory log for this worker only
+        self.buf = BytesIO()
+        self.log = create_log(self.buf)
+
+        # Wall clock duration of run(), filled in when run() completes
+        self.time_spent = None
+
+    def run(self):
+        began = time.time()
+        aborted = self.abort.is_set()
+        if not aborted:
+            try:
+                self.plugin.download_cover(
+                    self.log, self.rq, self.abort,
+                    title=self.title,
+                    authors=self.authors,
+                    identifiers=self.identifiers,
+                    timeout=self.timeout)
+            except:
+                # A failing plugin must never take the whole download down;
+                # record the traceback in this worker's own log instead.
+                self.log.exception('Failed to download cover from',
+                        self.plugin.name)
+        self.time_spent = time.time() - began
+
+def is_worker_alive(workers):
+    ''' Return True if at least one of the given threads is still running. '''
+    return any(worker.is_alive() for worker in workers)
+
+def process_result(log, result):
+    '''
+    Validate and normalize one raw plugin result.
+
+    :param result: A ``(plugin, raw_image_data)`` tuple as put into the
+                   queue by a plugin's download_cover method.
+    :return: ``(plugin, width, height, fmt, data)`` where data is the
+             re-encoded cover, or ``None`` if the image could not be loaded
+             or is smaller than 50 pixels in either dimension (the failure
+             is logged).
+    '''
+    plugin, raw = result
+    try:
+        img = Image()
+        img.load(raw)
+        img.trim(10)
+        width, height = img.size
+        fmt = img.format
+
+        if not (width >= 50 and height >= 50):
+            raise ValueError('Image too small')
+        # NOTE(review): presumably only the extension of the path matters
+        # when return_data=True and nothing is written to disk -- confirm
+        # against save_cover_data_to.
+        cover = save_cover_data_to(img, '/cover.jpg', return_data=True)
+    except:
+        log.exception('Invalid cover from', plugin.name)
+        return None
+    else:
+        return (plugin, width, height, fmt, cover)
+
+def run_download(log, results, abort,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Run the cover download, putting results into the queue :param:`results`.
+
+    One :class:`Worker` thread is started for every plugin returned by
+    ``metadata_plugins(['cover'])``. Raw plugin output is validated with
+    :func:`process_result` before being put into :param:`results`. Once the
+    first valid cover has arrived, the remaining plugins get at most
+    ``msprefs['wait_after_first_cover_result']`` seconds to finish, after
+    which :param:`abort` is set and the function stops waiting. A per plugin
+    summary (including each worker's private log) is written to
+    :param:`log` before returning.
+
+    :param log: Log object that receives the summary and validation errors
+    :param results: Queue that receives the processed results
+    :param abort: Event, when set (by the caller or by the timeout described
+                  above) the function stops waiting for more covers
+    :param title, authors, identifiers, timeout: Passed through unchanged to
+                  every plugin's download_cover method
+
+    Each result is a tuple of the form:
+
+        (plugin, width, height, fmt, bytes)
+
+    '''
+    plugins = list(metadata_plugins(['cover']))
+
+    rq = Queue()
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+            in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    wait_time = msprefs['wait_after_first_cover_result']
+    found_results = {}
+
+    def collect_one():
+        # Move one raw result from rq into results. Raises Empty when rq has
+        # nothing pending, returns True only if the result was a valid cover.
+        result = process_result(log, rq.get_nowait())
+        if result is None:
+            return False
+        results.put(result)
+        found_results[result[0]] = result
+        return True
+
+    while True:
+        time.sleep(0.1)
+        try:
+            # Start the countdown when the *first* valid cover arrives. The
+            # check must be "is None": testing "is not None" here would mean
+            # the timestamp is never set and the timeout below never fires.
+            if collect_one() and first_result_at is None:
+                first_result_at = time.time()
+        except Empty:
+            pass
+
+        if not is_worker_alive(workers):
+            break
+
+        if first_result_at is not None and time.time() - first_result_at > wait_time:
+            log('Not waiting for any more results')
+            abort.set()
+
+        if abort.is_set():
+            break
+
+    # Pick up anything that was queued after the last poll above
+    while True:
+        try:
+            collect_one()
+        except Empty:
+            break
+
+    for w in workers:
+        wlog = w.buf.getvalue().strip()
+        log('\n'+'*'*30, w.plugin.name, 'Covers', '*'*30)
+        log('Request extra headers:', w.plugin.browser.addheaders)
+        if w.plugin in found_results:
+            result = found_results[w.plugin]
+            log('Downloaded cover:', '%dx%d'%(result[1], result[2]))
+        else:
+            log('Failed to download valid cover')
+        # time_spent is only filled in when the worker's run() has completed
+        if w.time_spent is None:
+            log('Download aborted')
+        else:
+            log('Took', w.time_spent, 'seconds')
+        if wlog:
+            log(wlog)
+        log('\n'+'*'*80)
+
+
+def download_cover(log,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Blocking cover download: runs :func:`run_download` to completion and
+    picks the "best" of the covers it produced.
+
+    Covers are ranked first by the user preference
+    ``msprefs['cover_priorities']`` (keyed on plugin name, plugins not listed
+    get priority 1, lower is better) and then by area, larger being better.
+
+    :return: A tuple ``(plugin, width, height, fmt, data)`` or ``None`` if no
+             cover was found.
+    '''
+    queue = Queue()
+    run_download(log, queue, Event(), title=title, authors=authors,
+            identifiers=identifiers, timeout=timeout)
+
+    # Empty the queue into a plain list
+    covers = []
+    try:
+        while True:
+            covers.append(queue.get_nowait())
+    except Empty:
+        pass
+
+    priorities = msprefs['cover_priorities']
+
+    def rank(cover):
+        plugin, width, height, fmt, data = cover
+        return (priorities.get(plugin.name, 1), 1/(width*height))
+
+    if not covers:
+        return None
+    # min() returns the first of several equally ranked covers, exactly
+    # like taking element 0 after a stable sort
+    return min(covers, key=rank)
+
+
+
+
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -145,15 +145,18 @@ def to_metadata(browser, log, entry_, timeout): # {{{
            log.exception('Failed to parse rating')

    # Cover
-    mi.has_google_cover = len(extra.xpath(
-        '//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
+    mi.has_google_cover = None
+    for x in extra.xpath(
+            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        mi.has_google_cover = x.get('href')
+        break

    return mi
 # }}}

 class GoogleBooks(Source):

-    name = 'Google Books'
+    name = 'Google'
    description = _('Downloads metadata from Google Books')

    capabilities = frozenset(['identify', 'cover'])
@ -213,7 +216,7 @@ class GoogleBooks(Source):
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
-                cached_url = self.cover_url_from_identifiers(mi.identifiers)
+                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
@ -223,9 +226,10 @@ class GoogleBooks(Source):
        if abort.is_set():
            return
        br = self.browser
+        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

@ -254,9 +258,9 @@ class GoogleBooks(Source):
                    goog = ans.identifiers['google']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, goog)
-                        if ans.has_google_cover:
-                            self.cache_identifier_to_cover_url(goog,
-                                    self.GOOGLE_COVER%goog)
+                    if ans.has_google_cover:
+                        self.cache_identifier_to_cover_url(goog,
+                                self.GOOGLE_COVER%goog)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
--- a/src/calibre/ebooks/metadata/sources/openlibrary.py
+++ b/src/calibre/ebooks/metadata/sources/openlibrary.py
@ -26,7 +26,7 @@ class OpenLibrary(Source):
        br = self.browser
        try:
            ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()
-            result_queue.put(ans)
+            result_queue.put((self, ans))
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                log.error('No cover for ISBN: %r found'%isbn)