Add a cover download framework (sources/covers.py) and a --cover option to
the metadata sources command line tool.

Every plugin's download_cover() now puts a (plugin, cover_data) tuple into
the result queue instead of the bare image data, so that covers can be
ranked by source priority (msprefs 'cover_priorities') and by resolution.

Review notes:
  * covers.py, run_download(): the timestamp of the first valid cover was
    guarded by "is not None", which can never be true for a variable that
    starts out as None, so the wait_after_first_cover_result cutoff never
    fired. The guard is now "is None".
  * cli.py, main(): "log" is rebound to the captured text only after the
    cover download, so the "No results found" branch printed the logger
    object. It now prints buf.getvalue() directly.
  * Line breaks and indentation were reconstructed from a whitespace
    collapsed copy of this patch; context-line indentation is a best
    effort and may need --ignore-whitespace to apply. The "index" hashes
    for cli.py and covers.py are those of the original patch.

diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index d1c8f24da6..d48f502c29 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -279,7 +279,7 @@ class Worker(Thread): # Get details {{{
 
 class Amazon(Source):
 
-    name = 'Amazon'
+    name = 'Amazon Metadata'
     description = _('Downloads metadata from Amazon')
     capabilities = frozenset(['identify', 'cover'])
 
@@ -493,9 +493,10 @@ class Amazon(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
     # }}}
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 30b804a76e..33232f25ab 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -22,6 +22,12 @@ msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 20
 msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
+msprefs.defaults['wait_after_first_cover_result'] = 60 # seconds
+
+# Google covers are often poor quality (scans/errors) but they have high
+# resolution, so they trump covers from better sources. So make sure they
+# are only used if no other covers are found.
+msprefs.defaults['cover_priorities'] = {'Google':2}
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -340,7 +346,8 @@ class Source(Plugin):
             title=None, authors=None, identifiers={}, timeout=30):
         '''
         Download a cover and put it into result_queue. The parameters all have
-        the same meaning as for :meth:`identify`.
+        the same meaning as for :meth:`identify`. Put (self, cover_data) into
+        result_queue.
 
         This method should use cached cover URLs for efficiency whenever
         possible. When cached data is not present, most plugins simply call
diff --git a/src/calibre/ebooks/metadata/sources/cli.py b/src/calibre/ebooks/metadata/sources/cli.py
index d2cc1648f9..b39da07d53 100644
--- a/src/calibre/ebooks/metadata/sources/cli.py
+++ b/src/calibre/ebooks/metadata/sources/cli.py
@@ -13,10 +13,13 @@ from threading import Event
 
 from calibre import prints
 from calibre.utils.config import OptionParser
+from calibre.utils.magick.draw import save_cover_data_to
 from calibre.ebooks.metadata import string_to_authors
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ebooks.metadata.sources.base import create_log
 from calibre.ebooks.metadata.sources.identify import identify
+from calibre.ebooks.metadata.sources.covers import download_cover
+
 
 def option_parser():
     parser = OptionParser(textwrap.dedent(
@@ -33,6 +36,8 @@ def option_parser():
     parser.add_option('-v', '--verbose', default=False, action='store_true',
             help='Print the log to the console (stderr)')
     parser.add_option('-o', '--opf', help='Output the metadata in OPF format')
+    parser.add_option('-c', '--cover',
+            help='Specify a filename. The cover, if available, will be saved to it')
     parser.add_option('-d', '--timeout', default='30',
             help='Timeout in seconds. Default is 30')
 
@@ -57,14 +62,26 @@ def main(args=sys.argv):
     results = identify(log, abort, title=opts.title, authors=authors,
             identifiers=identifiers, timeout=int(opts.timeout))
 
-    log = buf.getvalue()
-
     if not results:
-        print (log, file=sys.stderr)
+        print (buf.getvalue(), file=sys.stderr)
         prints('No results found', file=sys.stderr)
         raise SystemExit(1)
-
     result = results[0]
+
+    cf = None
+    if opts.cover and results:
+        cover = download_cover(log, title=opts.title, authors=authors,
+                identifiers=result.identifiers, timeout=int(opts.timeout))
+        if cover is None:
+            prints('No cover found', file=sys.stderr)
+        else:
+            save_cover_data_to(cover[-1], opts.cover)
+            result.cover = cf = opts.cover
+
+
+    log = buf.getvalue()
+
+
     result = (metadata_to_opf(result) if opts.opf else
             unicode(result).encode('utf-8'))
 
@@ -72,6 +89,8 @@ def main(args=sys.argv):
         print (log, file=sys.stderr)
 
     print (result)
+    if not opts.opf:
+        prints('Cover :', cf)
 
     return 0
 
diff --git a/src/calibre/ebooks/metadata/sources/covers.py b/src/calibre/ebooks/metadata/sources/covers.py
new file mode 100644
index 0000000000..46b278397c
--- /dev/null
+++ b/src/calibre/ebooks/metadata/sources/covers.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import time
+from Queue import Queue, Empty
+from threading import Thread, Event
+from io import BytesIO
+
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.base import msprefs, create_log
+from calibre.utils.magick.draw import Image, save_cover_data_to
+
+class Worker(Thread):
+    '''
+    Daemon thread that runs one plugin's download_cover(), logging to a
+    private buffer so that each plugin's output can be reported in one piece.
+    '''
+
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+        Thread.__init__(self)
+        self.daemon = True
+
+        self.plugin = plugin
+        self.abort = abort
+        self.buf = BytesIO()
+        self.log = create_log(self.buf)
+        self.title, self.authors, self.identifiers = (title, authors,
+                identifiers)
+        self.timeout, self.rq = timeout, rq
+        self.time_spent = None
+
+    def run(self):
+        start_time = time.time()
+        if not self.abort.is_set():
+            try:
+                self.plugin.download_cover(self.log, self.rq, self.abort,
+                        title=self.title, authors=self.authors,
+                        identifiers=self.identifiers, timeout=self.timeout)
+            except:
+                self.log.exception('Failed to download cover from',
+                        self.plugin.name)
+        self.time_spent = time.time() - start_time
+
+def is_worker_alive(workers):
+    ''' Return True if any thread in workers is still running. '''
+    for w in workers:
+        if w.is_alive():
+            return True
+    return False
+
+def process_result(log, result):
+    '''
+    Validate and normalize one (plugin, raw_data) download.
+
+    Returns (plugin, width, height, fmt, data), or None (after logging the
+    error) if the image cannot be processed or either side is under 50px.
+    '''
+    plugin, data = result
+    try:
+        im = Image()
+        im.load(data)
+        im.trim(10)
+        width, height = im.size
+        fmt = im.format
+
+        if width < 50 or height < 50:
+            raise ValueError('Image too small')
+        data = save_cover_data_to(im, '/cover.jpg', return_data=True)
+    except:
+        log.exception('Invalid cover from', plugin.name)
+        return None
+    return (plugin, width, height, fmt, data)
+
+def run_download(log, results, abort,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Run the cover download, putting results into the queue :param:`results`.
+
+    Each result is a tuple of the form:
+
+        (plugin, width, height, fmt, bytes)
+
+    '''
+    plugins = list(metadata_plugins(['cover']))
+
+    rq = Queue()
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+            in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    wait_time = msprefs['wait_after_first_cover_result']
+    found_results = {}
+
+    while True:
+        time.sleep(0.1)
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+                # Start the wait_after_first_cover_result clock when the
+                # first valid cover arrives; later covers must not reset it.
+                if first_result_at is None:
+                    first_result_at = time.time()
+        except Empty:
+            pass
+
+        if not is_worker_alive(workers):
+            break
+
+        if first_result_at is not None and time.time() - first_result_at > wait_time:
+            log('Not waiting for any more results')
+            abort.set()
+
+        if abort.is_set():
+            break
+
+    while True:
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+        except Empty:
+            break
+
+    for w in workers:
+        wlog = w.buf.getvalue().strip()
+        log('\n'+'*'*30, w.plugin.name, 'Covers', '*'*30)
+        log('Request extra headers:', w.plugin.browser.addheaders)
+        if w.plugin in found_results:
+            result = found_results[w.plugin]
+            log('Downloaded cover:', '%dx%d'%(result[1], result[2]))
+        else:
+            log('Failed to download valid cover')
+        if w.time_spent is None:
+            log('Download aborted')
+        else:
+            log('Took', w.time_spent, 'seconds')
+        if wlog:
+            log(wlog)
+        log('\n'+'*'*80)
+
+
+def download_cover(log,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Synchronous cover download. Returns the "best" cover as per user
+    prefs/cover resolution.
+
+    Return cover is a tuple: (plugin, width, height, fmt, data)
+
+    Returns None if no cover is found.
+    '''
+    rq = Queue()
+    abort = Event()
+
+    run_download(log, rq, abort, title=title, authors=authors,
+            identifiers=identifiers, timeout=timeout)
+
+    results = []
+
+    while True:
+        try:
+            results.append(rq.get_nowait())
+        except Empty:
+            break
+
+    cp = msprefs['cover_priorities']
+
+    def keygen(result):
+        plugin, width, height, fmt, data = result
+        return (cp.get(plugin.name, 1), 1/(width*height))
+
+    results.sort(key=keygen)
+
+    return results[0] if results else None
+
+
+
+
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index c4e2f9fe24..47cfb823bb 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -145,15 +145,18 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         log.exception('Failed to parse rating')
 
     # Cover
-    mi.has_google_cover = len(extra.xpath(
-        '//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
+    mi.has_google_cover = None
+    for x in extra.xpath(
+            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        mi.has_google_cover = x.get('href')
+        break
 
     return mi
 # }}}
 
 class GoogleBooks(Source):
 
-    name = 'Google Books'
+    name = 'Google'
     description = _('Downloads metadata from Google Books')
     capabilities = frozenset(['identify', 'cover'])
 
@@ -213,7 +216,7 @@ class GoogleBooks(Source):
             results.sort(key=self.identify_results_keygen(
                 title=title, authors=authors, identifiers=identifiers))
             for mi in results:
-                cached_url = self.cover_url_from_identifiers(mi.identifiers)
+                cached_url = self.get_cached_cover_url(mi.identifiers)
                 if cached_url is not None:
                     break
         if cached_url is None:
@@ -223,9 +226,10 @@ class GoogleBooks(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
 
@@ -254,9 +258,9 @@ class GoogleBooks(Source):
                     goog = ans.identifiers['google']
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn, goog)
-                        if ans.has_google_cover:
-                            self.cache_identifier_to_cover_url(goog,
-                                self.GOOGLE_COVER%goog)
+                    if ans.has_google_cover:
+                        self.cache_identifier_to_cover_url(goog,
+                            self.GOOGLE_COVER%goog)
                     self.clean_downloaded_metadata(ans)
                     result_queue.put(ans)
             except:
diff --git a/src/calibre/ebooks/metadata/sources/openlibrary.py b/src/calibre/ebooks/metadata/sources/openlibrary.py
index 1fcb33e35f..19b8747265 100644
--- a/src/calibre/ebooks/metadata/sources/openlibrary.py
+++ b/src/calibre/ebooks/metadata/sources/openlibrary.py
@@ -26,7 +26,7 @@ class OpenLibrary(Source):
         br = self.browser
         try:
             ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()
-            result_queue.put(ans)
+            result_queue.put((self, ans))
         except Exception as e:
             if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                 log.error('No cover for ISBN: %r found'%isbn)