Add cover downloading to the new fetch-ebook-metadata

Kovid Goyal 2011-04-05 23:16:59 -06:00
parent 2828ba5276
commit 6773cf71af
6 changed files with 224 additions and 15 deletions

@@ -279,7 +279,7 @@ class Worker(Thread): # Get details {{{

 class Amazon(Source):

-    name = 'Amazon'
+    name = 'Amazon Metadata'
     description = _('Downloads metadata from Amazon')
     capabilities = frozenset(['identify', 'cover'])

@@ -493,9 +493,10 @@ class Amazon(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)
 # }}}
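
The Amazon plugin (and the Google plugin below) now follow the same shape: look up a cached cover URL, fetch it, and put a (self, cover_data) tuple on the queue so the aggregator can tell which plugin produced each cover. A minimal sketch of what any other Source subclass would do under the new convention (ExampleSource is hypothetical; the helper methods are the ones the real plugins in this commit use):

    class ExampleSource(Source):  # hypothetical third-party plugin

        name = 'Example'
        capabilities = frozenset(['identify', 'cover'])

        def download_cover(self, log, result_queue, abort, title=None,
                authors=None, identifiers={}, timeout=30):
            cached_url = self.get_cached_cover_url(identifiers)
            if cached_url is None or abort.is_set():
                return
            log('Downloading cover from:', cached_url)
            try:
                cdata = self.browser.open_novisit(cached_url,
                        timeout=timeout).read()
                # New in this commit: queue (plugin, data), not bare data
                result_queue.put((self, cdata))
            except:
                log.exception('Failed to download cover from:', cached_url)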

@@ -22,6 +22,12 @@ msprefs.defaults['txt_comments'] = False
 msprefs.defaults['ignore_fields'] = []
 msprefs.defaults['max_tags'] = 20
 msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds
+msprefs.defaults['wait_after_first_cover_result'] = 60 # seconds
+
+# Google covers are often poor quality (scans/errors) but they have high
+# resolution, so they trump covers from better sources. So make sure they
+# are only used if no other covers are found.
+msprefs.defaults['cover_priorities'] = {'Google':2}

 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)

@@ -340,7 +346,8 @@ class Source(Plugin):
             title=None, authors=None, identifiers={}, timeout=30):
         '''
         Download a cover and put it into result_queue. The parameters all have
-        the same meaning as for :meth:`identify`.
+        the same meaning as for :meth:`identify`. Put (self, cover_data) into
+        result_queue.

         This method should use cached cover URLs for efficiency whenever
         possible. When cached data is not present, most plugins simply call
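
The effect of cover_priorities is easiest to see against the sort key that covers.py (added below) builds from it: a lower priority number always wins, and image resolution only breaks ties within the same priority level. A small illustrative sketch (the plugin names and image sizes are made up):

    cp = {'Google': 2}  # msprefs['cover_priorities']

    def keygen(result):
        name, width, height = result
        # Primary key: priority (default 1); secondary: inverse pixel
        # count, so bigger covers sort first within a priority level.
        return (cp.get(name, 1), 1.0/(width*height))

    results = [('Google', 1200, 1800), ('Amazon Metadata', 500, 700)]
    results.sort(key=keygen)
    print(results[0][0])  # 'Amazon Metadata', despite the smaller image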

@@ -13,10 +13,13 @@ from threading import Event

 from calibre import prints
 from calibre.utils.config import OptionParser
+from calibre.utils.magick.draw import save_cover_data_to
 from calibre.ebooks.metadata import string_to_authors
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ebooks.metadata.sources.base import create_log
 from calibre.ebooks.metadata.sources.identify import identify
+from calibre.ebooks.metadata.sources.covers import download_cover

 def option_parser():
     parser = OptionParser(textwrap.dedent(

@@ -33,6 +36,8 @@ def option_parser():
     parser.add_option('-v', '--verbose', default=False, action='store_true',
             help='Print the log to the console (stderr)')
     parser.add_option('-o', '--opf', help='Output the metadata in OPF format')
+    parser.add_option('-c', '--cover',
+            help='Specify a filename. The cover, if available, will be saved to it')
     parser.add_option('-d', '--timeout', default='30',
             help='Timeout in seconds. Default is 30')

@@ -57,14 +62,26 @@ def main(args=sys.argv):
     results = identify(log, abort, title=opts.title, authors=authors,
             identifiers=identifiers, timeout=int(opts.timeout))
-    log = buf.getvalue()

     if not results:
         print (log, file=sys.stderr)
         prints('No results found', file=sys.stderr)
         raise SystemExit(1)

     result = results[0]

+    cf = None
+    if opts.cover and results:
+        cover = download_cover(log, title=opts.title, authors=authors,
+                identifiers=result.identifiers, timeout=int(opts.timeout))
+        if cover is None:
+            prints('No cover found', file=sys.stderr)
+        else:
+            save_cover_data_to(cover[-1], opts.cover)
+            result.cover = cf = opts.cover
+
+    log = buf.getvalue()
+
     result = (metadata_to_opf(result) if opts.opf else
             unicode(result).encode('utf-8'))

@@ -72,6 +89,8 @@ def main(args=sys.argv):
     print (log, file=sys.stderr)
     print (result)
+    if not opts.opf:
+        prints('Cover :', cf)

     return 0
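
With the new -c/--cover option wired in, a typical run might look like this (the --title and --authors option names are assumed from the opts.title/authors usage above; only --cover is added by this commit):

    fetch-ebook-metadata --title 'Pride and Prejudice' \
        --authors 'Jane Austen' --cover cover.jpg

If a cover is found it is saved to the given file via save_cover_data_to(), and when --opf is not used a 'Cover :' line reports the saved path (or None).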

@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import time
+from Queue import Queue, Empty
+from threading import Thread, Event
+from io import BytesIO
+
+from calibre.customize.ui import metadata_plugins
+from calibre.ebooks.metadata.sources.base import msprefs, create_log
+from calibre.utils.magick.draw import Image, save_cover_data_to
+
+class Worker(Thread):
+
+    def __init__(self, plugin, abort, title, authors, identifiers, timeout, rq):
+        Thread.__init__(self)
+        self.daemon = True
+
+        self.plugin = plugin
+        self.abort = abort
+        self.buf = BytesIO()
+        self.log = create_log(self.buf)
+        self.title, self.authors, self.identifiers = (title, authors,
+                identifiers)
+        self.timeout, self.rq = timeout, rq
+        self.time_spent = None
+
+    def run(self):
+        start_time = time.time()
+        if not self.abort.is_set():
+            try:
+                self.plugin.download_cover(self.log, self.rq, self.abort,
+                        title=self.title, authors=self.authors,
+                        identifiers=self.identifiers, timeout=self.timeout)
+            except:
+                self.log.exception('Failed to download cover from',
+                        self.plugin.name)
+        self.time_spent = time.time() - start_time
+
+def is_worker_alive(workers):
+    for w in workers:
+        if w.is_alive():
+            return True
+    return False
+
+def process_result(log, result):
+    plugin, data = result
+    try:
+        im = Image()
+        im.load(data)
+        im.trim(10)
+        width, height = im.size
+        fmt = im.format
+
+        if width < 50 or height < 50:
+            raise ValueError('Image too small')
+        data = save_cover_data_to(im, '/cover.jpg', return_data=True)
+    except:
+        log.exception('Invalid cover from', plugin.name)
+        return None
+
+    return (plugin, width, height, fmt, data)
+
+def run_download(log, results, abort,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Run the cover download, putting results into the queue :param:`results`.
+
+    Each result is a tuple of the form:
+
+        (plugin, width, height, fmt, bytes)
+
+    '''
+    plugins = list(metadata_plugins(['cover']))
+
+    rq = Queue()
+    workers = [Worker(p, abort, title, authors, identifiers, timeout, rq) for p
+            in plugins]
+    for w in workers:
+        w.start()
+
+    first_result_at = None
+    wait_time = msprefs['wait_after_first_cover_result']
+    found_results = {}
+
+    while True:
+        time.sleep(0.1)
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+                if first_result_at is None:
+                    first_result_at = time.time()
+        except Empty:
+            pass
+
+        if not is_worker_alive(workers):
+            break
+
+        if first_result_at is not None and time.time() - first_result_at > wait_time:
+            log('Not waiting for any more results')
+            abort.set()
+
+        if abort.is_set():
+            break
+
+    while True:
+        try:
+            x = rq.get_nowait()
+            result = process_result(log, x)
+            if result is not None:
+                results.put(result)
+                found_results[result[0]] = result
+        except Empty:
+            break
+
+    for w in workers:
+        wlog = w.buf.getvalue().strip()
+        log('\n'+'*'*30, w.plugin.name, 'Covers', '*'*30)
+        log('Request extra headers:', w.plugin.browser.addheaders)
+        if w.plugin in found_results:
+            result = found_results[w.plugin]
+            log('Downloaded cover:', '%dx%d'%(result[1], result[2]))
+        else:
+            log('Failed to download valid cover')
+        if w.time_spent is None:
+            log('Download aborted')
+        else:
+            log('Took', w.time_spent, 'seconds')
+        if wlog:
+            log(wlog)
+        log('\n'+'*'*80)
+
+def download_cover(log,
+        title=None, authors=None, identifiers={}, timeout=30):
+    '''
+    Synchronous cover download. Returns the "best" cover as per user
+    prefs/cover resolution.
+
+    Return cover is a tuple: (plugin, width, height, fmt, data)
+
+    Returns None if no cover is found.
+    '''
+    rq = Queue()
+    abort = Event()
+
+    run_download(log, rq, abort, title=title, authors=authors,
+            identifiers=identifiers, timeout=timeout)
+
+    results = []
+
+    while True:
+        try:
+            results.append(rq.get_nowait())
+        except Empty:
+            break
+
+    cp = msprefs['cover_priorities']
+
+    def keygen(result):
+        plugin, width, height, fmt, data = result
+        return (cp.get(plugin.name, 1), 1/(width*height))
+
+    results.sort(key=keygen)
+
+    return results[0] if results else None
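
For callers outside the CLI, the synchronous entry point added here can be used directly. A minimal sketch based only on the APIs in this diff (the title/author values are placeholders):

    import sys
    from calibre.ebooks.metadata.sources.base import create_log
    from calibre.ebooks.metadata.sources.covers import download_cover

    log = create_log(sys.stdout)  # any writable stream, per base.py above
    best = download_cover(log, title='Pride and Prejudice',
            authors=['Jane Austen'])
    if best is not None:
        plugin, width, height, fmt, data = best
        log('Best cover: %dx%d from'%(width, height), plugin.name)
        with open('cover.jpg', 'wb') as f:
            # process_result() has already re-encoded the data as JPEG
            f.write(data)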

@@ -145,15 +145,18 @@ def to_metadata(browser, log, entry_, timeout): # {{{
         log.exception('Failed to parse rating')

     # Cover
-    mi.has_google_cover = len(extra.xpath(
-        '//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
+    mi.has_google_cover = None
+    for x in extra.xpath(
+            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
+        mi.has_google_cover = x.get('href')
+        break

     return mi
 # }}}

 class GoogleBooks(Source):

-    name = 'Google Books'
+    name = 'Google'
     description = _('Downloads metadata from Google Books')
     capabilities = frozenset(['identify', 'cover'])

@@ -213,7 +216,7 @@ class GoogleBooks(Source):
         results.sort(key=self.identify_results_keygen(
             title=title, authors=authors, identifiers=identifiers))
         for mi in results:
-            cached_url = self.cover_url_from_identifiers(mi.identifiers)
+            cached_url = self.get_cached_cover_url(mi.identifiers)
             if cached_url is not None:
                 break
         if cached_url is None:

@@ -223,9 +226,10 @@ class GoogleBooks(Source):
         if abort.is_set():
             return
         br = self.browser
+        log('Downloading cover from:', cached_url)
         try:
             cdata = br.open_novisit(cached_url, timeout=timeout).read()
-            result_queue.put(cdata)
+            result_queue.put((self, cdata))
         except:
             log.exception('Failed to download cover from:', cached_url)

@@ -254,9 +258,9 @@ class GoogleBooks(Source):
                 goog = ans.identifiers['google']
                 for isbn in getattr(ans, 'all_isbns', []):
                     self.cache_isbn_to_identifier(isbn, goog)
                 if ans.has_google_cover:
                     self.cache_identifier_to_cover_url(goog,
                             self.GOOGLE_COVER%goog)
                 self.clean_downloaded_metadata(ans)
                 result_queue.put(ans)
             except:
@@ -26,7 +26,7 @@ class OpenLibrary(Source):
         br = self.browser
         try:
             ans = br.open_novisit(self.OPENLIBRARY%isbn, timeout=timeout).read()
-            result_queue.put(ans)
+            result_queue.put((self, ans))
         except Exception as e:
             if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                 log.error('No cover for ISBN: %r found'%isbn)