Run bulk metadata downloads in a separate process to work around third party metadata download plugins that leak memory. This also removes the need to batch metadata downloads into groups of 100 books at a time.

This commit is contained in:
Kovid Goyal 2012-04-02 10:31:40 +05:30
parent 310c5c17d2
commit 927b7471b7
2 changed files with 178 additions and 88 deletions
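
The fix works through process isolation: the leak-prone identify() and cover download calls run in a short-lived forked worker with its own address space, so whatever a third party plugin leaks is reclaimed by the OS when the worker exits. A minimal sketch of the pattern, using the same fork_job API the diffs below adopt (the function and variable names here are illustrative, not part of the commit):

from calibre.utils.ipc.simple_worker import fork_job, WorkerError

def download_in_child(opf_map, tdir, abort, heartbeat):
    # opf_map: {book_id: OPF string}; tdir: the directory the child uses
    # as its cwd and drops its result files into (both names are local
    # to this sketch; the real call site appears in the diffs below).
    try:
        # Runs calibre.ebooks.metadata.sources.worker.main() in a fresh
        # process; only picklable data crosses the process boundary.
        ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
                (True, True, opf_map, None),  # do_identify, covers, metadata, ensure_fields
                cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
    except WorkerError as e:
        # e.orig_tb carries the traceback raised inside the child
        raise Exception('Failed to download metadata:\n' + (e.orig_tb or ''))
    return ret['result']  # (failed_ids, failed_covers, all_failed)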

View File: src/calibre/ebooks/metadata/sources/worker.py (new file)

@@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
from threading import Event
from io import BytesIO

from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.utils.logging import GUILog
from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF

def merge_result(oldmi, newmi, ensure_fields=None):
    dummy = Metadata(_('Unknown'))
    for f in msprefs['ignore_fields']:
        if ':' in f or (ensure_fields and f in ensure_fields):
            continue
        setattr(newmi, f, getattr(dummy, f))

    fields = set()
    for plugin in metadata_plugins(['identify']):
        fields |= plugin.touched_fields

    def is_equal(x, y):
        if hasattr(x, 'tzinfo'):
            x = as_utc(x)
        if hasattr(y, 'tzinfo'):
            y = as_utc(y)
        return x == y

    for f in fields:
        # Optimize so that set_metadata does not have to do extra work later
        if not f.startswith('identifier:'):
            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
                    getattr(oldmi, f))):
                setattr(newmi, f, getattr(dummy, f))

    return newmi

def main(do_identify, covers, metadata, ensure_fields):
    failed_ids = set()
    failed_covers = set()
    all_failed = True
    log = GUILog()

    for book_id, mi in metadata.iteritems():
        mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
                populate_spine=False).to_book_metadata()
        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
        cdata = None
        log.clear()

        if do_identify:
            results = []
            try:
                results = identify(log, Event(), title=title, authors=authors,
                        identifiers=identifiers)
            except:
                pass
            if results:
                all_failed = False
                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
                identifiers = mi.identifiers
                if not mi.is_null('rating'):
                    # set_metadata expects a rating out of 10
                    mi.rating *= 2
                with open('%d.mi'%book_id, 'wb') as f:
                    f.write(metadata_to_opf(mi, default_lang='und'))
            else:
                log.error('Failed to download metadata for', title)
                failed_ids.add(book_id)

        if covers:
            cdata = download_cover(log, title=title, authors=authors,
                    identifiers=identifiers)
            if cdata is None:
                failed_covers.add(book_id)
            else:
                with open('%d.cover'%book_id, 'wb') as f:
                    f.write(cdata[-1])
                all_failed = False

        with open('%d.log'%book_id, 'wb') as f:
            f.write(log.html.encode('utf-8'))

    return failed_ids, failed_covers, all_failed
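
Note that results travel back over the filesystem rather than as pickled Metadata objects: for each book the worker writes '<book_id>.mi' (an OPF document), optionally '<book_id>.cover', and '<book_id>.log' into its working directory, and returns only small sets of ids. A sketch of how the parent side can load one result back (the helper name is hypothetical; the OPF round-trip mirrors the one main() uses above):

import os
from io import BytesIO
from calibre.ebooks.metadata.opf2 import OPF

def read_downloaded_metadata(tdir, book_id):
    # Hypothetical helper: load the Metadata object that worker.main()
    # serialized as '<book_id>.mi' into its working directory, tdir
    path = os.path.join(tdir, '%d.mi' % book_id)
    if not os.path.exists(path):
        return None  # download failed; details are in '<book_id>.log'
    with open(path, 'rb') as f:
        return OPF(BytesIO(f.read()), basedir=tdir,
                populate_spine=False).to_book_metadata()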

View File: src/calibre/gui2/metadata/bulk_download.py

@@ -7,20 +7,17 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import os, time, shutil
 from functools import partial
-from itertools import izip
-from threading import Event
 
 from PyQt4.Qt import (QIcon, QDialog,
         QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)
 
 from calibre.gui2.threaded_jobs import ThreadedJob
-from calibre.ebooks.metadata.sources.identify import identify, msprefs
-from calibre.ebooks.metadata.sources.covers import download_cover
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.customize.ui import metadata_plugins
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.date import as_utc
+from calibre.ebooks.metadata.opf2 import metadata_to_opf
+from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ptempfile import (PersistentTemporaryDirectory,
+        PersistentTemporaryFile)
 
 # Start download {{{
 def show_config(gui, parent):
@@ -105,18 +102,18 @@ def start_download(gui, ids, callback, ensure_fields=None):
     if ret != d.Accepted:
         return
-    for batch in split_jobs(ids):
-        job = ThreadedJob('metadata bulk download',
-            _('Download metadata for %d books')%len(batch),
-            download, (batch, gui.current_db, d.identify, d.covers,
-                ensure_fields), {}, callback)
-        gui.job_manager.run_threaded_job(job)
+    job = ThreadedJob('metadata bulk download',
+        _('Download metadata for %d books')%len(ids),
+        download, (ids, gui.current_db, d.identify, d.covers,
+            ensure_fields), {}, callback)
+    gui.job_manager.run_threaded_job(job)
 
     gui.status_bar.show_message(_('Metadata download started'), 3000)
 
 # }}}
 
 def get_job_details(job):
-    id_map, failed_ids, failed_covers, title_map, all_failed = job.result
+    (aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
+            lm_map, all_failed) = job.result
     det_msg = []
     for i in failed_ids | failed_covers:
         title = title_map[i]
@@ -126,92 +123,90 @@ def get_job_details(job):
         title += (' ' + _('(Failed cover)'))
         det_msg.append(title)
 
     det_msg = '\n'.join(det_msg)
-    return id_map, failed_ids, failed_covers, all_failed, det_msg
+    return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
+            all_failed, det_msg, lm_map)
 
-def merge_result(oldmi, newmi, ensure_fields=None):
-    dummy = Metadata(_('Unknown'))
-    for f in msprefs['ignore_fields']:
-        if ':' in f or (ensure_fields and f in ensure_fields):
-            continue
-        setattr(newmi, f, getattr(dummy, f))
-    fields = set()
-    for plugin in metadata_plugins(['identify']):
-        fields |= plugin.touched_fields
+class HeartBeat(object):
+    CHECK_INTERVAL = 300  # seconds
+    ''' Check that the file count in tdir changes every five minutes '''
 
-    def is_equal(x, y):
-        if hasattr(x, 'tzinfo'):
-            x = as_utc(x)
-        if hasattr(y, 'tzinfo'):
-            y = as_utc(y)
-        return x == y
+    def __init__(self, tdir):
+        self.tdir = tdir
+        self.last_count = len(os.listdir(self.tdir))
+        self.last_time = time.time()
 
-    for f in fields:
-        # Optimize so that set_metadata does not have to do extra work later
-        if not f.startswith('identifier:'):
-            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
-                    getattr(oldmi, f))):
-                setattr(newmi, f, getattr(dummy, f))
+    def __call__(self):
+        if time.time() - self.last_time > self.CHECK_INTERVAL:
+            c = len(os.listdir(self.tdir))
+            if c == self.last_count:
+                return False
+            self.last_count = c
+            self.last_time = time.time()
        return True
 
-    newmi.last_modified = oldmi.last_modified
+# Fix log viewer, get_job_details, database update code
+# Test: abort, covers only, metadata only, both, 200 entry download, memory
+# consumption, all errors and on and on
 
-    return newmi
-
-def download(ids, db, do_identify, covers, ensure_fields,
+def download(all_ids, db, do_identify, covers, ensure_fields,
         log=None, abort=None, notifications=None):
-    ids = list(ids)
-    metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
-            for i in ids]
+    batch_size = 10
+    batches = split_jobs(all_ids, batch_size=batch_size)
+    tdir = PersistentTemporaryDirectory('_metadata_bulk_')
+    tf = PersistentTemporaryFile('_metadata_bulk_log_')
+    tf.close()
+    tf = tf.name
+    heartbeat = HeartBeat(tdir)
 
     failed_ids = set()
     failed_covers = set()
     title_map = {}
-    ans = {}
-    count = 0
+    lm_map = {}
+    ans = set()
     all_failed = True
-    '''
-    # Test apply dialog
-    all_failed = do_identify = covers = False
-    '''
-    for i, mi in izip(ids, metadata):
+    aborted = False
+    count = 0
+
+    for ids in batches:
         if abort.is_set():
             log.error('Aborting...')
             break
-        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
-        title_map[i] = title
-        if do_identify:
-            results = []
-            try:
-                results = identify(log, Event(), title=title, authors=authors,
-                        identifiers=identifiers)
-            except:
-                pass
-            if results:
-                all_failed = False
-                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
-                identifiers = mi.identifiers
-                if not mi.is_null('rating'):
-                    # set_metadata expects a rating out of 10
-                    mi.rating *= 2
-            else:
-                log.error('Failed to download metadata for', title)
-                failed_ids.add(i)
-                # We don't want set_metadata operating on anything but covers
-                mi = merge_result(mi, mi, ensure_fields=ensure_fields)
-        if covers:
-            cdata = download_cover(log, title=title, authors=authors,
-                    identifiers=identifiers)
-            if cdata is not None:
-                with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
-                    f.write(cdata[-1])
-                mi.cover = f.name
-                all_failed = False
-            else:
-                failed_covers.add(i)
-        ans[i] = mi
-        count += 1
+        metadata = {i:db.get_metadata(i, index_is_id=True,
            get_user_categories=False) for i in ids}
+        for i in ids:
+            title_map[i] = metadata[i].title
+            lm_map[i] = metadata[i].last_modified
+        metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
+                metadata.iteritems()}
+        try:
+            ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
+                    (do_identify, covers, metadata, ensure_fields),
+                    cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
+        except WorkerError as e:
+            if e.orig_tb:
+                raise Exception('Failed to download metadata. Original '
+                        'traceback: \n\n'+e.orig_tb)
+            raise
+        count += batch_size
         notifications.put((count/len(ids),
-            _('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
+            _('Downloaded %(num)d of %(tot)d')%dict(
+                num=count, tot=len(all_ids))))
+
+        fids, fcovs, allf = ret['result']
+        if not allf:
+            all_failed = False
+        failed_ids = failed_ids.union(fids)
+        failed_covers = failed_covers.union(fcovs)
+        ans = ans.union(set(ids) - fids)
+        for book_id in ids:
+            lp = os.path.join(tdir, '%d.log'%book_id)
+            if os.path.exists(lp):
+                with open(lp, 'rb') as f, open(tf, 'ab') as d:
+                    shutil.copyfileobj(f, d)
+
+    if abort.is_set():
+        aborted = True
     log('Download complete, with %d failures'%len(failed_ids))
-    return (ans, failed_ids, failed_covers, title_map, all_failed)
+    return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
+            lm_map, all_failed)
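
Although the per-job batching into groups of 100 books is gone, download() still slices the id list into batches of 10 per fork_job call, which bounds how much memory a leaking plugin can accumulate before its process is recycled and gives HeartBeat a steady stream of result files to watch. A sketch of a batching helper with the same contract as the split_jobs() called above (the actual implementation lives elsewhere in bulk_download.py and is not part of this diff):

def split_jobs(ids, batch_size=100):
    # Chop a sequence of book ids into consecutive lists of at most
    # batch_size entries; download() above calls it with batch_size=10
    ids = list(ids)
    return [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)]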