Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit 927b7471b7 (parent 310c5c17d2)

Run bulk metadata downloads in a separate process, to work around third-party metadata download plugins that leak memory. This also removes the need to batch metadata downloads into groups of 100 books at a time.
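In outline, the commit moves the per-book work into a child process via calibre's fork_job() and reads the result back in the parent, so any memory a plugin leaks dies with the child. A minimal sketch of that pattern follows; fork_job and WorkerError are the helpers actually used in the diff below, while my_pkg.worker, run() and payload are placeholder names:

from calibre.utils.ipc.simple_worker import fork_job, WorkerError

def run_in_subprocess(payload, tdir, abort, heartbeat):
    try:
        # fork_job() imports the named module in a fresh process, calls the
        # named function with the given args, and ships its return value back
        ret = fork_job('my_pkg.worker', 'run', (payload,),
                cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
    except WorkerError as e:
        # The child's traceback is preserved so failures remain debuggable
        raise Exception('Worker failed:\n\n' + (e.orig_tb or ''))
    # By this point the child has exited, taking any leaked memory with it
    return ret['result']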
src/calibre/ebooks/metadata/sources/worker.py (new file, 95 lines)
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
from threading import Event
from io import BytesIO

from calibre.utils.date import as_utc
from calibre.ebooks.metadata.sources.identify import identify, msprefs
from calibre.ebooks.metadata.book.base import Metadata
from calibre.customize.ui import metadata_plugins
from calibre.ebooks.metadata.sources.covers import download_cover
from calibre.utils.logging import GUILog
from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF

def merge_result(oldmi, newmi, ensure_fields=None):
    dummy = Metadata(_('Unknown'))
    for f in msprefs['ignore_fields']:
        if ':' in f or (ensure_fields and f in ensure_fields):
            continue
        setattr(newmi, f, getattr(dummy, f))
    fields = set()
    for plugin in metadata_plugins(['identify']):
        fields |= plugin.touched_fields

    def is_equal(x, y):
        if hasattr(x, 'tzinfo'):
            x = as_utc(x)
        if hasattr(y, 'tzinfo'):
            y = as_utc(y)
        return x == y

    for f in fields:
        # Optimize so that set_metadata does not have to do extra work later
        if not f.startswith('identifier:'):
            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
                    getattr(oldmi, f))):
                setattr(newmi, f, getattr(dummy, f))

    return newmi

def main(do_identify, covers, metadata, ensure_fields):
    failed_ids = set()
    failed_covers = set()
    all_failed = True
    log = GUILog()

    for book_id, mi in metadata.iteritems():
        mi = OPF(BytesIO(mi), basedir=os.getcwdu(),
                populate_spine=False).to_book_metadata()
        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
        cdata = None
        log.clear()

        if do_identify:
            results = []
            try:
                results = identify(log, Event(), title=title, authors=authors,
                        identifiers=identifiers)
            except:
                pass
            if results:
                all_failed = False
                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
                identifiers = mi.identifiers
                if not mi.is_null('rating'):
                    # set_metadata expects a rating out of 10
                    mi.rating *= 2
                with open('%d.mi'%book_id, 'wb') as f:
                    f.write(metadata_to_opf(mi, default_lang='und'))
            else:
                log.error('Failed to download metadata for', title)
                failed_ids.add(book_id)

        if covers:
            cdata = download_cover(log, title=title, authors=authors,
                    identifiers=identifiers)
            if cdata is None:
                failed_covers.add(book_id)
            else:
                with open('%d.cover'%book_id, 'wb') as f:
                    f.write(cdata[-1])
                all_failed = False

        with open('%d.log'%book_id, 'wb') as f:
            f.write(log.html.encode('utf-8'))

    return failed_ids, failed_covers, all_failed
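The worker communicates results on two channels: its return value (failed_ids, failed_covers, all_failed) travels back through fork_job(), while the actual payload is left behind as files in the working directory, one set per book: <book_id>.mi (an OPF document with the merged metadata), <book_id>.cover (raw cover bytes) and <book_id>.log (the HTML log). The database-update side is not part of this commit (see the "Fix log viewer, get_job_details, database update code" note in the diff below), so the following is only a sketch of how a caller might consume those artifacts; collect_results() and apply_metadata() are hypothetical names:

import os
from io import BytesIO

from calibre.ebooks.metadata.opf2 import OPF

def collect_results(tdir, book_ids, apply_metadata):
    for book_id in book_ids:
        opf_path = os.path.join(tdir, '%d.mi'%book_id)
        if not os.path.exists(opf_path):
            continue  # metadata download failed for this book
        with open(opf_path, 'rb') as f:
            mi = OPF(BytesIO(f.read()), basedir=tdir,
                    populate_spine=False).to_book_metadata()
        cdata = None
        cover_path = os.path.join(tdir, '%d.cover'%book_id)
        if os.path.exists(cover_path):
            with open(cover_path, 'rb') as f:
                cdata = f.read()
        apply_metadata(book_id, mi, cdata)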
src/calibre/gui2/metadata/bulk_download.py

@@ -7,20 +7,17 @@ __license__ = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import os, time, shutil
 from functools import partial
-from itertools import izip
-from threading import Event
 
 from PyQt4.Qt import (QIcon, QDialog,
         QDialogButtonBox, QLabel, QGridLayout, QPixmap, Qt)
 
 from calibre.gui2.threaded_jobs import ThreadedJob
-from calibre.ebooks.metadata.sources.identify import identify, msprefs
-from calibre.ebooks.metadata.sources.covers import download_cover
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.customize.ui import metadata_plugins
-from calibre.ptempfile import PersistentTemporaryFile
-from calibre.utils.date import as_utc
+from calibre.ebooks.metadata.opf2 import metadata_to_opf
+from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ptempfile import (PersistentTemporaryDirectory,
+        PersistentTemporaryFile)
 
 # Start download {{{
 def show_config(gui, parent):
@@ -105,18 +102,18 @@ def start_download(gui, ids, callback, ensure_fields=None):
     if ret != d.Accepted:
         return
 
-    for batch in split_jobs(ids):
-        job = ThreadedJob('metadata bulk download',
-            _('Download metadata for %d books')%len(batch),
-            download, (batch, gui.current_db, d.identify, d.covers,
-                ensure_fields), {}, callback)
-        gui.job_manager.run_threaded_job(job)
+    job = ThreadedJob('metadata bulk download',
+        _('Download metadata for %d books')%len(ids),
+        download, (ids, gui.current_db, d.identify, d.covers,
+            ensure_fields), {}, callback)
+    gui.job_manager.run_threaded_job(job)
 
     gui.status_bar.show_message(_('Metadata download started'), 3000)
 
 # }}}
 
 def get_job_details(job):
-    id_map, failed_ids, failed_covers, title_map, all_failed = job.result
+    (aborted, good_ids, tdir, log_file, failed_ids, failed_covers, title_map,
+            lm_map, all_failed) = job.result
     det_msg = []
     for i in failed_ids | failed_covers:
         title = title_map[i]
@@ -126,92 +123,90 @@ def get_job_details(job):
         title += (' ' + _('(Failed cover)'))
         det_msg.append(title)
     det_msg = '\n'.join(det_msg)
-    return id_map, failed_ids, failed_covers, all_failed, det_msg
+    return (aborted, good_ids, tdir, log_file, failed_ids, failed_covers,
+            all_failed, det_msg, lm_map)
 
-def merge_result(oldmi, newmi, ensure_fields=None):
-    dummy = Metadata(_('Unknown'))
-    for f in msprefs['ignore_fields']:
-        if ':' in f or (ensure_fields and f in ensure_fields):
-            continue
-        setattr(newmi, f, getattr(dummy, f))
-    fields = set()
-    for plugin in metadata_plugins(['identify']):
-        fields |= plugin.touched_fields
-
-    def is_equal(x, y):
-        if hasattr(x, 'tzinfo'):
-            x = as_utc(x)
-        if hasattr(y, 'tzinfo'):
-            y = as_utc(y)
-        return x == y
-
-    for f in fields:
-        # Optimize so that set_metadata does not have to do extra work later
-        if not f.startswith('identifier:'):
-            if (not newmi.is_null(f) and is_equal(getattr(newmi, f),
-                    getattr(oldmi, f))):
-                setattr(newmi, f, getattr(dummy, f))
-
-    newmi.last_modified = oldmi.last_modified
-
-    return newmi
+class HeartBeat(object):
+    CHECK_INTERVAL = 300 # seconds
+    ''' Check that the file count in tdir changes every five minutes '''
+
+    def __init__(self, tdir):
+        self.tdir = tdir
+        self.last_count = len(os.listdir(self.tdir))
+        self.last_time = time.time()
+
+    def __call__(self):
+        if time.time() - self.last_time > self.CHECK_INTERVAL:
+            c = len(os.listdir(self.tdir))
+            if c == self.last_count:
+                return False
+            self.last_count = c
+            self.last_time = time.time()
+        return True
+
+# Fix log viewer, get_job_details, database update code
+# Test: abort, covers only, metadata only, both, 200 entry download, memory
+# consumption, all errors and on and on
 
-def download(ids, db, do_identify, covers, ensure_fields,
+def download(all_ids, db, do_identify, covers, ensure_fields,
         log=None, abort=None, notifications=None):
-    ids = list(ids)
-    metadata = [db.get_metadata(i, index_is_id=True, get_user_categories=False)
-        for i in ids]
+    batch_size = 10
+    batches = split_jobs(all_ids, batch_size=batch_size)
+    tdir = PersistentTemporaryDirectory('_metadata_bulk_')
+    tf = PersistentTemporaryFile('_metadata_bulk_log_')
+    tf.close()
+    tf = tf.name
+    heartbeat = HeartBeat(tdir)
+
     failed_ids = set()
     failed_covers = set()
     title_map = {}
-    ans = {}
-    count = 0
+    lm_map = {}
+    ans = set()
     all_failed = True
-    '''
-    # Test apply dialog
-    all_failed = do_identify = covers = False
-    '''
-    for i, mi in izip(ids, metadata):
+    aborted = False
+    count = 0
+
+    for ids in batches:
         if abort.is_set():
             log.error('Aborting...')
             break
-        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
-        title_map[i] = title
-        if do_identify:
-            results = []
-            try:
-                results = identify(log, Event(), title=title, authors=authors,
-                        identifiers=identifiers)
-            except:
-                pass
-            if results:
-                all_failed = False
-                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
-                identifiers = mi.identifiers
-                if not mi.is_null('rating'):
-                    # set_metadata expects a rating out of 10
-                    mi.rating *= 2
-            else:
-                log.error('Failed to download metadata for', title)
-                failed_ids.add(i)
-                # We don't want set_metadata operating on anything but covers
-                mi = merge_result(mi, mi, ensure_fields=ensure_fields)
-        if covers:
-            cdata = download_cover(log, title=title, authors=authors,
-                    identifiers=identifiers)
-            if cdata is not None:
-                with PersistentTemporaryFile('.jpg', 'downloaded-cover-') as f:
-                    f.write(cdata[-1])
-                mi.cover = f.name
-                all_failed = False
-            else:
-                failed_covers.add(i)
-        ans[i] = mi
-        count += 1
+        metadata = {i:db.get_metadata(i, index_is_id=True,
+            get_user_categories=False) for i in ids}
+        for i in ids:
+            title_map[i] = metadata[i].title
+            lm_map[i] = metadata[i].last_modified
+        metadata = {i:metadata_to_opf(mi, default_lang='und') for i, mi in
+                metadata.iteritems()}
+        try:
+            ret = fork_job('calibre.ebooks.metadata.sources.worker', 'main',
+                    (do_identify, covers, metadata, ensure_fields),
+                    cwd=tdir, abort=abort, heartbeat=heartbeat, no_output=True)
+        except WorkerError as e:
+            if e.orig_tb:
+                raise Exception('Failed to download metadata. Original '
+                        'traceback: \n\n'+e.orig_tb)
+            raise
+        count += batch_size
         notifications.put((count/len(ids),
-            _('Downloaded %(num)d of %(tot)d')%dict(num=count, tot=len(ids))))
+            _('Downloaded %(num)d of %(tot)d')%dict(
+                num=count, tot=len(all_ids))))
+
+        fids, fcovs, allf = ret['result']
+        if not allf:
+            all_failed = False
+        failed_ids = failed_ids.union(fids)
+        failed_covers = failed_covers.union(fcovs)
+        ans = ans.union(set(ids) - fids)
+        for book_id in ids:
+            lp = os.path.join(tdir, '%d.log'%book_id)
+            if os.path.exists(lp):
+                with open(lp, 'rb') as f, open(tf, 'ab') as d:
+                    shutil.copyfileobj(f, d)
+
+    if abort.is_set():
+        aborted = True
     log('Download complete, with %d failures'%len(failed_ids))
-    return (ans, failed_ids, failed_covers, title_map, all_failed)
+    return (aborted, ans, tdir, tf, failed_ids, failed_covers, title_map,
+            lm_map, all_failed)
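The HeartBeat object is what download() hands to fork_job() as the heartbeat callable. Because the worker writes at least one file per processed book into tdir, a growing file count serves as proof of life; if the count is unchanged after CHECK_INTERVAL seconds, the callable returns False. Assuming the supervising side treats a False heartbeat as a hung child, the polling loop would look roughly like the sketch below; this is an illustration of the pattern, not calibre's actual fork_job internals, and child_is_alive(), kill_child() and poll_interval are hypothetical:

import time

def watch(heartbeat, child_is_alive, kill_child, poll_interval=5):
    while child_is_alive():
        time.sleep(poll_interval)
        if not heartbeat():
            # No new files appeared in tdir for CHECK_INTERVAL seconds;
            # assume the worker is hung and terminate it
            kill_child()
            break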