From fbddf37b806ce4ca9fe92842be5f8e4ee774e9b4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 5 Apr 2012 18:40:06 +0530
Subject: [PATCH] Run single metadata downloads in a worker process as well to workaround memory leaks in third party plugins

---

Notes (review):
    Line breaks and indentation in this patch were reconstructed from a
    whitespace-collapsed copy. The hunk line counts were used to place
    blank lines; the indentation of continuation lines and the first
    context line of the "@@ -455,6 +473,9 @@" hunk are best guesses.
    Verify against the repository before applying.

    What the patch does, as far as the diff shows:
    * Identify and cover downloads for a single book are moved out of the
      GUI process and run through fork_job() in
      calibre.ebooks.metadata.sources.worker (single_identify and
      single_covers).
    * The per-plugin ISBN and cover-URL caches are handed between the two
      jobs with dump_caches()/load_caches(); the GUI log is handed back
      with GUILog.dump()/load()/append_dump().
    * single_covers() writes every cover to its working directory as
      "PLUGIN,,WIDTH,,HEIGHT,,FMT.cover" and then creates a directory
      named "<that file>.done" as a completion marker.
      CoverWorker.monitor_tdir() polls the temporary directory once a
      second, and scan_once() only reads files whose marker exists.

    Points to confirm:
    * worker.py, single_covers(): the write loop runs only while the
      download thread is alive and nothing drains the queue afterwards.
      A cover queued just before the thread exits looks like it can be
      dropped; consider a final non-blocking drain after the loop
      (TODO confirm against run_download, which is not part of this diff).
    * single_download.py, CoverWorker.fake_run(): still queues plugin
      objects, while CoversModel.update_result() now compares
      plugin.name against the queued value, so the DEBUG_DIALOG path
      presumably never matches a plugin any more.

 src/calibre/ebooks/metadata/sources/base.py   |  22 ++++
 src/calibre/ebooks/metadata/sources/worker.py |  35 ++++-
 src/calibre/gui2/metadata/single_download.py  | 120 ++++++++++++++----
 src/calibre/utils/logging.py                  |  22 ++++
 4 files changed, 171 insertions(+), 28 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 4408bff6c6..2206a9ff04 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -112,6 +112,18 @@ def get_cached_cover_urls(mi):
         if url:
             yield (p, url)
 
+def dump_caches():
+    from calibre.customize.ui import metadata_plugins
+    return {p.name:p.dump_caches() for p in metadata_plugins(['identify'])}
+
+def load_caches(dump):
+    from calibre.customize.ui import metadata_plugins
+    plugins = list(metadata_plugins(['identify']))
+    for p in plugins:
+        cache = dump.get(p.name, None)
+        if cache:
+            p.load_caches(cache)
+
 def cap_author_token(token):
     lt = lower(token)
     if lt in ('von', 'de', 'el', 'van', 'le'):
@@ -293,6 +305,16 @@ class Source(Plugin):
         with self.cache_lock:
             return self._identifier_to_cover_url_cache.get(id_, None)
 
+    def dump_caches(self):
+        with self.cache_lock:
+            return {'isbn_to_identifier':self._isbn_to_identifier_cache.copy(),
+                    'identifier_to_cover':self._identifier_to_cover_url_cache.copy()}
+
+    def load_caches(self, dump):
+        with self.cache_lock:
+            self._isbn_to_identifier_cache.update(dump['isbn_to_identifier'])
+            self._identifier_to_cover_url_cache.update(dump['identifier_to_cover'])
+
     # }}}
 
     # Utility functions {{{
diff --git a/src/calibre/ebooks/metadata/sources/worker.py b/src/calibre/ebooks/metadata/sources/worker.py
index f2db60e01f..779613c91b 100644
--- a/src/calibre/ebooks/metadata/sources/worker.py
+++ b/src/calibre/ebooks/metadata/sources/worker.py
@@ -8,14 +8,17 @@ __copyright__ = '2012, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
 import os
-from threading import Event
+from threading import Event, Thread
+from Queue import Queue, Empty
 from io import BytesIO
 
 from calibre.utils.date import as_utc
 from calibre.ebooks.metadata.sources.identify import identify, msprefs
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.covers import download_cover
+from calibre.ebooks.metadata.sources.covers import (download_cover,
+        run_download)
+from calibre.ebooks.metadata.sources.base import dump_caches, load_caches
 from calibre.utils.logging import GUILog
 from calibre.ebooks.metadata.opf2 import metadata_to_opf, OPF
 
@@ -93,3 +96,31 @@ def main(do_identify, covers, metadata, ensure_fields):
 
     return failed_ids, failed_covers, all_failed
 
+def single_identify(title, authors, identifiers):
+    log = GUILog()
+    results = identify(log, Event(), title=title, authors=authors,
+            identifiers=identifiers)
+    return [metadata_to_opf(r) for r in results], dump_caches(), log.dump()
+
+def single_covers(title, authors, identifiers, caches):
+    load_caches(caches)
+    log = GUILog()
+    results = Queue()
+    worker = Thread(target=run_download, args=(log, results, Event()),
+            kwargs=dict(title=title, authors=authors, identifiers=identifiers))
+    worker.daemon = True
+    worker.start()
+    while worker.is_alive():
+        try:
+            plugin, width, height, fmt, data = results.get(True, 1)
+        except Empty:
+            continue
+        else:
+            name = '%s,,%s,,%s,,%s.cover'%(plugin.name, width, height, fmt)
+            with open(name, 'wb') as f:
+                f.write(data)
+            os.mkdir(name+'.done')
+
+    return log.dump()
+
+
diff --git a/src/calibre/gui2/metadata/single_download.py b/src/calibre/gui2/metadata/single_download.py
index eabde31015..783e61f28c 100644
--- a/src/calibre/gui2/metadata/single_download.py
+++ b/src/calibre/gui2/metadata/single_download.py
@@ -8,11 +8,16 @@ __copyright__ = '2011, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
 DEBUG_DIALOG = False
+# Test: turn fields off, error
+# handling
+# Do some testing in windows as filesystem model is different
 
 # Imports {{{
+import os, time
 from threading import Thread, Event
 from operator import attrgetter
 from Queue import Queue, Empty
+from io import BytesIO
 
 from PyQt4.Qt import (QStyledItemDelegate, QTextDocument, QRectF, QIcon, Qt,
         QApplication, QDialog, QVBoxLayout, QLabel, QDialogButtonBox,
@@ -24,16 +29,17 @@ from PyQt4.QtWebKit import QWebView
 from calibre.customize.ui import metadata_plugins
 from calibre.ebooks.metadata import authors_to_string
 from calibre.utils.logging import GUILog as Log
-from calibre.ebooks.metadata.sources.identify import (identify,
-        urls_from_identifiers)
+from calibre.ebooks.metadata.sources.identify import urls_from_identifiers
 from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.metadata.opf2 import OPF
 from calibre.gui2 import error_dialog, NONE, rating_font
 from calibre.utils.date import (utcnow, fromordinal, format_date,
         UNDEFINED_DATE, as_utc)
 from calibre.library.comments import comments_to_html
 from calibre import force_unicode
 from calibre.utils.config import tweaks
-
+from calibre.utils.ipc.simple_worker import fork_job, WorkerError
+from calibre.ptempfile import TemporaryDirectory
 # }}}
 
 class RichTextDelegate(QStyledItemDelegate): # {{{
@@ -357,7 +363,7 @@ class Comments(QWebView): # {{{
 
 class IdentifyWorker(Thread): # {{{
 
-    def __init__(self, log, abort, title, authors, identifiers):
+    def __init__(self, log, abort, title, authors, identifiers, caches):
         Thread.__init__(self)
         self.daemon = True
 
@@ -367,6 +373,7 @@ class IdentifyWorker(Thread): # {{{
 
         self.results = []
         self.error = None
+        self.caches = caches
 
     def sample_results(self):
         m1 = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald'])
@@ -390,25 +397,36 @@ class IdentifyWorker(Thread): # {{{
             if DEBUG_DIALOG:
                 self.results = self.sample_results()
             else:
-                self.results = identify(self.log, self.abort, title=self.title,
-                        authors=self.authors, identifiers=self.identifiers)
+                res = fork_job(
+                        'calibre.ebooks.metadata.sources.worker',
+                        'single_identify', (self.title, self.authors,
+                            self.identifiers), no_output=True, abort=self.abort)
+                self.results, caches, log_dump = res['result']
+                self.results = [OPF(BytesIO(r), basedir=os.getcwdu(),
+                    populate_spine=False).to_book_metadata() for r in self.results]
+                self.caches.update(caches)
+                self.log.load(log_dump)
             for i, result in enumerate(self.results):
                 result.gui_rank = i
+        except WorkerError as e:
+            self.error = force_unicode(e.orig_tb)
         except:
             import traceback
             self.error = force_unicode(traceback.format_exc())
+
 # }}}
 
 class IdentifyWidget(QWidget): # {{{
 
     rejected = pyqtSignal()
     results_found = pyqtSignal()
-    book_selected = pyqtSignal(object)
+    book_selected = pyqtSignal(object, object)
 
     def __init__(self, log, parent=None):
         QWidget.__init__(self, parent)
         self.log = log
         self.abort = Event()
+        self.caches = {}
 
         self.l = l = QGridLayout()
         self.setLayout(l)
@@ -421,7 +439,7 @@ class IdentifyWidget(QWidget): # {{{
         l.addWidget(self.top, 0, 0)
 
         self.results_view = ResultsView(self)
-        self.results_view.book_selected.connect(self.book_selected.emit)
+        self.results_view.book_selected.connect(self.emit_book_selected)
         self.get_result = self.results_view.get_result
         l.addWidget(self.results_view, 1, 0)
 
@@ -455,6 +473,9 @@ class IdentifyWidget(QWidget): # {{{
 
         ''')
 
+    def emit_book_selected(self, book):
+        self.book_selected.emit(book, self.caches)
+
     def start(self, title=None, authors=None, identifiers={}):
         self.log.clear()
         self.log('Starting download')
@@ -470,7 +491,7 @@ class IdentifyWidget(QWidget): # {{{
         self.log(unicode(self.query.text()))
 
         self.worker = IdentifyWorker(self.log, self.abort, title,
-                authors, identifiers)
+                authors, identifiers, self.caches)
 
         self.worker.start()
 
@@ -513,20 +534,20 @@ class IdentifyWidget(QWidget): # {{{
 
 class CoverWorker(Thread): # {{{
 
-    def __init__(self, log, abort, title, authors, identifiers):
+    def __init__(self, log, abort, title, authors, identifiers, caches):
         Thread.__init__(self)
         self.daemon = True
 
         self.log, self.abort = log, abort
         self.title, self.authors, self.identifiers = (title, authors,
                 identifiers)
+        self.caches = caches
 
         self.rq = Queue()
         self.error = None
 
     def fake_run(self):
         images = ['donate.png', 'config.png', 'column.png', 'eject.png', ]
-        import time
         time.sleep(2)
         for pl, im in zip(metadata_plugins(['cover']), images):
             self.rq.put((pl, 1, 1, 'png', I(im, data=True)))
@@ -536,12 +557,56 @@ class CoverWorker(Thread): # {{{
             if DEBUG_DIALOG:
                 self.fake_run()
             else:
-                from calibre.ebooks.metadata.sources.covers import run_download
-                run_download(self.log, self.rq, self.abort, title=self.title,
-                        authors=self.authors, identifiers=self.identifiers)
+                self.run_fork()
+        except WorkerError as e:
+            self.error = force_unicode(e.orig_tb)
         except:
             import traceback
             self.error = force_unicode(traceback.format_exc())
+
+    def run_fork(self):
+        with TemporaryDirectory('_single_metadata_download') as tdir:
+            self.keep_going = True
+            t = Thread(target=self.monitor_tdir, args=(tdir,))
+            t.daemon = True
+            t.start()
+
+            try:
+                res = fork_job('calibre.ebooks.metadata.sources.worker',
+                    'single_covers',
+                    (self.title, self.authors, self.identifiers, self.caches),
+                    cwd=tdir, no_output=True, abort=self.abort)
+                self.log.append_dump(res['result'])
+            finally:
+                self.keep_going = False
+                t.join()
+
+    def scan_once(self, tdir, seen):
+        for x in list(os.listdir(tdir)):
+            if x in seen: continue
+            if x.endswith('.cover') and os.path.exists(os.path.join(tdir,
+                    x+'.done')):
+                name = x.rpartition('.')[0]
+                try:
+                    plugin_name, width, height, fmt = name.split(',,')
+                    width, height = int(width), int(height)
+                    with open(os.path.join(tdir, x), 'rb') as f:
+                        data = f.read()
+                except:
+                    import traceback
+                    traceback.print_exc()
+                else:
+                    seen.add(x)
+                    self.rq.put((plugin_name, width, height, fmt, data))
+
+    def monitor_tdir(self, tdir):
+        seen = set()
+        while self.keep_going:
+            time.sleep(1)
+            self.scan_once(tdir, seen)
+        # One last scan after the download process has ended
+        self.scan_once(tdir, seen)
+
 # }}}
 
 class CoversModel(QAbstractListModel): # {{{
@@ -620,16 +685,19 @@ class CoversModel(QAbstractListModel): # {{{
         idx = self.plugin_map.get(plugin, 0)
         return self.index(idx)
 
-    def update_result(self, plugin, width, height, data):
-        try:
-            idx = self.plugin_map[plugin]
-        except:
+    def update_result(self, plugin_name, width, height, data):
+        idx = None
+        for plugin, i in self.plugin_map.iteritems():
+            if plugin.name == plugin_name:
+                idx = i
+                break
+        if idx is None:
             return
         pmap = QPixmap()
         pmap.loadFromData(data)
         if pmap.isNull():
             return
-        self.covers[idx] = self.get_item(plugin.name, pmap, waiting=False)
+        self.covers[idx] = self.get_item(plugin_name, pmap, waiting=False)
         self.dataChanged.emit(self.index(idx), self.index(idx))
 
     def cover_pixmap(self, index):
@@ -709,7 +777,7 @@ class CoversWidget(QWidget): # {{{
     def reset_covers(self):
         self.covers_view.reset_covers()
 
-    def start(self, book, current_cover, title, authors):
+    def start(self, book, current_cover, title, authors, caches):
         self.continue_processing = True
         self.abort.clear()
         self.book, self.current_cover = book, current_cover
@@ -721,7 +789,7 @@ class CoversWidget(QWidget): # {{{
         self.covers_view.start()
 
         self.worker = CoverWorker(self.log, self.abort, self.title,
-                self.authors, book.identifiers)
+                self.authors, book.identifiers, caches)
         self.worker.start()
         QTimer.singleShot(50, self.check)
         self.covers_view.setFocus(Qt.OtherFocusReason)
@@ -766,8 +834,8 @@ class CoversWidget(QWidget): # {{{
     def process_result(self, result):
         if not self.continue_processing:
             return
-        plugin, width, height, fmt, data = result
-        self.covers_view.model().update_result(plugin, width, height, data)
+        plugin_name, width, height, fmt, data = result
+        self.covers_view.model().update_result(plugin_name, width, height, data)
 
     def cleanup(self):
         self.covers_view.delegate.stop_animation()
@@ -894,7 +962,7 @@ class FullFetch(QDialog): # {{{
     def view_log(self):
         self._lv = LogViewer(self.log, self)
 
-    def book_selected(self, book):
+    def book_selected(self, book, caches):
         self.next_button.setVisible(False)
         self.ok_button.setVisible(True)
         self.prev_button.setVisible(True)
@@ -902,7 +970,7 @@ class FullFetch(QDialog): # {{{
         self.stack.setCurrentIndex(1)
         self.log('\n\n')
         self.covers_widget.start(book, self.current_cover,
-                self.title, self.authors)
+                self.title, self.authors, caches)
 
     def back_clicked(self):
         self.next_button.setVisible(True)
@@ -993,7 +1061,7 @@ class CoverFetch(QDialog): # {{{
         book = Metadata(title, authors)
         book.identifiers = identifiers
         self.covers_widget.start(book, self.current_cover,
-                title, authors)
+                title, authors, {})
         return self.exec_()
 
     def view_log(self):
diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py
index 46b843565e..ad4a40c57e 100644
--- a/src/calibre/utils/logging.py
+++ b/src/calibre/utils/logging.py
@@ -122,6 +122,18 @@ class UnicodeHTMLStream(HTMLStream):
         end = self.normal if self.data else u''
         return u''.join(self.data) + end
 
+    def dump(self):
+        return [self.data, self.plain_text, self.last_col]
+
+    def load(self, dump):
+        self.data, self.plain_text, self.last_col = dump
+
+    def append_dump(self, dump):
+        d, p, lc = dump
+        self.data.extend(d)
+        self.plain_text.extend(p)
+        self.last_col = lc
+
 
 class Log(object):
 
@@ -186,4 +198,14 @@ class GUILog(ThreadSafeLog):
     def plain_text(self):
         return u''.join(self.outputs[0].plain_text)
 
+    def dump(self):
+        return self.outputs[0].dump()
+
+    def load(self, dump):
+        return self.outputs[0].load(dump)
+
+    def append_dump(self, dump):
+        return self.outputs[0].append_dump(dump)
+
+
 default_log = Log()