mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Switch to a threaded worker for book rendering
Now that almost all of the code runs in C and releases the GIL, threaded workers perform much better: the time to first render for a 400K-word book on my system is cut in half, from 1.9 to 0.9 seconds.
This commit is contained in:
parent
cb07b649f2
commit
3d19fb01be
@ -5,17 +5,15 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import count
|
from itertools import count
|
||||||
from math import ceil
|
|
||||||
|
|
||||||
from lxml.etree import Comment
|
from lxml.etree import Comment
|
||||||
|
|
||||||
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
|
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
|
||||||
from calibre.constants import iswindows
|
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
from calibre.ebooks.oeb.base import EPUB, OEB_DOCS, OEB_STYLES, OPF, SMIL, XHTML, XHTML_NS, XLINK, rewrite_links, urlunquote
|
from calibre.ebooks.oeb.base import EPUB, OEB_DOCS, OEB_STYLES, OPF, SMIL, XHTML, XHTML_NS, XLINK, rewrite_links, urlunquote
|
||||||
from calibre.ebooks.oeb.base import XPath as _XPath
|
from calibre.ebooks.oeb.base import XPath as _XPath
|
||||||
@ -24,14 +22,10 @@ from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
|||||||
from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image_in_page, find_cover_page
|
from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image_in_page, find_cover_page
|
||||||
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
|
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
|
||||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
|
||||||
from calibre.srv.metadata import encode_datetime
|
from calibre.srv.metadata import encode_datetime
|
||||||
from calibre.srv.opts import grouper
|
|
||||||
from calibre.utils.date import EPOCH
|
from calibre.utils.date import EPOCH
|
||||||
from calibre.utils.filenames import rmtree
|
|
||||||
from calibre.utils.ipc.simple_worker import start_pipe_worker
|
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
|
from calibre.utils.serialize import json_dumps, json_loads, msgpack_loads
|
||||||
from calibre.utils.short_uuid import uuid4
|
from calibre.utils.short_uuid import uuid4
|
||||||
from calibre_extensions.fast_css_transform import transform_properties
|
from calibre_extensions.fast_css_transform import transform_properties
|
||||||
from polyglot.binary import as_base64_unicode as encode_component
|
from polyglot.binary import as_base64_unicode as encode_component
|
||||||
@ -501,102 +495,6 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
|
|||||||
f.write(shtml)
|
f.write(shtml)
|
||||||
|
|
||||||
|
|
||||||
class RenderManager:
    '''Fan book-file processing out over a pool of pipe-based worker
    processes.

    Use as a context manager: entering creates the worker list and a
    persistent temporary directory for result/error files; exiting
    terminates any surviving workers and removes the directory.
    '''

    def __init__(self, max_workers):
        # Upper bound on worker processes; a falsy value means no
        # explicit limit beyond the CPU count (see launch_workers()).
        self.max_workers = max_workers

    def launch_worker(self):
        # Each worker is assigned a numbered ``N.json`` file for its
        # msgpack result and a ``N.error`` file that captures its
        # stdout/stderr for post-mortem reporting.
        with open(os.path.join(self.tdir, f'{len(self.workers)}.json'), 'wb') as output:
            error = open(os.path.join(self.tdir, f'{len(self.workers)}.error'), 'wb')
            p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
            p.output_path = output.name
            p.error_path = error.name
        self.workers.append(p)

    def __enter__(self):
        self.workers = []
        self.tdir = PersistentTemporaryDirectory()
        return self

    def __exit__(self, *a):
        # Reap or kill every worker we started.
        while self.workers:
            proc = self.workers.pop()
            if proc.poll() is not None:
                continue  # already exited on its own
            proc.terminate()
            if not iswindows and proc.poll() is None:
                # Give SIGTERM a moment to take effect before escalating.
                time.sleep(0.02)
                if proc.poll() is None:
                    proc.kill()
        del self.workers
        try:
            rmtree(self.tdir)
        except OSError:
            # Removal can fail transiently (e.g. on Windows while worker
            # file handles close); retry once, then give up quietly.
            time.sleep(0.1)
            try:
                rmtree(self.tdir)
            except OSError:
                pass
        del self.tdir

    def launch_workers(self, names, in_process_container):
        # Never more workers than files to process or CPUs available.
        num_workers = min(detect_ncpus(), len(names))
        if self.max_workers:
            num_workers = min(num_workers, self.max_workers)
        if num_workers > 1:
            # Spawning workers is not worth it for tiny books (fewer than
            # three files, or under 128 KB of input in total).
            if len(names) < 3 or sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names) < 128 * 1024:
                num_workers = 1
        if num_workers > 1:
            # One "worker" is the current process, so launch one fewer.
            num_other_workers = num_workers - 1
            while len(self.workers) < num_other_workers:
                self.launch_worker()
        return num_workers

    def __call__(self, names, args, in_process_container):
        num_workers = len(self.workers) + 1
        if num_workers == 1:
            # No subprocesses: do all the work in-process.
            return [process_book_files(names, *args, container=in_process_container)]

        # Split the names into one group per worker and send each child
        # worker its job over stdin as msgpack.
        group_sz = int(ceil(len(names) / num_workers))
        groups = tuple(grouper(group_sz, names))
        for group, worker in zip(groups[:-1], self.workers):
            worker.stdin.write(as_bytes(msgpack_dumps((worker.output_path, group,) + args)))
            worker.stdin.flush(), worker.stdin.close()
            worker.job_sent = True

        # Any surplus workers get the no-op sentinel so they exit cleanly.
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                worker.stdin.write(b'_'), worker.stdin.flush(), worker.stdin.close()

        error = None
        # The current process handles the final group itself.
        results = [process_book_files(groups[-1], *args, container=in_process_container)]
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                worker.wait()
                continue
            if worker.wait() != 0:
                with open(worker.error_path, 'rb') as f:
                    error = f.read().decode('utf-8', 'replace')
            else:
                with open(worker.output_path, 'rb') as f:
                    results.append(msgpack_loads(f.read()))
        if error is not None:
            raise Exception('Render worker failed with error:\n' + error)
        return results
|
||||||
def worker_main():
    '''Entry point for a render worker subprocess.

    Reads a single msgpack-encoded job from stdin; the first element is
    the path to write the result to and the remaining elements are the
    arguments for process_book_files(). A bare ``b'_'`` payload is the
    "no job" sentinel and causes an immediate, silent exit.
    '''
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    raw = stdin.read()
    if raw == b'_':
        return  # sentinel: this worker was surplus, nothing to do
    args = msgpack_loads(raw)
    output_path = args[0]
    result = process_book_files(*args[1:])
    with open(output_path, 'wb') as f:
        f.write(as_bytes(msgpack_dumps(result)))
|
|
||||||
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
|
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
|
||||||
|
|
||||||
changed = set()
|
changed = set()
|
||||||
@ -634,7 +532,7 @@ def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
|
|||||||
__smil_file_names__ = ''
|
__smil_file_names__ = ''
|
||||||
|
|
||||||
|
|
||||||
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone, container=None):
|
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone=None, container=None):
|
||||||
if container is None:
|
if container is None:
|
||||||
container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone)
|
container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone)
|
||||||
container.cloned = False
|
container.cloned = False
|
||||||
@ -664,9 +562,19 @@ def process_book_files(names, container_dir, opfpath, virtualize_resources, link
|
|||||||
return link_to_map, html_data, virtualized_names, smil_map
|
return link_to_map, html_data, virtualized_names, smil_map
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_number_of_workers(names, in_process_container, max_workers):
    '''Pick how many parallel workers to use when rendering *names*.

    Capped by the CPU count, the number of files, and *max_workers*
    (when truthy). Falls back to a single worker for tiny books, where
    parallelism costs more than it saves.
    '''
    num_workers = min(detect_ncpus(), len(names))
    if max_workers:
        num_workers = min(num_workers, max_workers)
    if num_workers > 1:
        # Fewer than three files, or under 128 KB of input in total:
        # not worth parallelising. The size sum is only computed when
        # the file-count check alone does not decide (short-circuit).
        too_small = len(names) < 3 or sum(
            os.path.getsize(in_process_container.name_path_map[n]) for n in names
        ) < 128 * 1024
        if too_small:
            num_workers = 1
    return num_workers
|
|
||||||
def process_exploded_book(
|
def process_exploded_book(
|
||||||
book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False,
|
book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
|
||||||
book_metadata=None, virtualize_resources=True
|
book_metadata=None, virtualize_resources=True, max_workers=1
|
||||||
):
|
):
|
||||||
log = log or default_log
|
log = log or default_log
|
||||||
container = SimpleContainer(tdir, opfpath, log)
|
container = SimpleContainer(tdir, opfpath, log)
|
||||||
@ -676,15 +584,8 @@ def process_exploded_book(
|
|||||||
def needs_work(mt):
|
def needs_work(mt):
|
||||||
return mt in OEB_STYLES or mt in OEB_DOCS or mt in ('image/svg+xml', 'application/smil', 'application/smil+xml')
|
return mt in OEB_STYLES or mt in OEB_DOCS or mt in ('image/svg+xml', 'application/smil', 'application/smil+xml')
|
||||||
|
|
||||||
def work_priority(name):
|
names_that_need_work = tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt))
|
||||||
# ensure workers with large files or stylesheets
|
num_workers = calculate_number_of_workers(names_that_need_work, container, max_workers)
|
||||||
# have the less names
|
|
||||||
size = os.path.getsize(container.name_path_map[name]),
|
|
||||||
is_html = container.mime_map.get(name) in OEB_DOCS
|
|
||||||
return (0 if is_html else 1), size
|
|
||||||
|
|
||||||
if not is_comic:
|
|
||||||
render_manager.launch_workers(tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), container)
|
|
||||||
|
|
||||||
bookmark_data = None
|
bookmark_data = None
|
||||||
if save_bookmark_data:
|
if save_bookmark_data:
|
||||||
@ -741,15 +642,17 @@ def process_exploded_book(
|
|||||||
'page_list_anchor_map': pagelist_anchor_map(page_list),
|
'page_list_anchor_map': pagelist_anchor_map(page_list),
|
||||||
}
|
}
|
||||||
|
|
||||||
names = sorted(
|
results = []
|
||||||
(n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
|
if num_workers < 2:
|
||||||
key=work_priority)
|
results.append(process_book_files(names_that_need_work, tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container))
|
||||||
|
else:
|
||||||
|
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
||||||
|
futures = tuple(
|
||||||
|
executor.submit(process_book_files, (name,), tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container)
|
||||||
|
for name in names_that_need_work)
|
||||||
|
for future in futures:
|
||||||
|
results.append(future.result())
|
||||||
|
|
||||||
results = render_manager(
|
|
||||||
names, (
|
|
||||||
tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container.data_for_clone()
|
|
||||||
), container
|
|
||||||
)
|
|
||||||
ltm = book_render_data['link_to_map']
|
ltm = book_render_data['link_to_map']
|
||||||
html_data = {}
|
html_data = {}
|
||||||
virtualized_names = set()
|
virtualized_names = set()
|
||||||
@ -899,33 +802,32 @@ def get_stored_annotations(container, bookmark_data):
|
|||||||
|
|
||||||
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True, max_workers=1):
    '''Render *pathtoebook* into *output_dir* for the viewer.

    Optionally serializes the book metadata to
    ``calibre-book-metadata.json`` and any stored annotations to
    ``calibre-book-annotations.json`` inside *output_dir*.
    '''
    pathtoebook = os.path.abspath(pathtoebook)
    mi = None
    if serialize_metadata:
        # Deferred imports: only needed when metadata is requested.
        from calibre.customize.ui import quick_metadata
        from calibre.ebooks.metadata.meta import get_metadata
        with open(pathtoebook, 'rb') as src, quick_metadata:
            mi = get_metadata(src, os.path.splitext(pathtoebook)[1][1:].lower())
    book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
    container, bookmark_data = process_exploded_book(
        book_fmt, opfpath, input_fmt, output_dir, max_workers=max_workers,
        book_hash=book_hash, save_bookmark_data=extract_annotations,
        book_metadata=mi, virtualize_resources=virtualize_resources
    )
    if serialize_metadata:
        from calibre.ebooks.metadata.book.serialize import metadata_as_dict
        d = metadata_as_dict(mi)
        # Cover data is large and already extracted separately.
        d.pop('cover_data', None)
        serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
        with open(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
            f.write(json_dumps(d))
    if extract_annotations:
        annotations = None
        if bookmark_data:
            annotations = json_dumps(tuple(get_stored_annotations(container, bookmark_data)))
        if annotations:
            with open(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
                f.write(annotations)
|
||||||
|
|
||||||
|
|
||||||
def render_for_viewer(path, out_dir, book_hash):
|
def render_for_viewer(path, out_dir, book_hash):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user