Switch to threaded worker for book render

Now that almost all of the rendering code runs in C and releases the GIL,
threaded workers perform much better than process workers. This cuts the time
of the first render of a 400K-word book on my system roughly in half, from
1.9 to 0.9 seconds.
This commit is contained in:
Kovid Goyal 2024-09-18 21:35:47 +05:30
parent cb07b649f2
commit 3d19fb01be
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -5,17 +5,15 @@
import json import json
import os import os
import sys import sys
import time
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
from functools import partial from functools import partial
from itertools import count from itertools import count
from math import ceil
from lxml.etree import Comment from lxml.etree import Comment
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks.oeb.base import EPUB, OEB_DOCS, OEB_STYLES, OPF, SMIL, XHTML, XHTML_NS, XLINK, rewrite_links, urlunquote from calibre.ebooks.oeb.base import EPUB, OEB_DOCS, OEB_STYLES, OPF, SMIL, XHTML, XHTML_NS, XLINK, rewrite_links, urlunquote
from calibre.ebooks.oeb.base import XPath as _XPath from calibre.ebooks.oeb.base import XPath as _XPath
@ -24,14 +22,10 @@ from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image_in_page, find_cover_page from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image_in_page, find_cover_page
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
from calibre.ebooks.oeb.polish.utils import guess_type from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.srv.metadata import encode_datetime from calibre.srv.metadata import encode_datetime
from calibre.srv.opts import grouper
from calibre.utils.date import EPOCH from calibre.utils.date import EPOCH
from calibre.utils.filenames import rmtree
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads from calibre.utils.serialize import json_dumps, json_loads, msgpack_loads
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from calibre_extensions.fast_css_transform import transform_properties from calibre_extensions.fast_css_transform import transform_properties
from polyglot.binary import as_base64_unicode as encode_component from polyglot.binary import as_base64_unicode as encode_component
@ -501,102 +495,6 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
f.write(shtml) f.write(shtml)
class RenderManager:
    """Manage a pool of subprocess (pipe) workers used to render a book's files in parallel.

    Must be used as a context manager: __enter__ creates a temporary
    directory for per-worker output/error files, __exit__ reaps any
    still-running workers and removes the directory.  Work is dispatched
    via __call__, which splits the file names into groups, one per worker,
    and processes the final group in the current process.
    """

    def __init__(self, max_workers):
        # Optional cap on the number of workers; falsy means "no extra cap"
        # beyond the CPU count.
        self.max_workers = max_workers

    def launch_worker(self):
        # Each worker is identified by its index in self.workers and gets
        # two files in the temporary directory: <idx>.json for its result
        # and <idx>.error for its stdout/stderr (both are redirected there).
        with open(os.path.join(self.tdir, f'{len(self.workers)}.json'), 'wb') as output:
            error = open(os.path.join(self.tdir, f'{len(self.workers)}.error'), 'wb')
            # The worker process runs worker_main(), reading its job from stdin.
            p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
            # Stash the file paths on the Popen object so __call__ can read
            # the result/error after the worker exits.
            p.output_path = output.name
            p.error_path = error.name
        self.workers.append(p)

    def __enter__(self):
        self.workers = []
        # Persistent (not auto-deleted) so worker processes can write into it;
        # cleaned up explicitly in __exit__.
        self.tdir = PersistentTemporaryDirectory()
        return self

    def __exit__(self, *a):
        # Reap all workers: politely terminate, then (on Unix) escalate to
        # kill if the process has not exited after a short grace period.
        while self.workers:
            p = self.workers.pop()
            if p.poll() is not None:
                continue  # already exited
            p.terminate()
            if not iswindows and p.poll() is None:
                time.sleep(0.02)
                if p.poll() is None:
                    p.kill()
        del self.workers
        # Removal can fail transiently (e.g. a worker still holds a file
        # open, common on Windows); retry once after a short wait, then
        # give up silently.
        try:
            rmtree(self.tdir)
        except OSError:
            time.sleep(0.1)
            try:
                rmtree(self.tdir)
            except OSError:
                pass
        del self.tdir

    def launch_workers(self, names, in_process_container):
        # Decide how many workers are worthwhile: never more than CPUs or
        # files, optionally capped by max_workers, and not worth the fork
        # overhead for very few or very small files (< 128 KB total).
        num_workers = min(detect_ncpus(), len(names))
        if self.max_workers:
            num_workers = min(num_workers, self.max_workers)
        if num_workers > 1:
            if len(names) < 3 or sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names) < 128 * 1024:
                num_workers = 1
        if num_workers > 1:
            # The current process acts as one worker, so launch one fewer
            # subprocess than the total worker count.
            num_other_workers = num_workers - 1
            while len(self.workers) < num_other_workers:
                self.launch_worker()
        return num_workers

    def __call__(self, names, args, in_process_container):
        # Dispatch the rendering of `names` across the launched workers
        # plus the current process, and collect all results.
        num_workers = len(self.workers) + 1
        if num_workers == 1:
            # No subprocesses: do everything in-process, reusing the
            # already-open container.
            return [process_book_files(names, *args, container=in_process_container)]

        # Split names into contiguous groups, one per worker; the last
        # group is processed in this process.
        group_sz = int(ceil(len(names) / num_workers))
        groups = tuple(grouper(group_sz, names))
        for group, worker in zip(groups[:-1], self.workers):
            # Job protocol: a msgpack tuple of (output_path, names, *args)
            # written to the worker's stdin, then stdin is closed to signal EOF.
            worker.stdin.write(as_bytes(msgpack_dumps((worker.output_path, group,) + args)))
            worker.stdin.flush(), worker.stdin.close()
            worker.job_sent = True
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                # Idle worker (fewer groups than workers): send the b'_'
                # sentinel so worker_main() exits without doing any work.
                worker.stdin.write(b'_'), worker.stdin.flush(), worker.stdin.close()

        error = None
        # Process the last group in this process while the workers run.
        results = [process_book_files(groups[-1], *args, container=in_process_container)]
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                worker.wait()
                continue
            if worker.wait() != 0:
                # Non-zero exit: read the captured stdout/stderr as the error text.
                with open(worker.error_path, 'rb') as f:
                    error = f.read().decode('utf-8', 'replace')
            else:
                with open(worker.output_path, 'rb') as f:
                    results.append(msgpack_loads(f.read()))
        if error is not None:
            raise Exception('Render worker failed with error:\n' + error)
        return results
def worker_main():
    """Entry point for a render pipe worker.

    Reads a single msgpack-encoded job from stdin of the form
    (output_path, *process_book_files_args), runs process_book_files on it
    and writes the msgpack-encoded result to output_path.  The single byte
    b'_' is a sentinel meaning "no work assigned"; the worker then exits
    immediately.
    """
    # Prefer the binary buffer when stdin is a text wrapper.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    payload = stream.read()
    if payload == b'_':
        return
    unpacked = msgpack_loads(payload)
    output_path, job_args = unpacked[0], unpacked[1:]
    result = process_book_files(*job_args)
    with open(output_path, 'wb') as dest:
        dest.write(as_bytes(msgpack_dumps(result)))
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names): def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
changed = set() changed = set()
@ -634,7 +532,7 @@ def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
__smil_file_names__ = '' __smil_file_names__ = ''
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone, container=None): def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone=None, container=None):
if container is None: if container is None:
container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone) container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone)
container.cloned = False container.cloned = False
@ -664,9 +562,19 @@ def process_book_files(names, container_dir, opfpath, virtualize_resources, link
return link_to_map, html_data, virtualized_names, smil_map return link_to_map, html_data, virtualized_names, smil_map
def calculate_number_of_workers(names, in_process_container, max_workers):
    """Return how many worker threads are worth using to render `names`.

    Never more than the CPU count or the number of files; optionally capped
    by a truthy `max_workers`.  Parallelism is disabled entirely (returns 1)
    when there are fewer than three files or their combined size is under
    128 KB, since the dispatch overhead would outweigh the benefit.
    """
    limit = detect_ncpus()
    if max_workers:
        limit = min(limit, max_workers)
    num_workers = min(limit, len(names))
    if num_workers > 1:
        path_of = in_process_container.name_path_map
        total_size = sum(os.path.getsize(path_of(n) if callable(path_of) else path_of[n]) for n in names)
        if len(names) < 3 or total_size < 128 * 1024:
            num_workers = 1
    return num_workers
def process_exploded_book( def process_exploded_book(
book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False, book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, virtualize_resources=True book_metadata=None, virtualize_resources=True, max_workers=1
): ):
log = log or default_log log = log or default_log
container = SimpleContainer(tdir, opfpath, log) container = SimpleContainer(tdir, opfpath, log)
@ -676,15 +584,8 @@ def process_exploded_book(
def needs_work(mt): def needs_work(mt):
return mt in OEB_STYLES or mt in OEB_DOCS or mt in ('image/svg+xml', 'application/smil', 'application/smil+xml') return mt in OEB_STYLES or mt in OEB_DOCS or mt in ('image/svg+xml', 'application/smil', 'application/smil+xml')
def work_priority(name): names_that_need_work = tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt))
# ensure workers with large files or stylesheets num_workers = calculate_number_of_workers(names_that_need_work, container, max_workers)
# have the less names
size = os.path.getsize(container.name_path_map[name]),
is_html = container.mime_map.get(name) in OEB_DOCS
return (0 if is_html else 1), size
if not is_comic:
render_manager.launch_workers(tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), container)
bookmark_data = None bookmark_data = None
if save_bookmark_data: if save_bookmark_data:
@ -741,15 +642,17 @@ def process_exploded_book(
'page_list_anchor_map': pagelist_anchor_map(page_list), 'page_list_anchor_map': pagelist_anchor_map(page_list),
} }
names = sorted( results = []
(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), if num_workers < 2:
key=work_priority) results.append(process_book_files(names_that_need_work, tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container))
else:
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = tuple(
executor.submit(process_book_files, (name,), tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container)
for name in names_that_need_work)
for future in futures:
results.append(future.result())
results = render_manager(
names, (
tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container.data_for_clone()
), container
)
ltm = book_render_data['link_to_map'] ltm = book_render_data['link_to_map']
html_data = {} html_data = {}
virtualized_names = set() virtualized_names = set()
@ -899,33 +802,32 @@ def get_stored_annotations(container, bookmark_data):
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True, max_workers=1): def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True, max_workers=1):
pathtoebook = os.path.abspath(pathtoebook) pathtoebook = os.path.abspath(pathtoebook)
with RenderManager(max_workers) as render_manager: mi = None
mi = None if serialize_metadata:
if serialize_metadata: from calibre.customize.ui import quick_metadata
from calibre.customize.ui import quick_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.meta import get_metadata with open(pathtoebook, 'rb') as f, quick_metadata:
with open(pathtoebook, 'rb') as f, quick_metadata: mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower()) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log) container, bookmark_data = process_exploded_book(
container, bookmark_data = process_exploded_book( book_fmt, opfpath, input_fmt, output_dir, max_workers=max_workers,
book_fmt, opfpath, input_fmt, output_dir, render_manager, book_hash=book_hash, save_bookmark_data=extract_annotations,
book_hash=book_hash, save_bookmark_data=extract_annotations, book_metadata=mi, virtualize_resources=virtualize_resources
book_metadata=mi, virtualize_resources=virtualize_resources )
) if serialize_metadata:
if serialize_metadata: from calibre.ebooks.metadata.book.serialize import metadata_as_dict
from calibre.ebooks.metadata.book.serialize import metadata_as_dict d = metadata_as_dict(mi)
d = metadata_as_dict(mi) d.pop('cover_data', None)
d.pop('cover_data', None) serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {})) with open(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
with open(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f: f.write(json_dumps(d))
f.write(json_dumps(d)) if extract_annotations:
if extract_annotations: annotations = None
annotations = None if bookmark_data:
if bookmark_data: annotations = json_dumps(tuple(get_stored_annotations(container, bookmark_data)))
annotations = json_dumps(tuple(get_stored_annotations(container, bookmark_data))) if annotations:
if annotations: with open(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
with open(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f: f.write(annotations)
f.write(annotations)
def render_for_viewer(path, out_dir, book_hash): def render_for_viewer(path, out_dir, book_hash):