Avoid re-parsing the OPF file in worker processes

2025-07-09 03:04:10 -04:00 · 2019-10-26 13:35:07 +05:30 · 2019-10-26 13:35:07 +05:30 · 9a44ef24ff
commit 9a44ef24ff
parent 36d5dee047
2 changed files with 26 additions and 23 deletions
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@ -298,10 +298,8 @@ class Container(ContainerBase):  # {{{
                # some epubs include the opf in the manifest with an incorrect mime type
                self.mime_map[name] = item.get('media-type')
-    def clone_data(self, dest_dir):
+    def data_for_clone(self, dest_dir=None):
-        Container.commit(self, keep_parsed=True)
+        dest_dir = dest_dir or self.root
        self.cloned = True
        clone_dir(self.root, dest_dir)
        return {
            'root': dest_dir,
            'opf_name': self.opf_name,
@ -314,6 +312,12 @@ class Container(ContainerBase):  # {{{
                for name, path in iteritems(self.name_path_map)}
        }
    def clone_data(self, dest_dir):
        '''
        Commit pending changes, copy this container's files into ``dest_dir``
        and return the data describing the resulting clone.

        :param dest_dir: Directory that receives the copy of ``self.root``.
        :return: The dictionary built by :meth:`data_for_clone` for
            ``dest_dir``, i.e. with ``'root'`` pointing at the clone rather
            than at this container.
        '''
        # Flush in-memory changes to disk first so the copied files are
        # current; keep_parsed=True is passed through to Container.commit.
        Container.commit(self, keep_parsed=True)
        self.cloned = True
        clone_dir(self.root, dest_dir)
        # data_for_clone() falls back to self.root when called without an
        # argument, which would describe the original instead of the copy
        # just made, so the destination must be passed explicitly.
        return self.data_for_clone(dest_dir)
    def add_name_to_manifest(self, name, process_manifest_item=None):
        ' Add an entry to the manifest for a file with the specified name. Returns the manifest id. '
        all_ids = {x.get('id') for x in self.opf_xpath('//*[@id]')}
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@ -43,15 +43,15 @@ from calibre.utils.date import EPOCH
 from calibre.utils.ipc.simple_worker import start_pipe_worker
 from calibre.utils.iso8601 import parse_iso8601
 from calibre.utils.logging import default_log
-from calibre.utils.serialize import json_loads
+from calibre.utils.serialize import (
    json_dumps, json_loads, msgpack_dumps, msgpack_loads
 )
 from calibre.utils.short_uuid import uuid4
 from polyglot.binary import (
    as_base64_unicode as encode_component, from_base64_bytes,
    from_base64_unicode as decode_component
 )
-from polyglot.builtins import (
+from polyglot.builtins import as_bytes, is_py3, iteritems, map, unicode_type
    as_bytes, is_py3, iteritems, itervalues, map, unicode_type
 )
 from polyglot.urllib import quote, urlparse
 RENDER_VERSION = 1
@ -460,7 +460,7 @@ class RenderManager(object):
        group_sz = int(ceil(len(names) / num_workers))
        for group, worker in zip(grouper(group_sz, names), self.workers):
-            worker.stdin.write(as_bytes(json.dumps((worker.output_path, group,) + args)))
+            worker.stdin.write(as_bytes(msgpack_dumps((worker.output_path, group,) + args)))
            worker.stdin.flush(), worker.stdin.close()
            worker.job_sent = True
@ -479,7 +479,7 @@ class RenderManager(object):
                    error = f.read().decode('utf-8', 'replace')
            else:
                with lopen(worker.output_path, 'rb') as f:
-                    results.append(json.loads(f.read()))
+                    results.append(msgpack_loads(f.read()))
        if error is not None:
            raise Exception('Render worker failed with error:\n' + error)
        return results
@ -490,10 +490,10 @@ def worker_main():
    raw = stdin.read()
    if raw == b'_':
        return
-    args = json.loads(raw)
+    args = msgpack_loads(raw)
    result = process_book_files(*args[1:])
    with open(args[0], 'wb') as f:
-        f.write(as_bytes(json.dumps(result)))
+        f.write(as_bytes(msgpack_dumps(result)))
 def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
@ -520,8 +520,9 @@ def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
    return name in changed
-def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, container=None):
+def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone, container=None):
-    container = container or SimpleContainer(container_dir, opfpath, default_log)
+    if container is None:
        container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone)
    link_to_map = {}
    html_data = {}
    virtualized_names = set()
@ -541,10 +542,7 @@ def process_book_files(names, container_dir, opfpath, virtualize_resources, link
            transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names)
        elif mt == 'image/svg+xml':
            transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names)
-    for v in itervalues(link_to_map):
+    return link_to_map, html_data, virtualized_names
        for k in v:
            v[k] = tuple(v[k])
    return link_to_map, html_data, tuple(virtualized_names)
 def process_exploded_book(
@ -607,7 +605,11 @@ def process_exploded_book(
        (n for n, mt in iteritems(container.mime_map) if mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'),
        key=work_priority)
-    results = render_manager(names, (tdir, opfpath, virtualize_resources, book_render_data['link_uid']), container)
+    results = render_manager(
        names, (
            tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container.data_for_clone()
        ), container
    )
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()
@ -621,10 +623,8 @@ def process_exploded_book(
    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
-        virtualized_names |= set(vnames)
+        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            for x in v:
                v[x] = set(v[x])
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
@ -838,7 +838,6 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
            book_metadata=mi, virtualize_resources=virtualize_resources
        )
        if serialize_metadata:
            from calibre.utils.serialize import json_dumps
            from calibre.ebooks.metadata.book.serialize import metadata_as_dict
            d = metadata_as_dict(mi)
            d.pop('cover_data', None)