Merging fonts now works for truetype fonts

2025-07-09 03:04:10 -04:00 · 2019-07-22 16:24:50 +05:30 · 2019-07-22 16:24:50 +05:30 · b573c33d1c
commit b573c33d1c
parent c383a2ce25
7 changed files with 177 additions and 10 deletions
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@ -32,6 +32,7 @@ from calibre.ebooks.pdf.render.serialize import PDFStream
 from calibre.gui2 import setup_unix_signals
 from calibre.gui2.webengine import secure_webengine
 from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
+from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
 from calibre.utils.logging import default_log
 from calibre.utils.podofo import (
    get_podofo, remove_unused_fonts, set_metadata_implementation
@ -600,17 +601,19 @@ def merge_w_arrays(arrays):


 def merge_font(fonts):
+    # Merge a group of font entries into a single base font.
+    #
+    # fonts is a list whose items are indexed by the keys 'Data', 'Reference',
+    # 'DescendantFont', 'Subtype', 'W', 'W2' and 'sfnt'. The list is sorted in
+    # place, so the caller's list is reordered.
+    #
+    # Returns a 3-tuple (t0_font, base_font, references_to_drop): the Type0
+    # entry whose 'DescendantFont' is the base font, the base font itself
+    # (with merged 'W', 'W2' and 'sfnt'), and the 'Reference' of every other
+    # entry in fonts.
-    # TODO: Check if the ToUnicode entry in the Type) dict needs to be merged
+    # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged

    # choose the largest font as the base font
    fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
    base_font = fonts[0]
+    # next() has no default here, so StopIteration propagates if no entry
+    # names the base font as its DescendantFont
    t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])
-    descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0' and f is not base_font]
+    # the base font is part of this list, so its own W/W2 arrays and sfnt
+    # take part in the merge; list order follows the sort above
+    descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
    for key in ('W', 'W2'):
-        arrays = tuple(filter(True, (f[key] for f in descendant_fonts)))
+        # filter(None, ...) discards falsy (missing or empty) arrays
+        arrays = tuple(filter(None, (f[key] for f in descendant_fonts)))
        base_font[key] = merge_w_arrays(arrays)
-    t0_font
+    base_font['sfnt'] = merge_truetype_fonts_for_pdf(*(f['sfnt'] for f in descendant_fonts))
+    # everything except the base font and its Type0 parent is reported as droppable
+    references_to_drop = tuple(f['Reference'] for f in fonts if f is not base_font and f is not t0_font)
+    return t0_font, base_font, references_to_drop


 def merge_fonts(pdf_doc):
@ -639,9 +642,16 @@ def merge_fonts(pdf_doc):

    for f in all_fonts:
        base_font_map.setdefault(f['BaseFont'], []).append(f)
+    replacements = {}
+    items = []
    for name, fonts in iteritems(base_font_map):
        if mergeable(fonts):
-            merge_font(fonts)
+            t0_font, base_font, references_to_drop = merge_font(fonts)
+            for ref in references_to_drop:
+                replacements[ref] = t0_font['Reference']
+            data = base_font['sfnt']()[0]
+            items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data))
+    pdf_doc.merge_fonts(tuple(items), replacements)


 def test_merge_fonts():
--- a/src/calibre/utils/fonts/sfnt/glyf.py
+++ b/src/calibre/utils/fonts/sfnt/glyf.py
@ -70,8 +70,10 @@ class CompositeGlyph(SimpleGlyph):

 class GlyfTable(UnknownTable):

-    def glyph_data(self, offset, length):
+    def glyph_data(self, offset, length, as_raw=False):
        raw = self.raw[offset:offset+length]
+        if as_raw:
+            return raw
        num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
        if num_of_countours >= 0:
            return SimpleGlyph(num_of_countours, raw)
--- a/src/calibre/utils/fonts/sfnt/loca.py
+++ b/src/calibre/utils/fonts/sfnt/loca.py
@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'

 from struct import calcsize, unpack_from, pack
 from operator import itemgetter
+from itertools import repeat

 from calibre.utils.fonts.sfnt import UnknownTable
 from polyglot.builtins import iteritems, range
@ -32,12 +33,13 @@ class LocaTable(UnknownTable):
        next_offset = self.offset_map[glyph_id+1]
        return offset, next_offset - offset

-    def subset(self, resolved_glyph_map):
+    def update(self, resolved_glyph_map):
        '''
        Update this table to contain pointers only to the glyphs in
        resolved_glyph_map which must be a map of glyph_ids to (offset, sz)
        '''
-        self.offset_map = [0 for i in self.offset_map]
+        max_glyph_id = max(resolved_glyph_map or (0,))
+        self.offset_map = list(repeat(0, max_glyph_id + 2))
        glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
                    iteritems(resolved_glyph_map)]
        glyphs.sort(key=itemgetter(1))
@ -55,6 +57,7 @@ class LocaTable(UnknownTable):
            vals = [i//2 for i in self.offset_map]

        self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals)
+    subset = update

    def dump_glyphs(self, sfnt):
        if not hasattr(self, 'offset_map'):
--- a/src/calibre/utils/fonts/sfnt/merge.py
+++ b/src/calibre/utils/fonts/sfnt/merge.py
@ -0,0 +1,32 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections import OrderedDict
+from functools import partial
+
+
+def merge_truetype_fonts_for_pdf(*fonts):
+    '''
+    Combine the glyphs of several TrueType fonts into the first font.
+
+    Only the glyf and loca tables are merged, every other table is ignored.
+    When several fonts contain data for the same glyph id, the data from the
+    earliest font that has a non-empty glyph for that id is kept.
+
+    Returns the first font, after its glyf and loca tables have been updated.
+    '''
+    merged = {}
+    for sfnt in fonts:
+        loca_table, glyf_table = sfnt[b'loca'], sfnt[b'glyf']
+        loca_table.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
+        # offset_map has one more entry than there are glyphs
+        for gid in range(len(loca_table.offset_map) - 1):
+            if gid in merged:
+                continue
+            start, size = loca_table.glyph_location(gid)
+            if size > 0:
+                merged[gid] = glyf_table.glyph_data(start, size, as_raw=True)
+
+    target = fonts[0]
+    loca_table, glyf_table = target[b'loca'], target[b'glyf']
+    # glyf.update() receives, in ascending glyph id order, zero-argument
+    # callables that return the raw bytes of each glyph. Its return value is
+    # presumably the glyph_id -> (offset, size) map that loca.update() expects.
+    getters = OrderedDict(
+        (gid, partial(merged.__getitem__, gid)) for gid in sorted(merged))
+    loca_table.update(glyf_table.update(getters))
+    return target
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@ -738,6 +738,9 @@ static PyMethodDef PDFDoc_methods[] = {
    {"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
     "remove_fonts() -> Remove the specified font objects."
    },
+    {"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
+     "merge_fonts() -> Merge the specified fonts."
+    },
    {"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
     "delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
    },
--- a/src/calibre/utils/podofo/fonts.cpp
+++ b/src/calibre/utils/podofo/fonts.cpp
@ -8,6 +8,7 @@
 #include "global.h"
 #include <iostream>
 #include <stack>
+#include <unordered_map>

 using namespace pdf;

@ -17,7 +18,7 @@ ref_as_tuple(const PdfReference &ref) {
    return Py_BuildValue("kk", num, generation);
 }

-static inline const PdfObject*
+static inline PdfObject*
 get_font_file(const PdfObject *descriptor) {
    PdfObject *ff = descriptor->GetIndirectKey("FontFile");
    if (!ff) ff = descriptor->GetIndirectKey("FontFile2");
@ -25,7 +26,7 @@ get_font_file(const PdfObject *descriptor) {
    return ff;
 }

-static void
+static inline void
 remove_font(PdfVecObjects &objects, PdfObject *font) {
    PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
    if (descriptor) {
@ -36,6 +37,40 @@ remove_font(PdfVecObjects &objects, PdfObject *font) {
    delete objects.RemoveObject(font->Reference());
 }

+// Pack a PDF object reference into a single 64-bit integer, suitable for use
+// as a std::unordered_map key: the generation number is shifted up by 32 bits
+// and OR-ed with the object number.
+static inline uint64_t
+ref_as_integer(pdf_objnum num, pdf_gennum gen) {
+    const uint64_t object_bits = static_cast<uint64_t>(num);
+    const uint64_t generation_bits = static_cast<uint64_t>(gen) << 32;
+    return object_bits | generation_bits;
+}
+
+// Overload that packs the object and generation numbers of a PdfReference.
+static inline uint64_t
+ref_as_integer(const PdfReference &ref) {
+    return ref_as_integer(ref.ObjectNumber(), ref.GenerationNumber());
+}
+
+
+// For every page of the document, rewrite the entries of the page's /Font
+// resource dictionary: any entry that is a reference found in ref_map is
+// replaced by the reference ref_map maps it to. Both keys and values of
+// ref_map are references packed with ref_as_integer(). Pages whose resources
+// have no direct /Font dictionary are left untouched.
+static inline void
+replace_font_references(PDFDoc *self, std::unordered_map<uint64_t, uint64_t> &ref_map) {
+    const int page_count = self->doc->GetPageCount();
+    for (int page_num = 0; page_num < page_count; page_num++) {
+        PdfPage *page = self->doc->GetPage(page_num);
+        // NOTE(review): neither page nor GetResources() is checked for NULL here
+        PdfDictionary &resources = page->GetResources()->GetDictionary();
+        PdfObject *font_obj = resources.GetKey("Font");
+        if (!font_obj || !font_obj->IsDictionary()) continue;
+        const PdfDictionary &old_fonts = font_obj->GetDictionary();
+        // Edit a copy while iterating over the original, then install the copy
+        PdfDictionary new_fonts = PdfDictionary(old_fonts);
+        for (auto &entry : old_fonts.GetKeys()) {
+            if (!entry.second->IsReference()) continue;
+            auto found = ref_map.find(ref_as_integer(entry.second->GetReference()));
+            if (found == ref_map.end()) continue;
+            const uint64_t packed = found->second;
+            // Undo ref_as_integer(): low 32 bits are the object number, the rest the generation
+            PdfReference new_ref(static_cast<uint32_t>(packed & 0xffffffff), packed >> 32);
+            new_fonts.AddKey(entry.first.GetName(), new_ref);
+        }
+        resources.AddKey("Font", new_fonts);
+    }
+}
+
 static bool
 used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
    PdfContentsTokenizer tokenizer(page);
@ -91,6 +126,34 @@ convert_w_array(const PdfArray &w) {
    return ans.release();
 }

+// py_as_long_long(x): convert a Python integer object to a C long long.
+// On Python 3 this is simply PyLong_AsLongLong. On Python 2, objects that
+// pass PyInt_Check are read directly and everything else goes through
+// PyLong_AsLongLong. A return value of -1 can mean failure, so callers must
+// check PyErr_Occurred() to tell an error from a genuine -1.
+#if PY_MAJOR_VERSION > 2
+#define py_as_long_long PyLong_AsLongLong
+#else
+static inline long long
+py_as_long_long(PyObject *x) {
+    // The parameter must not be const: PyLong_AsLongLong() takes a plain
+    // PyObject* and C++ does not implicitly convert away constness.
+    if (PyInt_Check(x)) return PyInt_AS_LONG(x);
+    return PyLong_AsLongLong(x);
+}
+#endif
+
+// Recursively append the contents of the Python list src to the PDF array
+// dest. Python floats are added as real numbers, nested lists as nested
+// arrays, and every other item is converted with py_as_long_long() and added
+// as an integer. If that conversion raises, the Python error is printed and
+// the item is left out of dest.
+static void
+convert_w_array(PyObject *src, PdfArray &dest) {
+    for (Py_ssize_t idx = 0; idx < PyList_GET_SIZE(src); idx++) {
+        PyObject *entry = PyList_GET_ITEM(src, idx);
+        if (PyFloat_Check(entry)) {
+            dest.push_back(PdfObject(PyFloat_AS_DOUBLE(entry)));
+            continue;
+        }
+        if (PyList_Check(entry)) {
+            PdfArray nested;
+            convert_w_array(entry, nested);
+            dest.push_back(nested);
+            continue;
+        }
+        pdf_int64 number = py_as_long_long(entry);
+        // -1 is only an error when a Python exception is actually pending
+        if (number == -1 && PyErr_Occurred()) { PyErr_Print(); continue; }
+        dest.push_back(PdfObject(number));
+    }
+}
+
 extern "C" {
 PyObject*
 list_fonts(PDFDoc *self, PyObject *args) {
@ -195,4 +258,57 @@ remove_fonts(PDFDoc *self, PyObject *args) {
    Py_RETURN_NONE;
 }

+// merge_fonts(items, replacements) -> None
+//
+// replacements: dict mapping the (num, gen) reference of a font that is to be
+//   removed to the (num, gen) reference that should be used in its place.
+//   Each such font is removed from the document and the /Font resources of
+//   all pages are rewritten to point at the replacement.
+// items: tuple of ((num, gen), W, W2, data) entries. W and W2 are lists; when
+//   non-empty they replace the font's /W and /W2 arrays. data is the new font
+//   program, which is written into the font file stream of the font's
+//   descriptor when one exists.
+//
+// Returns NULL with a Python exception set if any argument has the wrong shape.
+PyObject*
+merge_fonts(PDFDoc *self, PyObject *args) {
+    PyObject *items, *replacements;
+    if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &items, &PyDict_Type, &replacements)) return NULL;
+    std::unordered_map<uint64_t, uint64_t> ref_map;
+    PdfVecObjects &objects = self->doc->GetObjects();
+    PyObject *key, *value;
+    Py_ssize_t pos = 0;
+    while (PyDict_Next(replacements, &pos, &key, &value)) {
+        unsigned long num, gen, new_num, new_gen;
+        // Validate both halves of the entry before removing anything, so a
+        // malformed value cannot leave a font deleted without a replacement.
+        if (!PyArg_ParseTuple(key, "kk", &num, &gen)) return NULL;
+        if (!PyArg_ParseTuple(value, "kk", &new_num, &new_gen)) return NULL;
+        PdfReference ref(num, gen);
+        PdfObject *font = objects.GetObject(ref);
+        if (font) remove_font(objects, font);
+        ref_map[ref_as_integer(num, gen)] = ref_as_integer(new_num, new_gen);
+    }
+    if (!ref_map.empty()) replace_font_references(self, ref_map);
+
+    for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) {
+        long num, gen;
+        PyObject *W, *W2;
+        const char *data;
+        Py_ssize_t sz;
+        if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL;
+        PdfReference ref(num, gen);
+        PdfObject *font = objects.GetObject(ref);
+        if (!font) continue;
+        if (PyObject_IsTrue(W)) {
+            PdfArray w;
+            convert_w_array(W, w);
+            font->GetDictionary().AddKey("W", w);
+        }
+        if (PyObject_IsTrue(W2)) {
+            PdfArray w;
+            convert_w_array(W2, w);
+            font->GetDictionary().AddKey("W2", w);
+        }
+        const PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
+        if (!descriptor) continue;
+        PdfObject *ff = get_font_file(descriptor);
+        // get_font_file() returns NULL when the descriptor has no embedded
+        // font program; there is then no stream to overwrite.
+        if (!ff) continue;
+        PdfStream *stream = ff->GetStream();
+        if (stream) stream->Set(data, sz);
+    }
+    Py_RETURN_NONE;
+}
+
 }
--- a/src/calibre/utils/podofo/global.h
+++ b/src/calibre/utils/podofo/global.h
@ -88,5 +88,6 @@ extern "C" {
 PyObject* list_fonts(PDFDoc*, PyObject*);
 PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
 PyObject* remove_fonts(PDFDoc *self, PyObject *args);
+PyObject* merge_fonts(PDFDoc *self, PyObject *args);
 }
 }