Merging fonts now works for truetype fonts

This commit is contained in:
Kovid Goyal 2019-07-22 16:24:50 +05:30
parent c383a2ce25
commit b573c33d1c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 177 additions and 10 deletions

View File

@ -32,6 +32,7 @@ from calibre.ebooks.pdf.render.serialize import PDFStream
from calibre.gui2 import setup_unix_signals from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine from calibre.gui2.webengine import secure_webengine
from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.podofo import ( from calibre.utils.podofo import (
get_podofo, remove_unused_fonts, set_metadata_implementation get_podofo, remove_unused_fonts, set_metadata_implementation
@ -600,17 +601,19 @@ def merge_w_arrays(arrays):
def merge_font(fonts): def merge_font(fonts):
# TODO: Check if the ToUnicode entry in the Type) dict needs to be merged # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged
# choose the largest font as the base font # choose the largest font as the base font
fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True) fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True)
base_font = fonts[0] base_font = fonts[0]
t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference']) t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference'])
descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0' and f is not base_font] descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0']
for key in ('W', 'W2'): for key in ('W', 'W2'):
arrays = tuple(filter(True, (f[key] for f in descendant_fonts))) arrays = tuple(filter(None, (f[key] for f in descendant_fonts)))
base_font[key] = merge_w_arrays(arrays) base_font[key] = merge_w_arrays(arrays)
t0_font base_font['sfnt'] = merge_truetype_fonts_for_pdf(*(f['sfnt'] for f in descendant_fonts))
references_to_drop = tuple(f['Reference'] for f in fonts if f is not base_font and f is not t0_font)
return t0_font, base_font, references_to_drop
def merge_fonts(pdf_doc): def merge_fonts(pdf_doc):
@ -639,9 +642,16 @@ def merge_fonts(pdf_doc):
for f in all_fonts: for f in all_fonts:
base_font_map.setdefault(f['BaseFont'], []).append(f) base_font_map.setdefault(f['BaseFont'], []).append(f)
replacements = {}
items = []
for name, fonts in iteritems(base_font_map): for name, fonts in iteritems(base_font_map):
if mergeable(fonts): if mergeable(fonts):
merge_font(fonts) t0_font, base_font, references_to_drop = merge_font(fonts)
for ref in references_to_drop:
replacements[ref] = t0_font['Reference']
data = base_font['sfnt']()[0]
items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data))
pdf_doc.merge_fonts(tuple(items), replacements)
def test_merge_fonts(): def test_merge_fonts():

View File

@ -70,8 +70,10 @@ class CompositeGlyph(SimpleGlyph):
class GlyfTable(UnknownTable): class GlyfTable(UnknownTable):
def glyph_data(self, offset, length): def glyph_data(self, offset, length, as_raw=False):
raw = self.raw[offset:offset+length] raw = self.raw[offset:offset+length]
if as_raw:
return raw
num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0 num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0
if num_of_countours >= 0: if num_of_countours >= 0:
return SimpleGlyph(num_of_countours, raw) return SimpleGlyph(num_of_countours, raw)

View File

@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en'
from struct import calcsize, unpack_from, pack from struct import calcsize, unpack_from, pack
from operator import itemgetter from operator import itemgetter
from itertools import repeat
from calibre.utils.fonts.sfnt import UnknownTable from calibre.utils.fonts.sfnt import UnknownTable
from polyglot.builtins import iteritems, range from polyglot.builtins import iteritems, range
@ -32,12 +33,13 @@ class LocaTable(UnknownTable):
next_offset = self.offset_map[glyph_id+1] next_offset = self.offset_map[glyph_id+1]
return offset, next_offset - offset return offset, next_offset - offset
def subset(self, resolved_glyph_map): def update(self, resolved_glyph_map):
''' '''
Update this table to contain pointers only to the glyphs in Update this table to contain pointers only to the glyphs in
resolved_glyph_map which must be a map of glyph_ids to (offset, sz) resolved_glyph_map which must be a map of glyph_ids to (offset, sz)
''' '''
self.offset_map = [0 for i in self.offset_map] max_glyph_id = max(resolved_glyph_map or (0,))
self.offset_map = list(repeat(0, max_glyph_id + 2))
glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in
iteritems(resolved_glyph_map)] iteritems(resolved_glyph_map)]
glyphs.sort(key=itemgetter(1)) glyphs.sort(key=itemgetter(1))
@ -55,6 +57,7 @@ class LocaTable(UnknownTable):
vals = [i//2 for i in self.offset_map] vals = [i//2 for i in self.offset_map]
self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals) self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals)
subset = update
def dump_glyphs(self, sfnt): def dump_glyphs(self, sfnt):
if not hasattr(self, 'offset_map'): if not hasattr(self, 'offset_map'):

View File

@ -0,0 +1,32 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import OrderedDict
from functools import partial
def merge_truetype_fonts_for_pdf(*fonts):
    '''
    Merge the glyphs of all ``fonts`` into the first font and return it.

    Only the glyf and loca tables are merged, all other tables are ignored.
    When the same glyph id has non-empty data in more than one font, the data
    from the earliest font wins. Glyphs with zero length are never collected.
    '''
    merged = {}
    for sfnt in fonts:
        loca_table, glyf_table = sfnt[b'loca'], sfnt[b'glyf']
        loca_table.load_offsets(sfnt[b'head'], sfnt[b'maxp'])
        # offset_map has one more entry than there are glyphs
        num_glyphs = len(loca_table.offset_map) - 1
        for gid in range(num_glyphs):
            if gid in merged:
                continue
            start, size = loca_table.glyph_location(gid)
            if size > 0:
                merged[gid] = glyf_table.glyph_data(start, size, as_raw=True)

    target = fonts[0]
    # Map each glyph id, in ascending order, to a zero-argument callable that
    # returns the raw bytes collected for that glyph.
    glyph_getters = OrderedDict(
        (gid, partial(merged.__getitem__, gid)) for gid in sorted(merged))
    # glyf.update() returns the map that the loca table is then updated with
    target[b'loca'].update(target[b'glyf'].update(glyph_getters))
    return target

View File

@ -738,6 +738,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS, {"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
"remove_fonts() -> Remove the specified font objects." "remove_fonts() -> Remove the specified font objects."
}, },
{"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
"merge_fonts() -> Merge the specified fonts."
},
{"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS, {"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
"delete_page(page_num, count=1) -> Delete the specified pages from the pdf." "delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
}, },

View File

@ -8,6 +8,7 @@
#include "global.h" #include "global.h"
#include <iostream> #include <iostream>
#include <stack> #include <stack>
#include <unordered_map>
using namespace pdf; using namespace pdf;
@ -17,7 +18,7 @@ ref_as_tuple(const PdfReference &ref) {
return Py_BuildValue("kk", num, generation); return Py_BuildValue("kk", num, generation);
} }
static inline const PdfObject* static inline PdfObject*
get_font_file(const PdfObject *descriptor) { get_font_file(const PdfObject *descriptor) {
PdfObject *ff = descriptor->GetIndirectKey("FontFile"); PdfObject *ff = descriptor->GetIndirectKey("FontFile");
if (!ff) ff = descriptor->GetIndirectKey("FontFile2"); if (!ff) ff = descriptor->GetIndirectKey("FontFile2");
@ -25,7 +26,7 @@ get_font_file(const PdfObject *descriptor) {
return ff; return ff;
} }
static void static inline void
remove_font(PdfVecObjects &objects, PdfObject *font) { remove_font(PdfVecObjects &objects, PdfObject *font) {
PdfObject *descriptor = font->GetIndirectKey("FontDescriptor"); PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
if (descriptor) { if (descriptor) {
@ -36,6 +37,40 @@ remove_font(PdfVecObjects &objects, PdfObject *font) {
delete objects.RemoveObject(font->Reference()); delete objects.RemoveObject(font->Reference());
} }
static inline uint64_t
ref_as_integer(pdf_objnum num, pdf_gennum gen) {
return static_cast<uint64_t>(num) | (static_cast<uint64_t>(gen) << 32);
}
static inline uint64_t
ref_as_integer(const PdfReference &ref) { return ref_as_integer(ref.ObjectNumber(), ref.GenerationNumber()); }
// Rewrite the Font entry of every page's resource dictionary so that any font
// whose reference appears as a key in ref_map points at the mapped reference
// instead. Keys and values of ref_map are references packed by
// ref_as_integer(). Only Font entries stored directly as a dictionary are
// handled; pages without a resource dictionary, or whose fonts need no
// change, are left untouched.
static inline void
replace_font_references(PDFDoc *self, std::unordered_map<uint64_t, uint64_t> &ref_map) {
    const int num_pages = self->doc->GetPageCount();
    for (int i = 0; i < num_pages; i++) {
        PdfPage *page = self->doc->GetPage(i);
        if (!page) continue;
        PdfObject *res = page->GetResources();
        // A page with no usable resource dictionary has no fonts to rewrite
        if (!res || !res->IsDictionary()) continue;
        PdfDictionary &resources = res->GetDictionary();
        PdfObject *f = resources.GetKey("Font");
        if (!f || !f->IsDictionary()) continue;
        const PdfDictionary &font = f->GetDictionary();
        // Modify a copy so the dictionary being iterated is never mutated
        PdfDictionary new_font = PdfDictionary(font);
        bool changed = false;
        for (auto &k : font.GetKeys()) {
            if (!k.second->IsReference()) continue;
            auto found = ref_map.find(ref_as_integer(k.second->GetReference()));
            if (found == ref_map.end()) continue;
            const uint64_t r = found->second;
            // Unpack: low 32 bits are the object number, the rest the generation
            PdfReference new_ref(static_cast<uint32_t>(r & 0xffffffff), static_cast<pdf_gennum>(r >> 32));
            new_font.AddKey(k.first.GetName(), new_ref);
            changed = true;
        }
        // Avoid rewriting the resources of pages that use none of the replaced fonts
        if (changed) resources.AddKey("Font", new_font);
    }
}
static bool static bool
used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) { used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
PdfContentsTokenizer tokenizer(page); PdfContentsTokenizer tokenizer(page);
@ -91,6 +126,34 @@ convert_w_array(const PdfArray &w) {
return ans.release(); return ans.release();
} }
#if PY_MAJOR_VERSION > 2
#define py_as_long_long PyLong_AsLongLong
#else
static inline long long
py_as_long_long(const PyObject *x) {
if (PyInt_Check(x)) return PyInt_AS_LONG(x);
return PyLong_AsLongLong(x);
}
#endif
// Append the contents of the Python list src to the PDF array dest.
// Floats become real numbers, nested lists become nested PDF arrays (converted
// recursively) and every other item is converted as an integer. An item that
// fails integer conversion has its Python error printed and is skipped.
static void
convert_w_array(PyObject *src, PdfArray &dest) {
    for (Py_ssize_t idx = 0; idx < PyList_GET_SIZE(src); ++idx) {
        PyObject *entry = PyList_GET_ITEM(src, idx);
        if (PyList_Check(entry)) {
            PdfArray nested;
            convert_w_array(entry, nested);
            dest.push_back(nested);
            continue;
        }
        if (PyFloat_Check(entry)) {
            dest.push_back(PdfObject(PyFloat_AS_DOUBLE(entry)));
            continue;
        }
        pdf_int64 number = py_as_long_long(entry);
        // -1 is only an error when a Python exception is also pending
        if (number == -1 && PyErr_Occurred()) {
            PyErr_Print();
            continue;
        }
        dest.push_back(PdfObject(number));
    }
}
extern "C" { extern "C" {
PyObject* PyObject*
list_fonts(PDFDoc *self, PyObject *args) { list_fonts(PDFDoc *self, PyObject *args) {
@ -195,4 +258,57 @@ remove_fonts(PDFDoc *self, PyObject *args) {
Py_RETURN_NONE; Py_RETURN_NONE;
} }
// merge_fonts(items, replacements) -> None
//
// replacements: dict mapping (num, generation) of a font object to drop to the
//   (num, generation) of the font that replaces it. Every dropped font that
//   exists in the document is removed and the page font resources are
//   redirected to the replacement.
// items: tuple of ((num, generation), W, W2, data) entries. For each font
//   found in the document, non-empty W / W2 lists are stored as its W / W2
//   arrays and, if it has a font descriptor, the contents of its font file
//   stream are replaced with data.
//
// Returns NULL with a Python exception set if an argument is malformed or a
// font descriptor has no font file to receive the data.
PyObject*
merge_fonts(PDFDoc *self, PyObject *args) {
    PyObject *items, *replacements;
    if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &items, &PyDict_Type, &replacements)) return NULL;
    std::unordered_map<uint64_t, uint64_t> ref_map;
    PdfVecObjects &objects = self->doc->GetObjects();

    // Parse the complete replacements dict before removing anything, so that a
    // malformed entry cannot leave the document with only some fonts removed.
    PyObject *key, *value;
    Py_ssize_t pos = 0;
    while (PyDict_Next(replacements, &pos, &key, &value)) {
        unsigned long num, gen;
        if (!PyArg_ParseTuple(key, "kk", &num, &gen)) return NULL;
        const uint64_t k = ref_as_integer(num, gen);
        if (!PyArg_ParseTuple(value, "kk", &num, &gen)) return NULL;
        ref_map[k] = ref_as_integer(num, gen);
    }
    for (const auto &entry : ref_map) {
        // Unpack the key produced by ref_as_integer() back into a reference
        PdfReference ref(static_cast<uint32_t>(entry.first & 0xffffffff), static_cast<pdf_gennum>(entry.first >> 32));
        PdfObject *font = objects.GetObject(ref);
        if (font) remove_font(objects, font);
    }
    if (!ref_map.empty()) replace_font_references(self, ref_map);

    for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) {
        long num, gen;
        PyObject *W, *W2;
        const char *data;
        Py_ssize_t sz;
        if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL;
        PdfReference ref(num, gen);
        PdfObject *font = objects.GetObject(ref);
        if (!font) continue;

        // Locate the font file before modifying the font, so that a missing
        // font file is reported without half-updating this font.
        PdfObject *ff = NULL;
        const PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
        if (descriptor) {
            ff = get_font_file(descriptor);
            if (!ff) {
                PyErr_SetString(PyExc_ValueError, "Font descriptor has no font file to store the merged font data in");
                return NULL;
            }
        }
        // W and W2 are guaranteed to be lists by the O! format above
        if (PyList_GET_SIZE(W) > 0) {
            PdfArray w;
            convert_w_array(W, w);
            font->GetDictionary().AddKey("W", w);
        }
        if (PyList_GET_SIZE(W2) > 0) {
            PdfArray w;
            convert_w_array(W2, w);
            font->GetDictionary().AddKey("W2", w);
        }
        if (ff) {
            PdfStream *stream = ff->GetStream();
            stream->Set(data, sz);
        }
    }
    Py_RETURN_NONE;
}
} }

View File

@ -88,5 +88,6 @@ extern "C" {
PyObject* list_fonts(PDFDoc*, PyObject*); PyObject* list_fonts(PDFDoc*, PyObject*);
PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args); PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
PyObject* remove_fonts(PDFDoc *self, PyObject *args); PyObject* remove_fonts(PDFDoc *self, PyObject *args);
PyObject* merge_fonts(PDFDoc *self, PyObject *args);
} }
} }