diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py index 43608fff8c..20fc490735 100644 --- a/src/calibre/ebooks/pdf/html_writer.py +++ b/src/calibre/ebooks/pdf/html_writer.py @@ -32,6 +32,7 @@ from calibre.ebooks.pdf.render.serialize import PDFStream from calibre.gui2 import setup_unix_signals from calibre.gui2.webengine import secure_webengine from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont +from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf from calibre.utils.logging import default_log from calibre.utils.podofo import ( get_podofo, remove_unused_fonts, set_metadata_implementation @@ -600,17 +601,19 @@ def merge_w_arrays(arrays): def merge_font(fonts): - # TODO: Check if the ToUnicode entry in the Type) dict needs to be merged + # TODO: Check if the ToUnicode entry in the Type0 dict needs to be merged # choose the largest font as the base font fonts.sort(key=lambda f: len(f['Data'] or b''), reverse=True) base_font = fonts[0] t0_font = next(f for f in fonts if f['DescendantFont'] == base_font['Reference']) - descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0' and f is not base_font] + descendant_fonts = [f for f in fonts if f['Subtype'] != 'Type0'] for key in ('W', 'W2'): - arrays = tuple(filter(True, (f[key] for f in descendant_fonts))) + arrays = tuple(filter(None, (f[key] for f in descendant_fonts))) base_font[key] = merge_w_arrays(arrays) - t0_font + base_font['sfnt'] = merge_truetype_fonts_for_pdf(*(f['sfnt'] for f in descendant_fonts)) + references_to_drop = tuple(f['Reference'] for f in fonts if f is not base_font and f is not t0_font) + return t0_font, base_font, references_to_drop def merge_fonts(pdf_doc): @@ -639,9 +642,16 @@ def merge_fonts(pdf_doc): for f in all_fonts: base_font_map.setdefault(f['BaseFont'], []).append(f) + replacements = {} + items = [] for name, fonts in iteritems(base_font_map): if mergeable(fonts): - merge_font(fonts) + t0_font, base_font, references_to_drop = merge_font(fonts) + for ref in references_to_drop: + replacements[ref] = t0_font['Reference'] + data = base_font['sfnt']()[0] + items.append((base_font['Reference'], base_font['W'] or [], base_font['W2'] or [], data)) + pdf_doc.merge_fonts(tuple(items), replacements) def test_merge_fonts(): diff --git a/src/calibre/utils/fonts/sfnt/glyf.py b/src/calibre/utils/fonts/sfnt/glyf.py index 6a403afb4f..ce28b6e16c 100644 --- a/src/calibre/utils/fonts/sfnt/glyf.py +++ b/src/calibre/utils/fonts/sfnt/glyf.py @@ -70,8 +70,10 @@ class CompositeGlyph(SimpleGlyph): class GlyfTable(UnknownTable): - def glyph_data(self, offset, length): + def glyph_data(self, offset, length, as_raw=False): raw = self.raw[offset:offset+length] + if as_raw: + return raw num_of_countours = unpack_from(b'>h', raw)[0] if raw else 0 if num_of_countours >= 0: return SimpleGlyph(num_of_countours, raw) diff --git a/src/calibre/utils/fonts/sfnt/loca.py b/src/calibre/utils/fonts/sfnt/loca.py index 3d4c8ee94e..0c9bf617d0 100644 --- a/src/calibre/utils/fonts/sfnt/loca.py +++ b/src/calibre/utils/fonts/sfnt/loca.py @@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en' from struct import calcsize, unpack_from, pack from operator import itemgetter +from itertools import repeat from calibre.utils.fonts.sfnt import UnknownTable from polyglot.builtins import iteritems, range @@ -32,12 +33,13 @@ class LocaTable(UnknownTable): next_offset = self.offset_map[glyph_id+1] return offset, next_offset - offset - def subset(self, resolved_glyph_map): + def update(self, resolved_glyph_map): ''' Update this table to contain pointers only to the glyphs in resolved_glyph_map which must be a map of glyph_ids to (offset, sz) ''' - self.offset_map = [0 for i in self.offset_map] + max_glyph_id = max(resolved_glyph_map or (0,)) + self.offset_map = list(repeat(0, max_glyph_id + 2)) glyphs = [(glyph_id, x[0], x[1]) for glyph_id, x in iteritems(resolved_glyph_map)] glyphs.sort(key=itemgetter(1)) @@ -55,6 +57,7 @@ class LocaTable(UnknownTable): vals = [i//2 for i in self.offset_map] self.raw = pack(('>%d%s'%(len(vals), self.fmt)).encode('ascii'), *vals) + subset = update def dump_glyphs(self, sfnt): if not hasattr(self, 'offset_map'): diff --git a/src/calibre/utils/fonts/sfnt/merge.py b/src/calibre/utils/fonts/sfnt/merge.py new file mode 100644 index 0000000000..6ef28eeda8 --- /dev/null +++ b/src/calibre/utils/fonts/sfnt/merge.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPL v3 Copyright: 2019, Kovid Goyal + +from __future__ import absolute_import, division, print_function, unicode_literals + +from collections import OrderedDict +from functools import partial + + +def merge_truetype_fonts_for_pdf(*fonts): + # only merges the glyf and loca tables, ignoring all other tables + all_glyphs = {} + for font in fonts: + loca = font[b'loca'] + glyf = font[b'glyf'] + loca.load_offsets(font[b'head'], font[b'maxp']) + for glyph_id in range(len(loca.offset_map) - 1): + if glyph_id not in all_glyphs: + offset, sz = loca.glyph_location(glyph_id) + if sz > 0: + all_glyphs[glyph_id] = glyf.glyph_data(offset, sz, as_raw=True) + + ans = fonts[0] + loca = ans[b'loca'] + glyf = ans[b'glyf'] + gmap = OrderedDict() + for glyph_id in sorted(all_glyphs): + gmap[glyph_id] = partial(all_glyphs.__getitem__, glyph_id) + offset_map = glyf.update(gmap) + loca.update(offset_map) + return ans diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp index 84f57edb65..058fba489a 100644 --- a/src/calibre/utils/podofo/doc.cpp +++ b/src/calibre/utils/podofo/doc.cpp @@ -738,6 +738,9 @@ static PyMethodDef PDFDoc_methods[] = { {"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS, "remove_fonts() -> Remove the specified font objects." }, + {"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS, + "merge_fonts() -> Merge the specified fonts." + }, {"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS, "delete_page(page_num, count=1) -> Delete the specified pages from the pdf." }, diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp index 967f46092a..fb629b10f9 100644 --- a/src/calibre/utils/podofo/fonts.cpp +++ b/src/calibre/utils/podofo/fonts.cpp @@ -8,6 +8,7 @@ #include "global.h" #include #include +#include using namespace pdf; @@ -17,7 +18,7 @@ ref_as_tuple(const PdfReference &ref) { return Py_BuildValue("kk", num, generation); } -static inline const PdfObject* +static inline PdfObject* get_font_file(const PdfObject *descriptor) { PdfObject *ff = descriptor->GetIndirectKey("FontFile"); if (!ff) ff = descriptor->GetIndirectKey("FontFile2"); @@ -25,7 +26,7 @@ get_font_file(const PdfObject *descriptor) { return ff; } -static void +static inline void remove_font(PdfVecObjects &objects, PdfObject *font) { PdfObject *descriptor = font->GetIndirectKey("FontDescriptor"); if (descriptor) { @@ -36,6 +37,40 @@ remove_font(PdfVecObjects &objects, PdfObject *font) { delete objects.RemoveObject(font->Reference()); } +static inline uint64_t +ref_as_integer(pdf_objnum num, pdf_gennum gen) { + return static_cast(num) | (static_cast(gen) << 32); +} + +static inline uint64_t +ref_as_integer(const PdfReference &ref) { return ref_as_integer(ref.ObjectNumber(), ref.GenerationNumber()); } + + +static inline void +replace_font_references(PDFDoc *self, std::unordered_map &ref_map) { + int num_pages = self->doc->GetPageCount(); + for (int i = 0; i < num_pages; i++) { + PdfPage *page = self->doc->GetPage(i); + PdfDictionary &resources = page->GetResources()->GetDictionary(); + PdfObject* f = resources.GetKey("Font"); + if (f && f->IsDictionary()) { + const PdfDictionary &font = f->GetDictionary(); + PdfDictionary new_font = PdfDictionary(font); + for (auto &k : font.GetKeys()) { + if (k.second->IsReference()) { + uint64_t key = ref_as_integer(k.second->GetReference()), r; + try { + r = ref_map.at(key); + } catch (const std::out_of_range &err) { continue; } + PdfReference new_ref(static_cast(r & 0xffffffff), r >> 32); + new_font.AddKey(k.first.GetName(), new_ref); + } + } + resources.AddKey("Font", new_font); + } + } +} + static bool used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) { PdfContentsTokenizer tokenizer(page); @@ -91,6 +126,34 @@ convert_w_array(const PdfArray &w) { return ans.release(); } +#if PY_MAJOR_VERSION > 2 +#define py_as_long_long PyLong_AsLongLong +#else +static inline long long +py_as_long_long(const PyObject *x) { + if (PyInt_Check(x)) return PyInt_AS_LONG(x); + return PyLong_AsLongLong(x); +} +#endif + +static void +convert_w_array(PyObject *src, PdfArray &dest) { + for (Py_ssize_t i = 0; i < PyList_GET_SIZE(src); i++) { + PyObject *item = PyList_GET_ITEM(src, i); + if (PyFloat_Check(item)) { + dest.push_back(PdfObject(PyFloat_AS_DOUBLE(item))); + } else if (PyList_Check(item)) { + PdfArray sub; + convert_w_array(item, sub); + dest.push_back(sub); + } else { + pdf_int64 val = py_as_long_long(item); + if (val == -1 && PyErr_Occurred()) { PyErr_Print(); continue; } + dest.push_back(PdfObject(val)); + } + } +} + extern "C" { PyObject* list_fonts(PDFDoc *self, PyObject *args) { @@ -195,4 +258,57 @@ remove_fonts(PDFDoc *self, PyObject *args) { Py_RETURN_NONE; } +PyObject* +merge_fonts(PDFDoc *self, PyObject *args) { + PyObject *items, *replacements; + if (!PyArg_ParseTuple(args, "O!O!", &PyTuple_Type, &items, &PyDict_Type, &replacements)) return NULL; + std::unordered_map ref_map; + PdfVecObjects &objects = self->doc->GetObjects(); + PyObject *key, *value; + Py_ssize_t pos = 0; + size_t c = 0; + while (PyDict_Next(replacements, &pos, &key, &value)) { + c++; + unsigned long num, gen; + if (!PyArg_ParseTuple(key, "kk", &num, &gen)) return NULL; + uint64_t k = ref_as_integer(num, gen); + PdfReference ref(num, gen); + PdfObject *font = objects.GetObject(ref); + if (font) remove_font(objects, font); + if (!PyArg_ParseTuple(value, "kk", &num, &gen)) return NULL; + uint64_t v = ref_as_integer(num, gen); + ref_map[k] = v; + } + if (c > 0) replace_font_references(self, ref_map); + + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(items); i++) { + long num, gen; + PyObject *W, *W2; + const char *data; + Py_ssize_t sz; + if (!PyArg_ParseTuple(PyTuple_GET_ITEM(items, i), "(ll)O!O!s#", &num, &gen, &PyList_Type, &W, &PyList_Type, &W2, &data, &sz)) return NULL; + PdfReference ref(num, gen); + PdfObject *font = objects.GetObject(ref); + if (font) { + if (PyObject_IsTrue(W)) { + PdfArray w; + convert_w_array(W, w); + font->GetDictionary().AddKey("W", w); + } + if (PyObject_IsTrue(W2)) { + PdfArray w; + convert_w_array(W2, w); + font->GetDictionary().AddKey("W2", w); + } + const PdfObject *descriptor = font->GetIndirectKey("FontDescriptor"); + if (descriptor) { + PdfObject *ff = get_font_file(descriptor); + PdfStream *stream = ff->GetStream(); + stream->Set(data, sz); + } + } + } + Py_RETURN_NONE; +} + } diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h index 2063d67ed6..7925ae73be 100644 --- a/src/calibre/utils/podofo/global.h +++ b/src/calibre/utils/podofo/global.h @@ -88,5 +88,6 @@ extern "C" { PyObject* list_fonts(PDFDoc*, PyObject*); PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args); PyObject* remove_fonts(PDFDoc *self, PyObject *args); +PyObject* merge_fonts(PDFDoc *self, PyObject *args); } }