From 6caf7cf77a8851ee98af129bde14ce552813b3d3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 14 Jul 2019 20:01:16 +0530 Subject: [PATCH] Remove unused fonts from generated PDF --- src/calibre/ebooks/pdf/html_writer.py | 9 ++++-- src/calibre/utils/podofo/__init__.py | 19 +++++++++++- src/calibre/utils/podofo/doc.cpp | 5 +++- src/calibre/utils/podofo/fonts.cpp | 43 +++++++++++++++++++++++---- src/calibre/utils/podofo/global.h | 1 + 5 files changed, 68 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py index b68352041b..6f2ed6726f 100644 --- a/src/calibre/ebooks/pdf/html_writer.py +++ b/src/calibre/ebooks/pdf/html_writer.py @@ -31,7 +31,9 @@ from calibre.ebooks.pdf.render.serialize import PDFStream from calibre.gui2 import setup_unix_signals from calibre.gui2.webengine import secure_webengine from calibre.utils.logging import default_log -from calibre.utils.podofo import get_podofo, set_metadata_implementation +from calibre.utils.podofo import ( + get_podofo, remove_unused_fonts, set_metadata_implementation +) from calibre.utils.short_uuid import uuid4 from polyglot.builtins import iteritems, map, range, unicode_type from polyglot.urllib import urlparse @@ -563,11 +565,14 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co add_toc(PDFOutlineRoot(pdf_doc), toc) report_progress(0.75, _('Added links to PDF content')) - # TODO: Remove unused fonts # TODO: Remove duplicate fonts # TODO: Subset and embed fonts before rendering PDF # TODO: Support for mathematics + num_removed = remove_unused_fonts(pdf_doc) + if num_removed: + log('Removed', num_removed, 'unused fonts') + if cover_data: add_cover(pdf_doc, cover_data, page_layout, opts) diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py index a53d7123f8..a57d5c60ff 100644 --- a/src/calibre/utils/podofo/__init__.py +++ b/src/calibre/utils/podofo/__init__.py @@ -10,7 +10,7 @@ from calibre.constants import plugins, preferred_encoding from calibre.ebooks.metadata import authors_to_string from calibre.ptempfile import TemporaryDirectory from calibre.utils.ipc.simple_worker import WorkerError, fork_job -from polyglot.builtins import range, unicode_type +from polyglot.builtins import range, unicode_type, iteritems def get_podofo(): @@ -154,6 +154,23 @@ def list_fonts(pdf_doc): return ref_map +def remove_unused_fonts(pdf_doc): + font_ref_map = list_fonts(pdf_doc) + unused = tuple(ref for ref, font in iteritems(font_ref_map) if not font['used']) + pdf_doc.remove_fonts(unused) + return len(tuple(f for f in unused if font_ref_map[f]['StreamRef'])) + + +def test_remove_unused_fonts(src): + podofo = get_podofo() + p = podofo.PDFDoc() + p.open(src) + remove_unused_fonts(p) + dest = src.rpartition('.')[0] + '-removed.pdf' + p.save(dest) + print('Modified pdf saved to:', dest) + + def test_list_fonts(src): podofo = get_podofo() p = podofo.PDFDoc() diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp index 3cd4dc9fc9..1e99abfb62 100644 --- a/src/calibre/utils/podofo/doc.cpp +++ b/src/calibre/utils/podofo/doc.cpp @@ -715,12 +715,15 @@ static PyMethodDef PDFDoc_methods[] = { {"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS, "alter_links() -> Change links in the document." }, - {"list_fonts", (PyCFunction)list_fonts, METH_VARARGS, + {"list_fonts", (PyCFunction)list_fonts, METH_NOARGS, "list_fonts() -> Get list of fonts in document" }, {"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS, "used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages" }, + {"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS, + "remove_fonts() -> Remove the specified font objects." + }, {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS, "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)." }, diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp index 0217d993d0..62b125b204 100644 --- a/src/calibre/utils/podofo/fonts.cpp +++ b/src/calibre/utils/podofo/fonts.cpp @@ -17,9 +17,27 @@ ref_as_tuple(const PdfReference &ref) { return Py_BuildValue("kk", num, generation); } +static inline const PdfObject* +get_font_file(const PdfObject *descriptor) { + PdfObject *ff = descriptor->GetIndirectKey("FontFile"); + if (!ff) ff = descriptor->GetIndirectKey("FontFile2"); + if (!ff) ff = descriptor->GetIndirectKey("FontFile3"); + return ff; +} + +static void +remove_font(PdfVecObjects &objects, PdfObject *font) { + PdfObject *descriptor = font->GetIndirectKey("FontDescriptor"); + if (descriptor) { + const PdfObject *ff = get_font_file(descriptor); + if (ff) delete objects.RemoveObject(ff->Reference()); + delete objects.RemoveObject(descriptor->Reference()); + } + delete objects.RemoveObject(font->Reference()); +} static bool -used_fonts_in_page(PdfPage *page, PyObject *ans) { +used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) { PdfContentsTokenizer tokenizer(page); bool in_text_block = false; const char* token = NULL; @@ -73,9 +91,7 @@ list_fonts(PDFDoc *self, PyObject *args) { long long stream_len = 0; pyunique_ptr descendant_font, stream_ref; if (descriptor) { - const PdfObject *ff = descriptor->GetIndirectKey("FontFile"); - if (!ff) ff = descriptor->GetIndirectKey("FontFile2"); - if (!ff) ff = descriptor->GetIndirectKey("FontFile3"); + const PdfObject *ff = get_font_file(descriptor); if (ff) { stream_ref.reset(ref_as_tuple(ff->Reference())); if (!stream_ref) return NULL; @@ -119,10 +135,27 @@ used_fonts_in_page_range(PDFDoc *self, PyObject *args) { for (int i = first - 1; i < last; i++) { try { PdfPage *page = self->doc->GetPage(i); - if (!used_fonts_in_page(page, ans.get())) return NULL; + if (!used_fonts_in_page(page, i, ans.get())) return NULL; } catch (const PdfError &err) { continue; } } return ans.release(); } +PyObject* +remove_fonts(PDFDoc *self, PyObject *args) { + PyObject *fonts; + if (!PyArg_ParseTuple(args, "O!", &PyTuple_Type, &fonts)) return NULL; + PdfVecObjects &objects = self->doc->GetObjects(); + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(fonts); i++) { + unsigned long num, gen; + if (!PyArg_ParseTuple(PyTuple_GET_ITEM(fonts, i), "kk", &num, &gen)) return NULL; + PdfReference ref(num, gen); + PdfObject *font = objects.GetObject(ref); + if (font) { + remove_font(objects, font); + } + } + Py_RETURN_NONE; +} + } diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h index 3a36fea3ca..ee69492a05 100644 --- a/src/calibre/utils/podofo/global.h +++ b/src/calibre/utils/podofo/global.h @@ -61,5 +61,6 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) { extern "C" { PyObject* list_fonts(PDFDoc*, PyObject*); PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args); +PyObject* remove_fonts(PDFDoc *self, PyObject *args); } }