Remove unused fonts from generated PDF

This commit is contained in:
Kovid Goyal 2019-07-14 20:01:16 +05:30
parent 24a344546e
commit 6caf7cf77a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 68 additions and 9 deletions

View File

@ -31,7 +31,9 @@ from calibre.ebooks.pdf.render.serialize import PDFStream
from calibre.gui2 import setup_unix_signals
from calibre.gui2.webengine import secure_webengine
from calibre.utils.logging import default_log
from calibre.utils.podofo import get_podofo, set_metadata_implementation
from calibre.utils.podofo import (
get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, map, range, unicode_type
from polyglot.urllib import urlparse
@ -563,11 +565,14 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
add_toc(PDFOutlineRoot(pdf_doc), toc)
report_progress(0.75, _('Added links to PDF content'))
# TODO: Remove unused fonts
# TODO: Remove duplicate fonts
# TODO: Subset and embed fonts before rendering PDF
# TODO: Support for mathematics
num_removed = remove_unused_fonts(pdf_doc)
if num_removed:
log('Removed', num_removed, 'unused fonts')
if cover_data:
add_cover(pdf_doc, cover_data, page_layout, opts)

View File

@ -10,7 +10,7 @@ from calibre.constants import plugins, preferred_encoding
from calibre.ebooks.metadata import authors_to_string
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
from polyglot.builtins import range, unicode_type
from polyglot.builtins import range, unicode_type, iteritems
def get_podofo():
@ -154,6 +154,23 @@ def list_fonts(pdf_doc):
return ref_map
def remove_unused_fonts(pdf_doc):
    """Remove every font that list_fonts() reports as unused from pdf_doc.

    All unused font references are handed to pdf_doc.remove_fonts() in a
    single tuple. The return value counts only those removed fonts whose
    'StreamRef' entry is truthy (presumably the fonts that actually carried
    an embedded font file -- confirm against list_fonts).
    """
    fonts = list_fonts(pdf_doc)
    unused_refs = tuple(
        ref for ref, info in iteritems(fonts) if not info['used']
    )
    pdf_doc.remove_fonts(unused_refs)
    # Fonts without a stream reference are removed too, but not counted.
    return sum(1 for ref in unused_refs if fonts[ref]['StreamRef'])
def test_remove_unused_fonts(src):
    """Strip unused fonts from the PDF at src and save a modified copy.

    The copy is written next to the input as '<name>-removed.pdf', where
    <name> is src without its file extension, and the destination path is
    printed.
    """
    import os  # local import so this helper does not rely on module-level imports

    podofo = get_podofo()
    p = podofo.PDFDoc()
    p.open(src)
    remove_unused_fonts(p)
    # splitext() only strips an extension from the final path component.
    # rpartition('.') produced '-removed.pdf' for extension-less inputs and
    # truncated the path when only a directory name contained a dot.
    dest = os.path.splitext(src)[0] + '-removed.pdf'
    p.save(dest)
    print('Modified pdf saved to:', dest)
def test_list_fonts(src):
podofo = get_podofo()
p = podofo.PDFDoc()

View File

@ -715,12 +715,15 @@ static PyMethodDef PDFDoc_methods[] = {
{"alter_links", (PyCFunction)PDFDoc_alter_links, METH_VARARGS,
"alter_links() -> Change links in the document."
},
{"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
{"list_fonts", (PyCFunction)list_fonts, METH_NOARGS,
"list_fonts() -> Get list of fonts in document"
},
{"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,
"used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages"
},
{"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
"remove_fonts() -> Remove the specified font objects."
},
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
},

View File

@ -17,9 +17,27 @@ ref_as_tuple(const PdfReference &ref) {
return Py_BuildValue("kk", num, generation);
}
// Look up the font file object referenced by a font descriptor. The keys
// FontFile, FontFile2 and FontFile3 are tried in that order and the first
// one that resolves is returned; NULL is returned when none of them does.
static inline const PdfObject*
get_font_file(const PdfObject *descriptor) {
    static const char* const font_file_keys[] = {"FontFile", "FontFile2", "FontFile3"};
    for (const char *key : font_file_keys) {
        PdfObject *candidate = descriptor->GetIndirectKey(key);
        if (candidate) return candidate;
    }
    return NULL;
}
// Remove a font object from `objects`, together with its FontDescriptor and
// the descriptor's font file object when those exist. Each pointer returned
// by RemoveObject() is deleted immediately.
static void
remove_font(PdfVecObjects &objects, PdfObject *font) {
PdfObject *descriptor = font->GetIndirectKey("FontDescriptor");
if (descriptor) {
// The font file is found through the descriptor, so it is removed first,
// while the descriptor is still alive.
const PdfObject *ff = get_font_file(descriptor);
if (ff) delete objects.RemoveObject(ff->Reference());
delete objects.RemoveObject(descriptor->Reference());
}
// The font itself goes last: the lookups above read from it.
delete objects.RemoveObject(font->Reference());
}
static bool
used_fonts_in_page(PdfPage *page, PyObject *ans) {
used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
PdfContentsTokenizer tokenizer(page);
bool in_text_block = false;
const char* token = NULL;
@ -73,9 +91,7 @@ list_fonts(PDFDoc *self, PyObject *args) {
long long stream_len = 0;
pyunique_ptr descendant_font, stream_ref;
if (descriptor) {
const PdfObject *ff = descriptor->GetIndirectKey("FontFile");
if (!ff) ff = descriptor->GetIndirectKey("FontFile2");
if (!ff) ff = descriptor->GetIndirectKey("FontFile3");
const PdfObject *ff = get_font_file(descriptor);
if (ff) {
stream_ref.reset(ref_as_tuple(ff->Reference()));
if (!stream_ref) return NULL;
@ -119,10 +135,27 @@ used_fonts_in_page_range(PDFDoc *self, PyObject *args) {
for (int i = first - 1; i < last; i++) {
try {
PdfPage *page = self->doc->GetPage(i);
if (!used_fonts_in_page(page, ans.get())) return NULL;
if (!used_fonts_in_page(page, i, ans.get())) return NULL;
} catch (const PdfError &err) { continue; }
}
return ans.release();
}
// Python-callable: remove_fonts(refs). `refs` must be a tuple whose items
// are (object number, generation) pairs. Every reference that resolves to an
// object in the document is passed to remove_font(); references that do not
// resolve are skipped. Returns None, or NULL with a Python exception set when
// argument parsing fails.
PyObject*
remove_fonts(PDFDoc *self, PyObject *args) {
    PyObject *font_refs;
    if (!PyArg_ParseTuple(args, "O!", &PyTuple_Type, &font_refs)) return NULL;

    PdfVecObjects &objects = self->doc->GetObjects();
    const Py_ssize_t count = PyTuple_GET_SIZE(font_refs);
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        unsigned long obj_num, generation;
        PyObject *item = PyTuple_GET_ITEM(font_refs, idx);
        if (!PyArg_ParseTuple(item, "kk", &obj_num, &generation)) return NULL;

        PdfReference target(obj_num, generation);
        PdfObject *font = objects.GetObject(target);
        if (!font) continue;  // nothing in the document under this reference
        remove_font(objects, font);
    }
    Py_RETURN_NONE;
}
}

View File

@ -61,5 +61,6 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) {
extern "C" {
PyObject* list_fonts(PDFDoc*, PyObject*);
PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
PyObject* remove_fonts(PDFDoc *self, PyObject *args);
}
}