diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py
index 3c10500167..d42188de83 100644
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@@ -36,7 +36,7 @@ from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
from calibre.utils.logging import default_log
from calibre.utils.podofo import (
- get_podofo, remove_unused_fonts, set_metadata_implementation
+ dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type
@@ -838,6 +838,9 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
report_progress(0.75, _('Added links to PDF content'))
merge_fonts(pdf_doc)
+ num_removed = dedup_type3_fonts(pdf_doc)
+ if num_removed:
+ log('Removed', num_removed, 'unused Type3 glyphs')
# TODO: Support for mathematics
diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py
index 2431c47d06..c036eb8a23 100644
--- a/src/calibre/utils/podofo/__init__.py
+++ b/src/calibre/utils/podofo/__init__.py
@@ -144,6 +144,20 @@ def test_remove_unused_fonts(src):
print('Modified pdf saved to:', dest)
+def dedup_type3_fonts(pdf_doc):
+    '''
+    Invoke the ``dedup_type3_fonts()`` method of ``pdf_doc`` and return its
+    result unchanged.
+    '''
+    removed = pdf_doc.dedup_type3_fonts()
+    return removed
+
+
+def test_dedup_type3_fonts(src):
+    '''
+    Developer helper: open the PDF at ``src``, run dedup_type3_fonts() on it,
+    save the result next to the input with a ``-removed.pdf`` suffix and print
+    the reported number of removed glyphs together with the output path.
+    '''
+    import os
+    podofo = get_podofo()
+    p = podofo.PDFDoc()
+    p.open(src)
+    num = dedup_type3_fonts(p)
+    # splitext() only strips an extension from the last path component, so a
+    # path without any dot, or with a dot only in a directory name, keeps its
+    # full stem instead of being truncated.
+    dest = os.path.splitext(src)[0] + '-removed.pdf'
+    p.save(dest)
+    print('Modified pdf with {} glyphs removed saved to:'.format(num), dest)
+
+
def test_list_fonts(src):
podofo = get_podofo()
p = podofo.PDFDoc()
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index 1b54113a7d..7de327ebfe 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -753,6 +753,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"merge_fonts", (PyCFunction)py_merge_fonts, METH_VARARGS,
"merge_fonts() -> Merge the specified fonts."
},
+ {"dedup_type3_fonts", (PyCFunction)py_dedup_type3_fonts, METH_VARARGS,
+ "dedup_type3_fonts() -> De-duplicate repeated glyphs in Type3 fonts"
+ },
{"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
"delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
},
diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp
index a6fe586efb..f12a0b46a9 100644
--- a/src/calibre/utils/podofo/fonts.cpp
+++ b/src/calibre/utils/podofo/fonts.cpp
@@ -356,6 +356,103 @@ merge_fonts(PDFDoc *self, PyObject *args) {
Py_RETURN_NONE;
}
+// Holds a copy of the stream data of one glyph procedure object together with
+// the reference of the object it was read from, so that glyph procedures can
+// be used as hash map keys and compared by content. Movable, not copyable.
+class CharProc {
+    char *buf; pdf_long sz;
+    PdfReference ref;
+    std::size_t precomputed_hash;
+    // Copying is disabled: buf is released in the destructor, so two objects
+    // sharing the same pointer would free it twice.
+    CharProc( const CharProc & ) = delete;
+    CharProc & operator=( const CharProc & ) = delete;
+
+    public:
+        // reference: reference of the object o; o: the object whose stream is copied.
+        CharProc(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), ref(reference), precomputed_hash(0) {
+            const PdfStream *stream = o->GetStream();
+            // Stay empty (buf == NULL, sz == 0) when the object has no stream
+            if (stream) stream->GetFilteredCopy(&buf, &sz);
+            // Only the length is hashed; operator== does the byte comparison
+            precomputed_hash = std::hash<pdf_long>()(sz);
+        }
+        CharProc(CharProc &&other) noexcept :
+            buf(other.buf), sz(other.sz), ref(other.ref), precomputed_hash(other.precomputed_hash) {
+            // Leave the moved-from object empty so buf and sz stay consistent
+            other.buf = NULL; other.sz = 0;
+        }
+        CharProc& operator=(CharProc &&other) noexcept {
+            // Self-assignment must not free the buffer that is about to be kept
+            if (this != &other) {
+                if (buf) podofo_free(buf);
+                buf = other.buf; sz = other.sz; ref = other.ref; precomputed_hash = other.precomputed_hash;
+                other.buf = NULL; other.sz = 0;
+            }
+            return *this;
+        }
+        ~CharProc() noexcept { if (buf) podofo_free(buf); buf = NULL; }
+        // Equal when both buffers have the same length and the same bytes
+        bool operator==(const CharProc &other) const noexcept {
+            if (other.sz != sz) return false;
+            // Never hand NULL or empty buffers to memcmp
+            if (sz <= 0 || !buf || !other.buf) return buf == other.buf || sz <= 0;
+            return memcmp(buf, other.buf, sz) == 0;
+        }
+        std::size_t hash() const noexcept { return precomputed_hash; }
+        const PdfReference& reference() const noexcept { return ref; }
+};
+
+// Hash functor for CharProc keys: returns the value reported by CharProc::hash().
+struct CharProcHasher {
+    std::size_t operator()(const CharProc &char_proc) const {
+        return char_proc.hash();
+    }
+};
+
+// Maps a glyph procedure (compared by content) to the references of the other
+// objects found with identical content.
+typedef std::unordered_map<CharProc, std::vector<PdfReference>, CharProcHasher> char_proc_reference_map;
+
+// Remove glyph procedure objects of Type3 fonts whose stream data is identical
+// to that of an already seen glyph procedure, and point the CharProcs entries
+// of all Type3 fonts at the surviving object instead.
+//
+// self: the document wrapper whose objects are scanned and modified in place.
+// args: unused.
+// Returns a Python integer holding the number of objects removed.
+static PyObject*
+dedup_type3_fonts(PDFDoc *self, PyObject *args) {
+    unsigned long count = 0;
+    unordered_reference_set all_type3_fonts;
+    char_proc_reference_map cp_map;
+
+    PdfVecObjects &objects = self->doc->GetObjects();
+
+    // Pass 1: group the glyph procedures of every Type3 font by stream content
+    for (auto &k : objects) {
+        // NOTE(review): GetDictionary() is only expected to work on dictionary
+        // objects, and the object list also holds other kinds - confirm
+        if (!k->IsDictionary()) continue;
+        const PdfDictionary &dict = k->GetDictionary();
+        if (!dictionary_has_key_name(dict, PdfName::KeyType, "Font")) continue;
+        const PdfObject *subtype = dict.GetKey(PdfName::KeySubtype);
+        if (!subtype || !subtype->IsName() || subtype->GetName().GetName() != "Type3") continue;
+        // Same lookup as the rewrite pass below, so both passes agree on
+        // which fonts have a usable CharProcs dictionary
+        PdfObject *char_procs = k->GetIndirectKey("CharProcs");
+        if (!char_procs || !char_procs->IsDictionary()) continue;
+        all_type3_fonts.insert(k->Reference());
+        for (auto &x : char_procs->GetDictionary().GetKeys()) {
+            if (!x.second->IsReference()) continue;
+            const PdfReference &ref = x.second->GetReference();
+            const PdfObject *cpobj = objects.GetObject(ref);
+            if (!cpobj || !cpobj->HasStream()) continue;
+            CharProc cp(ref, cpobj);
+            auto it = cp_map.find(cp);
+            if (it == cp_map.end()) {
+                // First occurrence of this content: it becomes the canonical object
+                std::vector<PdfReference> vals;
+                cp_map.insert(std::make_pair(std::move(cp), std::move(vals)));
+            } else (*it).second.push_back(ref);
+        }
+    }
+
+    // Pass 2: delete every duplicate and remember which object replaces it.
+    // NOTE(review): assumes unordered_reference_set is a std::unordered_set of
+    // PdfReference, so its hasher type can be reused for this map - confirm
+    std::unordered_map<PdfReference, PdfReference, unordered_reference_set::hasher> ref_map;
+    for (auto &x : cp_map) {
+        const PdfReference &canonical_ref = x.first.reference();
+        for (auto &ref : x.second) {
+            // The canonical object may be listed again when it is used by more
+            // than one glyph name or font
+            if (ref == canonical_ref) continue;
+            // A duplicate referenced several times is removed and counted once
+            if (!ref_map.insert(std::make_pair(ref, canonical_ref)).second) continue;
+            delete objects.RemoveObject(ref);
+            count++;
+        }
+    }
+
+    // Pass 3: rewrite the CharProcs dictionaries that pointed at removed objects
+    if (count > 0) {
+        for (auto &ref : all_type3_fonts) {
+            PdfObject *font = objects.GetObject(ref);
+            if (!font) continue;
+            PdfObject *char_procs = font->GetIndirectKey("CharProcs");
+            if (!char_procs || !char_procs->IsDictionary()) continue;
+            const PdfDictionary &dict = char_procs->GetDictionary();
+            // Edit a copy so the dictionary being iterated is not modified
+            PdfDictionary new_dict = PdfDictionary(dict);
+            bool changed = false;
+            for (auto &k : dict.GetKeys()) {
+                if (!k.second->IsReference()) continue;
+                auto it = ref_map.find(k.second->GetReference());
+                if (it != ref_map.end()) {
+                    new_dict.AddKey(k.first, (*it).second);
+                    changed = true;
+                }
+            }
+            if (changed) font->GetDictionary().AddKey("CharProcs", new_dict);
+        }
+    }
+    return Py_BuildValue("k", count);
+}
+
PYWRAP(list_fonts)
PYWRAP(merge_fonts)
PYWRAP(remove_unused_fonts)
+PYWRAP(dedup_type3_fonts)
diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h
index 3f0638d1d2..6e24796cbb 100644
--- a/src/calibre/utils/podofo/global.h
+++ b/src/calibre/utils/podofo/global.h
@@ -99,5 +99,6 @@ extern "C" {
PyObject* py_list_fonts(PDFDoc*, PyObject*);
PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args);
PyObject* py_merge_fonts(PDFDoc *self, PyObject *args);
+PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
}
}