diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py
index 3c10500167..d42188de83 100644
--- a/src/calibre/ebooks/pdf/html_writer.py
+++ b/src/calibre/ebooks/pdf/html_writer.py
@@ -36,7 +36,7 @@ from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
 from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
 from calibre.utils.logging import default_log
 from calibre.utils.podofo import (
-    get_podofo, remove_unused_fonts, set_metadata_implementation
+    dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
 )
 from calibre.utils.short_uuid import uuid4
 from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type
@@ -838,6 +838,10 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
     report_progress(0.75, _('Added links to PDF content'))
 
     merge_fonts(pdf_doc)
+    # Share a single copy of identical glyphs among the Type3 fonts
+    num_removed = dedup_type3_fonts(pdf_doc)
+    if num_removed:
+        log('Removed', num_removed, 'duplicated Type3 glyphs')
 
     # TODO: Support for mathematics
 
diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py
index 2431c47d06..c036eb8a23 100644
--- a/src/calibre/utils/podofo/__init__.py
+++ b/src/calibre/utils/podofo/__init__.py
@@ -144,6 +144,24 @@ def test_remove_unused_fonts(src):
     print('Modified pdf saved to:', dest)
 
 
+def dedup_type3_fonts(pdf_doc):
+    ''' Merge identical glyphs in the Type3 fonts of pdf_doc, returning the
+    number of glyph objects that were removed '''
+    return pdf_doc.dedup_type3_fonts()
+
+
+def test_dedup_type3_fonts(src):
+    # Developer helper: de-duplicate the PDF at path src and save the result
+    # next to it with a -removed.pdf suffix
+    podofo = get_podofo()
+    p = podofo.PDFDoc()
+    p.open(src)
+    num = dedup_type3_fonts(p)
+    dest = src.rpartition('.')[0] + '-removed.pdf'
+    p.save(dest)
+    print('Modified pdf with {} glyphs removed saved to:'.format(num), dest)
+
+
 def test_list_fonts(src):
     podofo = get_podofo()
     p = podofo.PDFDoc()
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index 1b54113a7d..7de327ebfe 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -753,6 +753,9 @@ static PyMethodDef PDFDoc_methods[] = {
     {"merge_fonts", (PyCFunction)py_merge_fonts, METH_VARARGS,
      "merge_fonts() -> Merge the specified fonts."
     },
+    {"dedup_type3_fonts", (PyCFunction)py_dedup_type3_fonts, METH_VARARGS,
+     "dedup_type3_fonts() -> De-duplicate repeated glyphs in Type3 fonts"
+    },
     {"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
      "delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
     },
diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp
index a6fe586efb..f12a0b46a9 100644
--- a/src/calibre/utils/podofo/fonts.cpp
+++ b/src/calibre/utils/podofo/fonts.cpp
@@ -356,6 +356,133 @@ merge_fonts(PDFDoc *self, PyObject *args) {
     Py_RETURN_NONE;
 }
 
+// Holds a filtered copy of the content stream of one Type3 glyph procedure
+// (an entry of a font's /CharProcs dictionary) so that glyphs with identical
+// content can be detected by using it as a hash-map key. Move-only.
+class CharProc {
+    char *buf; pdf_long sz;
+    PdfReference ref;
+    std::size_t precomputed_hash;
+    // Copying is disabled: declared private and never defined
+    CharProc( const CharProc & ) ;
+    CharProc & operator=( const CharProc & ) ;
+
+    public:
+        // o must have a stream, callers are expected to check HasStream()
+        CharProc(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), ref(reference), precomputed_hash(0) {
+            const PdfStream *stream = o->GetStream();
+            stream->GetFilteredCopy(&buf, &sz);
+            // Only the length is hashed: streams of equal length collide and
+            // are told apart by operator==
+            precomputed_hash = std::hash<pdf_long>()(sz);
+        }
+        CharProc(CharProc &&other) noexcept :
+            buf(other.buf), sz(other.sz), ref(other.ref), precomputed_hash(other.precomputed_hash) {
+            other.buf = NULL;
+        }
+        CharProc& operator=(CharProc &&other) noexcept {
+            // A self-move must not free the buffer that is about to be kept
+            if (this != &other) {
+                if (buf) podofo_free(buf);
+                buf = other.buf; other.buf = NULL; sz = other.sz; ref = other.ref; precomputed_hash = other.precomputed_hash;
+            }
+            return *this;
+        }
+        ~CharProc() noexcept { if (buf) podofo_free(buf); buf = NULL; }
+        bool operator==(const CharProc &other) const noexcept {
+            // The sz == 0 test keeps memcmp() away from NULL buffers
+            return other.sz == sz && (sz == 0 || memcmp(buf, other.buf, sz) == 0);
+        }
+        std::size_t hash() const noexcept { return precomputed_hash; }
+        const PdfReference& reference() const noexcept { return ref; }
+};
+
+// Hash functor that lets CharProc be the key of an unordered container
+struct CharProcHasher {
+    std::size_t operator()(const CharProc& k) const { return k.hash(); }
+};
+
+// Maps the first glyph seen with some content to the references of the
+// glyphs found later that have the same content
+typedef std::unordered_map<CharProc, std::vector<PdfReference>, CharProcHasher> char_proc_reference_map;
+
+// Make the Type3 fonts of the document share one object per distinct glyph
+// procedure: the first occurrence is kept, later copies are removed from the
+// document and the /CharProcs entries pointing at them are re-targeted.
+// Returns a Python int, the number of glyph objects removed.
+static PyObject*
+dedup_type3_fonts(PDFDoc *self, PyObject *args) {
+    unsigned long count = 0;
+    unordered_reference_set all_type3_fonts;
+    char_proc_reference_map cp_map;
+
+    PdfVecObjects &objects = self->doc->GetObjects();
+    for (auto &k : objects) {
+        // GetDictionary() must only be called on dictionary objects
+        if (!k->IsDictionary()) continue;
+        const PdfDictionary &dict = k->GetDictionary();
+        if (!dictionary_has_key_name(dict, PdfName::KeyType, "Font")) continue;
+        const PdfObject *subtype = dict.GetKey(PdfName::KeySubtype);
+        if (!subtype || !subtype->IsName() || subtype->GetName().GetName() != "Type3") continue;
+        all_type3_fonts.insert(k->Reference());
+        // GetIndirectKey() is used so that a /CharProcs stored as a separate
+        // object is handled too, as in the rewrite loop below
+        const PdfObject *char_procs = k->GetIndirectKey("CharProcs");
+        if (!char_procs || !char_procs->IsDictionary()) continue;
+        for (auto &x : char_procs->GetDictionary().GetKeys()) {
+            if (!x.second->IsReference()) continue;
+            const PdfReference &ref = x.second->GetReference();
+            const PdfObject *cpobj = objects.GetObject(ref);
+            if (!cpobj || !cpobj->HasStream()) continue;
+            CharProc cp(ref, cpobj);
+            auto it = cp_map.find(cp);
+            if (it == cp_map.end()) {
+                // First glyph with this content, it becomes the canonical copy
+                std::vector<PdfReference> vals;
+                cp_map.insert(std::make_pair(std::move(cp), std::move(vals)));
+            } else (*it).second.push_back(ref);
+        }
+    }
+    // Maps the reference of every removed glyph to its canonical replacement.
+    // NOTE(review): assumes unordered_reference_set is a std::unordered_set of
+    // PdfReference, so that its hasher type can be reused here -- confirm
+    std::unordered_map<PdfReference, PdfReference, unordered_reference_set::hasher> ref_map;
+    for (auto &x : cp_map) {
+        const PdfReference &canonical_ref = x.first.reference();
+        for (auto &ref : x.second) {
+            if (ref == canonical_ref) continue;
+            ref_map[ref] = canonical_ref;
+            // The same duplicate can be listed more than once (glyph object
+            // used by several fonts): delete and count it only once
+            PdfObject *removed = objects.RemoveObject(ref);
+            if (removed) { delete removed; count++; }
+        }
+    }
+    if (count > 0) {
+        for (auto &ref : all_type3_fonts) {
+            PdfObject *font = objects.GetObject(ref);
+            if (!font) continue;
+            const PdfObject *char_procs = font->GetIndirectKey("CharProcs");
+            if (!char_procs || !char_procs->IsDictionary()) continue;
+            const PdfDictionary &dict = char_procs->GetDictionary();
+            // Entries are replaced in a copy while the original is iterated
+            PdfDictionary new_dict = PdfDictionary(dict);
+            bool changed = false;
+            for (auto &k : dict.GetKeys()) {
+                if (!k.second->IsReference()) continue;
+                auto it = ref_map.find(k.second->GetReference());
+                if (it != ref_map.end()) {
+                    new_dict.AddKey(k.first, (*it).second);
+                    changed = true;
+                }
+            }
+            if (changed) font->GetDictionary().AddKey("CharProcs", new_dict);
+        }
+    }
+    return Py_BuildValue("k", count);
+}
+
 PYWRAP(list_fonts)
 PYWRAP(merge_fonts)
 PYWRAP(remove_unused_fonts)
+PYWRAP(dedup_type3_fonts)
diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h
index 3f0638d1d2..6e24796cbb 100644
--- a/src/calibre/utils/podofo/global.h
+++ b/src/calibre/utils/podofo/global.h
@@ -99,5 +99,7 @@ extern "C" {
 PyObject* py_list_fonts(PDFDoc*, PyObject*);
 PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args);
 PyObject* py_merge_fonts(PDFDoc *self, PyObject *args);
+// Registered as the PDFDoc.dedup_type3_fonts() method in doc.cpp
+PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
 }
 }