Deduplicate Type3 glyph definitions

This commit is contained in:
Kovid Goyal 2019-07-25 20:54:35 +05:30
parent 32d483b3ce
commit 66b7037cd2
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 119 additions and 1 deletions

View File

@ -36,7 +36,7 @@ from calibre.utils.fonts.sfnt.container import Sfnt, UnsupportedFont
from calibre.utils.fonts.sfnt.merge import merge_truetype_fonts_for_pdf
from calibre.utils.logging import default_log
from calibre.utils.podofo import (
get_podofo, remove_unused_fonts, set_metadata_implementation
dedup_type3_fonts, get_podofo, remove_unused_fonts, set_metadata_implementation
)
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import as_bytes, filter, iteritems, map, range, unicode_type
@ -838,6 +838,9 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
report_progress(0.75, _('Added links to PDF content'))
merge_fonts(pdf_doc)
num_removed = dedup_type3_fonts(pdf_doc)
if num_removed:
log('Removed', num_removed, 'unused Type3 glyphs')
# TODO: Support for mathematics

View File

@ -144,6 +144,20 @@ def test_remove_unused_fonts(src):
print('Modified pdf saved to:', dest)
def dedup_type3_fonts(pdf_doc):
    '''
    Ask ``pdf_doc`` to de-duplicate its Type3 glyph definitions.

    Simply forwards to ``pdf_doc.dedup_type3_fonts()`` and hands back
    whatever that call returns.
    '''
    num_removed = pdf_doc.dedup_type3_fonts()
    return num_removed
def test_dedup_type3_fonts(src):
    '''
    Manual test helper: open the PDF at ``src``, de-duplicate its Type3
    glyphs and save the result alongside it with a ``-removed.pdf`` suffix.

    :param src: path to the PDF file to process
    '''
    # Imported locally so this helper does not depend on the module's imports
    import os
    podofo = get_podofo()
    p = podofo.PDFDoc()
    p.open(src)
    num = dedup_type3_fonts(p)
    # splitext() only strips an extension from the final path component, so a
    # path without an extension (or with a dot in a directory name) still
    # yields a sensible destination instead of an empty or truncated one.
    dest = os.path.splitext(src)[0] + '-removed.pdf'
    p.save(dest)
    print('Modified pdf with {} glyphs removed saved to:'.format(num), dest)
def test_list_fonts(src):
podofo = get_podofo()
p = podofo.PDFDoc()

View File

@ -753,6 +753,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"merge_fonts", (PyCFunction)py_merge_fonts, METH_VARARGS,
"merge_fonts() -> Merge the specified fonts."
},
{"dedup_type3_fonts", (PyCFunction)py_dedup_type3_fonts, METH_VARARGS,
"dedup_type3_fonts() -> De-duplicate repeated glyphs in Type3 fonts"
},
{"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
"delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
},

View File

@ -356,6 +356,103 @@ merge_fonts(PDFDoc *self, PyObject *args) {
Py_RETURN_NONE;
}
// Holds a copy of a glyph procedure's stream data (as produced by
// PdfStream::GetFilteredCopy) together with the reference of the object it
// was read from. Two CharProcs compare equal when their stream bytes are
// identical, which allows them to be used as keys when looking for duplicate
// glyph definitions. The buffer is released with podofo_free() on destruction.
class CharProc {
    char *buf; pdf_long sz;
    PdfReference ref;
    std::size_t precomputed_hash;
    // Copying is disabled (declared but never defined): the buffer has a single owner.
    CharProc( const CharProc & ) ;
    CharProc & operator=( const CharProc & ) ;

    public:
        // Reads the stream of `o` into an owned buffer. If `o` is NULL or has
        // no stream the CharProc is left empty (buf == NULL, sz == 0).
        CharProc(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), ref(reference), precomputed_hash(0) {
            const PdfStream *stream = o ? o->GetStream() : NULL;
            if (stream) stream->GetFilteredCopy(&buf, &sz);
            // Only the length is hashed; equal-length streams are told apart by operator==.
            precomputed_hash = std::hash<pdf_long>()(sz);
        }

        // Takes over the buffer of `other`, leaving `other` empty so that it
        // never pairs a NULL buffer with a non-zero size.
        CharProc(CharProc &&other) noexcept :
            buf(other.buf), sz(other.sz), ref(other.ref), precomputed_hash(other.precomputed_hash) {
            other.buf = NULL; other.sz = 0;
        }

        CharProc& operator=(CharProc &&other) noexcept {
            // Guard against self-assignment: without it the buffer would be
            // freed and then "moved" from itself, losing the data.
            if (this != &other) {
                if (buf) podofo_free(buf);
                buf = other.buf; sz = other.sz; ref = other.ref; precomputed_hash = other.precomputed_hash;
                other.buf = NULL; other.sz = 0;
            }
            return *this;
        }

        ~CharProc() noexcept { if (buf) podofo_free(buf); buf = NULL; }

        // Byte-wise comparison of the stream data; the originating reference
        // is deliberately not part of the comparison.
        bool operator==(const CharProc &other) const noexcept {
            if (other.sz != sz) return false;
            if (sz <= 0) return true;  // both empty: never pass NULL to memcmp
            return buf && other.buf && memcmp(buf, other.buf, sz) == 0;
        }

        std::size_t hash() const noexcept { return precomputed_hash; }
        const PdfReference& reference() const noexcept { return ref; }
};
struct CharProcHasher {
std::size_t operator()(const CharProc& k) const { return k.hash(); }
};
// Keyed by glyph procedure content; the mapped vector collects references of
// other glyph procedures whose content equals the key's.
using char_proc_reference_map = std::unordered_map<CharProc, std::vector<PdfReference>, CharProcHasher>;
// De-duplicate glyph procedures of Type3 fonts.
//
// Pass 1 walks every object in the document, picks out Type3 font
// dictionaries and groups the glyph procedures referenced from their
// /CharProcs dictionaries by stream content. Pass 2 removes every glyph
// procedure whose content duplicates an earlier one. Pass 3 rewrites the
// /CharProcs dictionaries so they point at the surviving (canonical) object.
//
// `args` is unused. Returns a Python integer: the number of glyph procedure
// objects removed from the document.
// NOTE(review): PoDoFo calls made here may raise PdfError; presumably the
// PYWRAP wrapper turns that into a Python exception -- confirm.
static PyObject*
dedup_type3_fonts(PDFDoc *self, PyObject *args) {
    unsigned long count = 0;
    unordered_reference_set all_char_procs;
    unordered_reference_set all_type3_fonts;
    char_proc_reference_map cp_map;

    PdfVecObjects &objects = self->doc->GetObjects();
    for (auto &k : objects) {
        // GetDictionary() raises for non-dictionary objects, so filter first.
        if (!k->IsDictionary()) continue;
        const PdfDictionary &dict = k->GetDictionary();
        if (!dictionary_has_key_name(dict, PdfName::KeyType, "Font")) continue;
        const PdfObject *subtype = dict.GetKey(PdfName::KeySubtype);
        if (!subtype || !subtype->IsName() || subtype->GetName().GetName() != "Type3") continue;
        // /CharProcs may be stored inline or behind a reference; resolve it
        // the same way the rewrite pass below does.
        const PdfObject *char_procs = k->GetIndirectKey("CharProcs");
        if (!char_procs || !char_procs->IsDictionary()) continue;
        all_type3_fonts.insert(k->Reference());
        for (auto &x : char_procs->GetDictionary().GetKeys()) {
            if (!x.second || !x.second->IsReference()) continue;
            const PdfReference &ref = x.second->GetReference();
            // A glyph procedure shared by several fonts must be examined only
            // once, otherwise it would be decoded again and could be counted
            // as removed more than once.
            if (!all_char_procs.insert(ref).second) continue;
            const PdfObject *cpobj = objects.GetObject(ref);
            if (!cpobj || !cpobj->HasStream()) continue;
            CharProc cp(ref, cpobj);
            auto it = cp_map.find(cp);
            if (it == cp_map.end()) {
                // First occurrence of this content: it becomes the canonical copy.
                std::vector<PdfReference> vals;
                cp_map.insert(std::make_pair(std::move(cp), std::move(vals)));
            } else (*it).second.push_back(ref);
        }
    }

    // Map every duplicate to its canonical reference and drop the duplicate object.
    std::unordered_map<PdfReference, PdfReference, PdfReferenceHasher> ref_map;
    for (auto &x : cp_map) {
        const PdfReference &canonical_ref = x.first.reference();
        for (auto &ref : x.second) {
            ref_map[ref] = canonical_ref;
            delete objects.RemoveObject(ref);
            count++;
        }
    }

    if (count > 0) {
        for (auto &ref : all_type3_fonts) {
            PdfObject *font = objects.GetObject(ref);
            if (!font) continue;
            const PdfObject *char_procs = font->GetIndirectKey("CharProcs");
            if (!char_procs || !char_procs->IsDictionary()) continue;
            const PdfDictionary &dict = char_procs->GetDictionary();
            // Edit a copy so the dictionary being iterated is never modified.
            PdfDictionary new_dict = PdfDictionary(dict);
            bool changed = false;
            for (auto &k : dict.GetKeys()) {
                if (!k.second || !k.second->IsReference()) continue;
                auto it = ref_map.find(k.second->GetReference());
                if (it != ref_map.end()) {
                    new_dict.AddKey(k.first, (*it).second);
                    changed = true;
                }
            }
            if (changed) font->GetDictionary().AddKey("CharProcs", new_dict);
        }
    }
    return Py_BuildValue("k", count);
}
// NOTE(review): PYWRAP is defined outside this view; it presumably generates
// the py_<name> entry points that the method table and header refer to
// (e.g. py_dedup_type3_fonts) -- confirm against the macro definition.
PYWRAP(list_fonts)
PYWRAP(merge_fonts)
PYWRAP(remove_unused_fonts)
PYWRAP(dedup_type3_fonts)

View File

@ -99,5 +99,6 @@ extern "C" {
PyObject* py_list_fonts(PDFDoc*, PyObject*);
PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args);
PyObject* py_merge_fonts(PDFDoc *self, PyObject *args);
PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
}
}