Add support for removing unused Type3 fonts as well

This commit is contained in:
Kovid Goyal 2019-07-23 20:58:57 +05:30
parent 470193f222
commit 7f3bd476d3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 76 additions and 37 deletions

View File

@ -10,7 +10,7 @@ from calibre.constants import plugins, preferred_encoding
from calibre.ebooks.metadata import authors_to_string
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
from polyglot.builtins import unicode_type, iteritems
from polyglot.builtins import unicode_type
def get_podofo():
@ -127,20 +127,11 @@ def get_image_count(path):
def list_fonts(pdf_doc):
    """Map every font reference reported by ``pdf_doc.list_fonts()`` to its
    font dict, adding a boolean ``'used'`` entry to each dict.

    A font is marked used when its reference is returned by
    ``pdf_doc.used_fonts_in_page_range()``, or when it is the
    ``'DescendantFont'`` of a font that is itself used.

    :param pdf_doc: object providing ``list_fonts()`` and
        ``used_fonts_in_page_range()``
    :return: dict of ``font['Reference']`` -> font dict
    """
    fonts = pdf_doc.list_fonts()
    ref_map = {f['Reference']: f for f in fonts}
    for ref in pdf_doc.used_fonts_in_page_range():
        # A page may reference a font object that list_fonts() did not
        # report; there is nothing to flag for it, so skip it rather than
        # failing with a KeyError.
        font = ref_map.get(ref)
        if font is not None:
            font['used'] = True
    for font in fonts:
        font['used'] = font.get('used', False)
        descendant_ref = font['DescendantFont']
        if descendant_ref and font['used']:
            # Same guard as above: the descendant may not be a listed font.
            descendant = ref_map.get(descendant_ref)
            if descendant is not None:
                descendant['used'] = True
    return ref_map
def remove_unused_fonts(pdf_doc):
font_ref_map = list_fonts(pdf_doc)
unused = tuple(ref for ref, font in iteritems(font_ref_map) if not font['used'])
pdf_doc.remove_fonts(unused)
return len(tuple(f for f in unused if font_ref_map[f]['StreamRef']))
return pdf_doc.remove_unused_fonts()
def test_remove_unused_fonts(src):

View File

@ -747,12 +747,12 @@ static PyMethodDef PDFDoc_methods[] = {
{"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
"list_fonts() -> Get list of fonts in document"
},
{"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,
"used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages"
},
{"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
"remove_fonts() -> Remove the specified font objects."
},
{"remove_unused_fonts", (PyCFunction)remove_unused_fonts, METH_NOARGS,
"remove_unused_fonts() -> Remove unused font objects."
},
{"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
"merge_fonts() -> Merge the specified fonts."
},

View File

@ -8,7 +8,6 @@
#include "global.h"
#include <iostream>
#include <stack>
#include <unordered_map>
using namespace pdf;
@ -72,7 +71,7 @@ replace_font_references(PDFDoc *self, std::unordered_map<uint64_t, uint64_t> &re
}
static bool
used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
used_fonts_in_page(PdfPage *page, unordered_reference_set &ans) {
PdfContentsTokenizer tokenizer(page);
bool in_text_block = false;
const char* token = NULL;
@ -96,11 +95,7 @@ used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
if (stack.size() > 0 && stack.top().IsName()) {
const PdfName &reference_name = stack.top().GetName();
PdfObject* font = page->GetFromResources("Font", reference_name);
if (font) {
pyunique_ptr r(ref_as_tuple(font->Reference()));
if (!r) return false;
if (PySet_Add(ans, r.get()) != 0) return false;
}
if (font) ans.insert(font->Reference());
}
}
}
@ -235,21 +230,6 @@ list_fonts(PDFDoc *self, PyObject *args) {
return ans.release();
}
// Python method: used_fonts_in_page_range([first[, last]]) -> set
//
// Collects into a Python set the fonts that used_fonts_in_page() reports for
// each page in the 1-based, inclusive range [first, last]. Both bounds are
// optional and default to the first and last page of the document.
// Returns a new reference to the set, or NULL when argument parsing, set
// creation or used_fonts_in_page() fails.
PyObject*
used_fonts_in_page_range(PDFDoc *self, PyObject *args) {
    const int page_count = self->doc->GetPageCount();
    int first = 1, last = page_count;
    if (!PyArg_ParseTuple(args, "|ii", &first, &last)) return NULL;
    // Clamp the caller supplied bounds so that only valid page indices are
    // ever handed to GetPage().
    if (first < 1) first = 1;
    if (last > page_count) last = page_count;
    pyunique_ptr ans(PySet_New(NULL));
    if (!ans) return NULL;
    for (int i = first - 1; i < last; i++) {
        try {
            PdfPage *page = self->doc->GetPage(i);
            // Skip pages that could not be obtained instead of passing a
            // NULL page on to used_fonts_in_page().
            if (!page) continue;
            if (!used_fonts_in_page(page, i, ans.get())) return NULL;
        } catch (const PdfError &) { continue; }  // best effort: ignore pages PoDoFo fails on
    }
    return ans.release();
}
PyObject*
remove_fonts(PDFDoc *self, PyObject *args) {
PyObject *fonts;
@ -267,6 +247,63 @@ remove_fonts(PDFDoc *self, PyObject *args) {
Py_RETURN_NONE;
}
typedef std::unordered_map<PdfReference, unsigned long, PdfReferenceHasher> charprocs_usage_map;
PyObject*
remove_unused_fonts(PDFDoc *self, PyObject *args) {
unordered_reference_set used_fonts;
for (int i = 0; i < self->doc->GetPageCount(); i++) {
PdfPage *page = self->doc->GetPage(i);
if (page) used_fonts_in_page(page, used_fonts);
}
unordered_reference_set all_fonts;
unordered_reference_set type3_fonts;
charprocs_usage_map charprocs_usage;
PdfVecObjects &objects = self->doc->GetObjects();
for (TCIVecObjects it = objects.begin(); it != objects.end(); it++) {
if ((*it)->IsDictionary()) {
const PdfDictionary &dict = (*it)->GetDictionary();
if (dictionary_has_key_name(dict, PdfName::KeyType, "Font")) {
const std::string &font_type = dict.GetKey(PdfName::KeySubtype)->GetName().GetName();
if (font_type == "Type0") {
all_fonts.insert((*it)->Reference());
} else if (font_type == "Type3") {
all_fonts.insert((*it)->Reference());
type3_fonts.insert((*it)->Reference());
for (auto &x : dict.GetKey("CharProcs")->GetDictionary().GetKeys()) {
const PdfReference &ref = x.second->GetReference();
if (charprocs_usage.find(ref) == charprocs_usage.end()) charprocs_usage[ref] = 1;
else charprocs_usage[ref] += 1;
}
}
}
}
}
unsigned long count = 0;
for (auto &ref : all_fonts) {
if (used_fonts.find(ref) == used_fonts.end()) {
PdfObject *font = objects.GetObject(ref);
if (font) {
count++;
if (type3_fonts.find(ref) != type3_fonts.end()) {
for (auto &x : font->GetIndirectKey("CharProcs")->GetDictionary().GetKeys()) {
charprocs_usage[x.second->GetReference()] -= 1;
}
}
remove_font(objects, font);
}
}
}
for (auto &x : charprocs_usage) {
if (x.second == 0u) {
delete objects.RemoveObject(x.first);
}
}
return Py_BuildValue("k", count);
}
PyObject*
merge_fonts(PDFDoc *self, PyObject *args) {
PyObject *items, *replacements;

View File

@ -12,6 +12,8 @@
#define USING_SHARED_PODOFO
#include <podofo.h>
#include <unordered_set>
#include <unordered_map>
using namespace PoDoFo;
namespace pdf {
@ -84,10 +86,19 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) {
return false;
}
class PdfReferenceHasher {
public:
size_t operator()(const PdfReference & obj) const {
return std::hash<pdf_objnum>()(obj.ObjectNumber());
}
};
typedef std::unordered_set<PdfReference, PdfReferenceHasher> unordered_reference_set;
extern "C" {
PyObject* list_fonts(PDFDoc*, PyObject*);
PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
PyObject* remove_fonts(PDFDoc *self, PyObject *args);
PyObject* remove_unused_fonts(PDFDoc *self, PyObject *args);
PyObject* merge_fonts(PDFDoc *self, PyObject *args);
}
}