When listing fonts check if they are used

This commit is contained in:
Kovid Goyal 2019-07-14 18:00:04 +05:30
parent dfc09ede98
commit 1e9517f40f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 47 additions and 12 deletions

View File

@ -142,6 +142,18 @@ def get_image_count(path):
return p.image_count() return p.image_count()
def list_fonts(pdf_doc):
    '''
    Return a mapping of font reference -> font description for ``pdf_doc``,
    with a boolean ``used`` entry added to every description.

    :param pdf_doc: object providing ``list_fonts()`` (a list of dicts, each
        with a ``Reference`` key and optionally a ``DescendantFont`` key) and
        ``used_fonts_in_page_range()`` (an iterable of font references).
    :return: dict keyed by each font's ``Reference`` value; the values are the
        same dict objects returned by ``pdf_doc.list_fonts()``.
    '''
    fonts = pdf_doc.list_fonts()
    ref_map = {f['Reference']: f for f in fonts}
    for ref in pdf_doc.used_fonts_in_page_range():
        # A page may reference a font that was not reported by list_fonts();
        # skip it instead of failing with a KeyError.
        font = ref_map.get(ref)
        if font is not None:
            font['used'] = True
    for font in fonts:
        font['used'] = font.get('used', False)
        if font['used']:
            # A used font also marks its descendant font as used, provided
            # that descendant is itself one of the listed fonts.
            descendant = ref_map.get(font.get('DescendantFont'))
            if descendant is not None:
                descendant['used'] = True
    return ref_map
def test_list_fonts(src): def test_list_fonts(src):
podofo = get_podofo() podofo = get_podofo()
p = podofo.PDFDoc() p = podofo.PDFDoc()
@ -149,7 +161,7 @@ def test_list_fonts(src):
raw = f.read() raw = f.read()
p.load(raw) p.load(raw)
import pprint import pprint
pprint.pprint(p.list_fonts()) pprint.pprint(list_fonts(p))
def test_save_to(src, dest): def test_save_to(src, dest):

View File

@ -718,6 +718,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"list_fonts", (PyCFunction)list_fonts, METH_VARARGS, {"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
"list_fonts() -> Get list of fonts in document" "list_fonts() -> Get list of fonts in document"
}, },
{"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,
"used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages"
},
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS, {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)." "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
}, },

View File

@ -11,9 +11,16 @@
using namespace pdf; using namespace pdf;
// Build a Python tuple (object number, generation number) from a PDF
// reference. The result is whatever Py_BuildValue returns; callers in this
// file treat a null result as failure.
static inline PyObject*
ref_as_tuple(const PdfReference &ref) {
    const unsigned long object_number = ref.ObjectNumber();
    const unsigned long generation_number = ref.GenerationNumber();
    return Py_BuildValue("kk", object_number, generation_number);
}
static bool static bool
used_fonts_in_page(const PdfPage *page, PyObject *ans) { used_fonts_in_page(PdfPage *page, PyObject *ans) {
PdfContentsTokenizer tokenizer((PdfCanvas*)page); PdfContentsTokenizer tokenizer(page);
bool in_text_block = false; bool in_text_block = false;
const char* token = NULL; const char* token = NULL;
EPdfContentsType contents_type; EPdfContentsType contents_type;
@ -35,11 +42,9 @@ used_fonts_in_page(const PdfPage *page, PyObject *ans) {
stack.pop(); stack.pop();
if (stack.size() > 0 && stack.top().IsName()) { if (stack.size() > 0 && stack.top().IsName()) {
const PdfName &reference_name = stack.top().GetName(); const PdfName &reference_name = stack.top().GetName();
PdfObject* font = pPage->GetFromResources("Font", reference_name); PdfObject* font = page->GetFromResources("Font", reference_name);
if (font) { if (font) {
const PdfReference &ref = font->Reference(); pyunique_ptr r(ref_as_tuple(font->Reference()));
unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber();
pyunique_ptr r(Py_BuildValue("kk", num, generation));
if (!r) return false; if (!r) return false;
if (PySet_Add(ans, r.get()) != 0) return false; if (PySet_Add(ans, r.get()) != 0) return false;
} }
@ -66,19 +71,33 @@ list_fonts(PDFDoc *self, PyObject *args) {
unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber(); unsigned long num = ref.ObjectNumber(), generation = ref.GenerationNumber();
const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor"); const PdfObject *descriptor = (*it)->GetIndirectKey("FontDescriptor");
long long stream_len = 0; long long stream_len = 0;
pyunique_ptr descendant_font, stream_ref;
if (descriptor) { if (descriptor) {
const PdfObject *ff = descriptor->GetIndirectKey("FontFile"); const PdfObject *ff = descriptor->GetIndirectKey("FontFile");
if (!ff) ff = descriptor->GetIndirectKey("FontFile2"); if (!ff) ff = descriptor->GetIndirectKey("FontFile2");
if (!ff) ff = descriptor->GetIndirectKey("FontFile3"); if (!ff) ff = descriptor->GetIndirectKey("FontFile3");
const PdfStream *stream = ff->GetStream(); if (ff) {
if (stream) stream_len = stream->GetLength(); stream_ref.reset(ref_as_tuple(ff->Reference()));
if (!stream_ref) return NULL;
const PdfStream *stream = ff->GetStream();
if (stream) stream_len = stream->GetLength();
}
} else if (dict.HasKey("DescendantFonts")) {
const PdfArray &df = dict.GetKey("DescendantFonts")->GetArray();
descendant_font.reset(ref_as_tuple(df[0].GetReference()));
if (!descendant_font) return NULL;
} }
#define V(x) (x ? x.get() : Py_None)
pyunique_ptr d(Py_BuildValue( pyunique_ptr d(Py_BuildValue(
"{sssss(kk)sL}", "{ss ss s(kk) sL sO sO}",
"BaseFont", name.c_str(), "BaseFont", name.c_str(),
"Subtype", subtype.c_str(), "Subtype", subtype.c_str(),
"Reference", num, generation, "Reference", num, generation,
"Length", stream_len)); "Length", stream_len,
"DescendantFont", V(descendant_font),
"StreamRef", V(stream_ref)
));
#undef V
if (!d) { return NULL; } if (!d) { return NULL; }
if (PyList_Append(ans.get(), d.get()) != 0) return NULL; if (PyList_Append(ans.get(), d.get()) != 0) return NULL;
} }
@ -99,7 +118,7 @@ used_fonts_in_page_range(PDFDoc *self, PyObject *args) {
if (!ans) return NULL; if (!ans) return NULL;
for (int i = first - 1; i < last; i++) { for (int i = first - 1; i < last; i++) {
try { try {
const PdfPage *page = self->doc->GetPage(i); PdfPage *page = self->doc->GetPage(i);
if (!used_fonts_in_page(page, ans.get())) return NULL; if (!used_fonts_in_page(page, ans.get())) return NULL;
} catch (const PdfError &err) { continue; } } catch (const PdfError &err) { continue; }
} }

View File

@ -60,5 +60,6 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) {
extern "C" { extern "C" {
PyObject* list_fonts(PDFDoc*, PyObject*); PyObject* list_fonts(PDFDoc*, PyObject*);
PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
} }
} }