mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add support for removing unused Type3 fonts as well
This commit is contained in:
parent
470193f222
commit
7f3bd476d3
@ -10,7 +10,7 @@ from calibre.constants import plugins, preferred_encoding
|
|||||||
from calibre.ebooks.metadata import authors_to_string
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
|
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
|
||||||
from polyglot.builtins import unicode_type, iteritems
|
from polyglot.builtins import unicode_type
|
||||||
|
|
||||||
|
|
||||||
def get_podofo():
|
def get_podofo():
|
||||||
@ -127,20 +127,11 @@ def get_image_count(path):
|
|||||||
def list_fonts(pdf_doc):
|
def list_fonts(pdf_doc):
|
||||||
fonts = pdf_doc.list_fonts()
|
fonts = pdf_doc.list_fonts()
|
||||||
ref_map = {f['Reference']: f for f in fonts}
|
ref_map = {f['Reference']: f for f in fonts}
|
||||||
for ref in pdf_doc.used_fonts_in_page_range():
|
|
||||||
ref_map[ref]['used'] = True
|
|
||||||
for font in fonts:
|
|
||||||
font['used'] = font.get('used', False)
|
|
||||||
if font['DescendantFont'] and font['used']:
|
|
||||||
ref_map[font['DescendantFont']]['used'] = True
|
|
||||||
return ref_map
|
return ref_map
|
||||||
|
|
||||||
|
|
||||||
def remove_unused_fonts(pdf_doc):
|
def remove_unused_fonts(pdf_doc):
|
||||||
font_ref_map = list_fonts(pdf_doc)
|
return pdf_doc.remove_unused_fonts()
|
||||||
unused = tuple(ref for ref, font in iteritems(font_ref_map) if not font['used'])
|
|
||||||
pdf_doc.remove_fonts(unused)
|
|
||||||
return len(tuple(f for f in unused if font_ref_map[f]['StreamRef']))
|
|
||||||
|
|
||||||
|
|
||||||
def test_remove_unused_fonts(src):
|
def test_remove_unused_fonts(src):
|
||||||
|
@ -747,12 +747,12 @@ static PyMethodDef PDFDoc_methods[] = {
|
|||||||
{"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
|
{"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
|
||||||
"list_fonts() -> Get list of fonts in document"
|
"list_fonts() -> Get list of fonts in document"
|
||||||
},
|
},
|
||||||
{"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,
|
|
||||||
"used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages"
|
|
||||||
},
|
|
||||||
{"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
|
{"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
|
||||||
"remove_fonts() -> Remove the specified font objects."
|
"remove_fonts() -> Remove the specified font objects."
|
||||||
},
|
},
|
||||||
|
{"remove_unused_fonts", (PyCFunction)remove_unused_fonts, METH_NOARGS,
|
||||||
|
"remove_unused_fonts() -> Remove unused font objects."
|
||||||
|
},
|
||||||
{"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
|
{"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
|
||||||
"merge_fonts() -> Merge the specified fonts."
|
"merge_fonts() -> Merge the specified fonts."
|
||||||
},
|
},
|
||||||
|
@ -8,7 +8,6 @@
|
|||||||
#include "global.h"
|
#include "global.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <stack>
|
#include <stack>
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
using namespace pdf;
|
using namespace pdf;
|
||||||
|
|
||||||
@ -72,7 +71,7 @@ replace_font_references(PDFDoc *self, std::unordered_map<uint64_t, uint64_t> &re
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
|
used_fonts_in_page(PdfPage *page, unordered_reference_set &ans) {
|
||||||
PdfContentsTokenizer tokenizer(page);
|
PdfContentsTokenizer tokenizer(page);
|
||||||
bool in_text_block = false;
|
bool in_text_block = false;
|
||||||
const char* token = NULL;
|
const char* token = NULL;
|
||||||
@ -96,11 +95,7 @@ used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
|
|||||||
if (stack.size() > 0 && stack.top().IsName()) {
|
if (stack.size() > 0 && stack.top().IsName()) {
|
||||||
const PdfName &reference_name = stack.top().GetName();
|
const PdfName &reference_name = stack.top().GetName();
|
||||||
PdfObject* font = page->GetFromResources("Font", reference_name);
|
PdfObject* font = page->GetFromResources("Font", reference_name);
|
||||||
if (font) {
|
if (font) ans.insert(font->Reference());
|
||||||
pyunique_ptr r(ref_as_tuple(font->Reference()));
|
|
||||||
if (!r) return false;
|
|
||||||
if (PySet_Add(ans, r.get()) != 0) return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -235,21 +230,6 @@ list_fonts(PDFDoc *self, PyObject *args) {
|
|||||||
return ans.release();
|
return ans.release();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Python-callable: used_fonts_in_page_range([first[, last]])
// Returns a new set holding the font references collected by
// used_fonts_in_page() for each page in the requested range, or NULL with a
// Python error set on failure. `first` defaults to 1 and `last` to the
// document's page count.
PyObject*
used_fonts_in_page_range(PDFDoc *self, PyObject *args) {
    int first_page = 1;
    int last_page = self->doc->GetPageCount();
    if (!PyArg_ParseTuple(args, "|ii", &first_page, &last_page)) return NULL;

    pyunique_ptr font_refs(PySet_New(NULL));
    if (!font_refs) return NULL;

    // The requested page numbers are shifted down by one to get the index
    // handed to GetPage().
    for (int page_index = first_page - 1; page_index < last_page; ++page_index) {
        try {
            PdfPage *page = self->doc->GetPage(page_index);
            const bool collected = used_fonts_in_page(page, page_index, font_refs.get());
            if (!collected) return NULL;
        } catch (const PdfError &) {
            // A page that raises a PdfError is skipped; scanning continues
            // with the next page.
        }
    }
    return font_refs.release();
}
|
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
remove_fonts(PDFDoc *self, PyObject *args) {
|
remove_fonts(PDFDoc *self, PyObject *args) {
|
||||||
PyObject *fonts;
|
PyObject *fonts;
|
||||||
@ -267,6 +247,63 @@ remove_fonts(PDFDoc *self, PyObject *args) {
|
|||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef std::unordered_map<PdfReference, unsigned long, PdfReferenceHasher> charprocs_usage_map;
|
||||||
|
|
||||||
|
PyObject*
|
||||||
|
remove_unused_fonts(PDFDoc *self, PyObject *args) {
|
||||||
|
unordered_reference_set used_fonts;
|
||||||
|
for (int i = 0; i < self->doc->GetPageCount(); i++) {
|
||||||
|
PdfPage *page = self->doc->GetPage(i);
|
||||||
|
if (page) used_fonts_in_page(page, used_fonts);
|
||||||
|
}
|
||||||
|
unordered_reference_set all_fonts;
|
||||||
|
unordered_reference_set type3_fonts;
|
||||||
|
charprocs_usage_map charprocs_usage;
|
||||||
|
PdfVecObjects &objects = self->doc->GetObjects();
|
||||||
|
for (TCIVecObjects it = objects.begin(); it != objects.end(); it++) {
|
||||||
|
if ((*it)->IsDictionary()) {
|
||||||
|
const PdfDictionary &dict = (*it)->GetDictionary();
|
||||||
|
if (dictionary_has_key_name(dict, PdfName::KeyType, "Font")) {
|
||||||
|
const std::string &font_type = dict.GetKey(PdfName::KeySubtype)->GetName().GetName();
|
||||||
|
if (font_type == "Type0") {
|
||||||
|
all_fonts.insert((*it)->Reference());
|
||||||
|
} else if (font_type == "Type3") {
|
||||||
|
all_fonts.insert((*it)->Reference());
|
||||||
|
type3_fonts.insert((*it)->Reference());
|
||||||
|
for (auto &x : dict.GetKey("CharProcs")->GetDictionary().GetKeys()) {
|
||||||
|
const PdfReference &ref = x.second->GetReference();
|
||||||
|
if (charprocs_usage.find(ref) == charprocs_usage.end()) charprocs_usage[ref] = 1;
|
||||||
|
else charprocs_usage[ref] += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned long count = 0;
|
||||||
|
for (auto &ref : all_fonts) {
|
||||||
|
if (used_fonts.find(ref) == used_fonts.end()) {
|
||||||
|
PdfObject *font = objects.GetObject(ref);
|
||||||
|
if (font) {
|
||||||
|
count++;
|
||||||
|
if (type3_fonts.find(ref) != type3_fonts.end()) {
|
||||||
|
for (auto &x : font->GetIndirectKey("CharProcs")->GetDictionary().GetKeys()) {
|
||||||
|
charprocs_usage[x.second->GetReference()] -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
remove_font(objects, font);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto &x : charprocs_usage) {
|
||||||
|
if (x.second == 0u) {
|
||||||
|
delete objects.RemoveObject(x.first);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Py_BuildValue("k", count);
|
||||||
|
}
|
||||||
|
|
||||||
PyObject*
|
PyObject*
|
||||||
merge_fonts(PDFDoc *self, PyObject *args) {
|
merge_fonts(PDFDoc *self, PyObject *args) {
|
||||||
PyObject *items, *replacements;
|
PyObject *items, *replacements;
|
||||||
|
@ -12,6 +12,8 @@
|
|||||||
|
|
||||||
#define USING_SHARED_PODOFO
|
#define USING_SHARED_PODOFO
|
||||||
#include <podofo.h>
|
#include <podofo.h>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <unordered_map>
|
||||||
using namespace PoDoFo;
|
using namespace PoDoFo;
|
||||||
|
|
||||||
namespace pdf {
|
namespace pdf {
|
||||||
@ -84,10 +86,19 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class PdfReferenceHasher {
|
||||||
|
public:
|
||||||
|
size_t operator()(const PdfReference & obj) const {
|
||||||
|
return std::hash<pdf_objnum>()(obj.ObjectNumber());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
typedef std::unordered_set<PdfReference, PdfReferenceHasher> unordered_reference_set;
|
||||||
|
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
PyObject* list_fonts(PDFDoc*, PyObject*);
|
PyObject* list_fonts(PDFDoc*, PyObject*);
|
||||||
PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
|
|
||||||
PyObject* remove_fonts(PDFDoc *self, PyObject *args);
|
PyObject* remove_fonts(PDFDoc *self, PyObject *args);
|
||||||
|
PyObject* remove_unused_fonts(PDFDoc *self, PyObject *args);
|
||||||
PyObject* merge_fonts(PDFDoc *self, PyObject *args);
|
PyObject* merge_fonts(PDFDoc *self, PyObject *args);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user