diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py
index 39c5eb2320..2431c47d06 100644
--- a/src/calibre/utils/podofo/__init__.py
+++ b/src/calibre/utils/podofo/__init__.py
@@ -10,7 +10,7 @@ from calibre.constants import plugins, preferred_encoding
 from calibre.ebooks.metadata import authors_to_string
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.ipc.simple_worker import WorkerError, fork_job
-from polyglot.builtins import unicode_type, iteritems
+from polyglot.builtins import unicode_type


 def get_podofo():
@@ -127,20 +127,12 @@ def get_image_count(path):
 def list_fonts(pdf_doc):
     fonts = pdf_doc.list_fonts()
     ref_map = {f['Reference']: f for f in fonts}
-    for ref in pdf_doc.used_fonts_in_page_range():
-        ref_map[ref]['used'] = True
-    for font in fonts:
-        font['used'] = font.get('used', False)
-        if font['DescendantFont'] and font['used']:
-            ref_map[font['DescendantFont']]['used'] = True
     return ref_map


 def remove_unused_fonts(pdf_doc):
-    font_ref_map = list_fonts(pdf_doc)
-    unused = tuple(ref for ref, font in iteritems(font_ref_map) if not font['used'])
-    pdf_doc.remove_fonts(unused)
-    return len(tuple(f for f in unused if font_ref_map[f]['StreamRef']))
+    ''' Remove unused fonts from pdf_doc and return the number of fonts removed '''
+    return pdf_doc.remove_unused_fonts()


 def test_remove_unused_fonts(src):
diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp
index 4b27df370e..1416ac86f0 100644
--- a/src/calibre/utils/podofo/doc.cpp
+++ b/src/calibre/utils/podofo/doc.cpp
@@ -747,12 +747,12 @@ static PyMethodDef PDFDoc_methods[] = {
     {"list_fonts", (PyCFunction)list_fonts, METH_VARARGS,
      "list_fonts() -> Get list of fonts in document"
     },
-    {"used_fonts_in_page_range", (PyCFunction)used_fonts_in_page_range, METH_VARARGS,
-     "used_fonts_in_page_range() -> Get list of references to fonts used in the specified pages"
-    },
     {"remove_fonts", (PyCFunction)remove_fonts, METH_VARARGS,
      "remove_fonts() -> Remove the specified font objects."
     },
+    {"remove_unused_fonts", (PyCFunction)remove_unused_fonts, METH_NOARGS,
+     "remove_unused_fonts() -> Remove unused font objects."
+    },
     {"merge_fonts", (PyCFunction)merge_fonts, METH_VARARGS,
      "merge_fonts() -> Merge the specified fonts."
     },
diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp
index 886f93931c..bec9044ba9 100644
--- a/src/calibre/utils/podofo/fonts.cpp
+++ b/src/calibre/utils/podofo/fonts.cpp
@@ -8,7 +8,6 @@
 #include "global.h"
 #include <iostream>
 #include <stack>
-#include <unordered_map>

 using namespace pdf;

@@ -72,7 +71,7 @@ replace_font_references(PDFDoc *self, std::unordered_map<uint64_t, uint64_t> &re
 }

 static bool
-used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
+used_fonts_in_page(PdfPage *page, unordered_reference_set &ans) {
     PdfContentsTokenizer tokenizer(page);
     bool in_text_block = false;
     const char* token = NULL;
@@ -96,11 +95,7 @@ used_fonts_in_page(PdfPage *page, int page_num, PyObject *ans) {
                     if (stack.size() > 0 && stack.top().IsName()) {
                         const PdfName &reference_name = stack.top().GetName();
                         PdfObject* font = page->GetFromResources("Font", reference_name);
-                        if (font) {
-                            pyunique_ptr r(ref_as_tuple(font->Reference()));
-                            if (!r) return false;
-                            if (PySet_Add(ans, r.get()) != 0) return false;
-                        }
+                        if (font) ans.insert(font->Reference());
                     }
                 }
             }
@@ -235,21 +230,6 @@ list_fonts(PDFDoc *self, PyObject *args) {
     return ans.release();
 }

-PyObject*
-used_fonts_in_page_range(PDFDoc *self, PyObject *args) {
-    int first = 1, last = self->doc->GetPageCount();
-    if (!PyArg_ParseTuple(args, "|ii", &first, &last)) return NULL;
-    pyunique_ptr ans(PySet_New(NULL));
-    if (!ans) return NULL;
-    for (int i = first - 1; i < last; i++) {
-        try {
-            PdfPage *page = self->doc->GetPage(i);
-            if (!used_fonts_in_page(page, i, ans.get())) return NULL;
-        } catch (const PdfError &err) { continue; }
-    }
-    return ans.release();
-}
-
 PyObject*
 remove_fonts(PDFDoc *self, PyObject *args) {
     PyObject *fonts;
@@ -267,6 +247,83 @@ remove_fonts(PDFDoc *self, PyObject *args) {
     Py_RETURN_NONE;
 }

+// Number of fonts using each Type3 glyph procedure, keyed by the reference of
+// the glyph procedure object (a value in a font's CharProcs dictionary).
+typedef std::unordered_map<PdfReference, unsigned long, PdfReferenceHasher> charprocs_usage_map;
+
+// Adjust the usage count of every glyph procedure referenced from the
+// CharProcs dictionary of font: add one when increment is true, otherwise
+// subtract one. CharProcs is fetched with GetIndirectKey(), as it may be an
+// indirect reference rather than an inline dictionary; a font without a
+// CharProcs dictionary is left alone.
+static void
+update_charprocs_usage(PdfObject *font, charprocs_usage_map &usage, bool increment) {
+    PdfObject *charprocs = font->GetIndirectKey("CharProcs");
+    if (!charprocs || !charprocs->IsDictionary()) return;
+    for (auto &x : charprocs->GetDictionary().GetKeys()) {
+        if (!x.second || !x.second->IsReference()) continue;
+        unsigned long &uses = usage[x.second->GetReference()];
+        if (increment) uses++;
+        else if (uses > 0) uses--;
+    }
+}
+
+// Remove every Type0 and Type3 font object that used_fonts_in_page() does not
+// report for any page, along with the Type3 glyph procedures that are left
+// with no font using them. Returns the number of fonts removed as a Python
+// integer, or NULL with a ValueError set if PoDoFo throws a PdfError, since a
+// C++ exception must not propagate into the Python interpreter.
+PyObject*
+remove_unused_fonts(PDFDoc *self, PyObject *args) {
+    unsigned long count = 0;
+    try {
+        unordered_reference_set used_fonts;
+        for (int i = 0; i < self->doc->GetPageCount(); i++) {
+            PdfPage *page = self->doc->GetPage(i);
+            if (page) used_fonts_in_page(page, used_fonts);
+        }
+        unordered_reference_set all_fonts;
+        unordered_reference_set type3_fonts;
+        charprocs_usage_map charprocs_usage;
+        PdfVecObjects &objects = self->doc->GetObjects();
+        for (TCIVecObjects it = objects.begin(); it != objects.end(); it++) {
+            if (!(*it)->IsDictionary()) continue;
+            const PdfDictionary &dict = (*it)->GetDictionary();
+            if (!dictionary_has_key_name(dict, PdfName::KeyType, "Font")) continue;
+            // A malformed font may lack a /Subtype name; skip it instead of crashing
+            const PdfObject *subtype = dict.GetKey(PdfName::KeySubtype);
+            if (!subtype || !subtype->IsName()) continue;
+            const std::string &font_type = subtype->GetName().GetName();
+            if (font_type == "Type0") {
+                all_fonts.insert((*it)->Reference());
+            } else if (font_type == "Type3") {
+                all_fonts.insert((*it)->Reference());
+                type3_fonts.insert((*it)->Reference());
+                update_charprocs_usage(*it, charprocs_usage, true);
+            }
+        }
+
+        for (auto &ref : all_fonts) {
+            if (used_fonts.find(ref) != used_fonts.end()) continue;
+            PdfObject *font = objects.GetObject(ref);
+            if (!font) continue;
+            count++;
+            if (type3_fonts.find(ref) != type3_fonts.end()) update_charprocs_usage(font, charprocs_usage, false);
+            remove_font(objects, font);
+        }
+
+        // Glyph procedures whose count dropped to zero belonged only to removed fonts
+        for (auto &x : charprocs_usage) {
+            if (x.second == 0u) delete objects.RemoveObject(x.first);
+        }
+    } catch (const PdfError &err) {
+        const char *msg = err.what();
+        PyErr_SetString(PyExc_ValueError, msg ? msg : "PoDoFo error while removing unused fonts");
+        return NULL;
+    }
+    return Py_BuildValue("k", count);
+}
+
 PyObject*
 merge_fonts(PDFDoc *self, PyObject *args) {
     PyObject *items, *replacements;
diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h
index 7925ae73be..981912d128 100644
--- a/src/calibre/utils/podofo/global.h
+++ b/src/calibre/utils/podofo/global.h
@@ -12,6 +12,8 @@

 #define USING_SHARED_PODOFO
 #include <podofo.h>
+#include <unordered_set>
+#include <unordered_map>
 using namespace PoDoFo;

 namespace pdf {
@@ -84,10 +86,21 @@ dictionary_has_key_name(const PdfDictionary &d, T key, const char *name) {
     return false;
 }

+// Hash functor so that PdfReference can be a key in unordered containers.
+// Only the object number is hashed.
+class PdfReferenceHasher {
+    public:
+        size_t operator()(const PdfReference & obj) const {
+            return std::hash<pdf_objnum>()(obj.ObjectNumber());
+        }
+};
+typedef std::unordered_set<PdfReference, PdfReferenceHasher> unordered_reference_set;
+
+
 extern "C" {
 PyObject* list_fonts(PDFDoc*, PyObject*);
-PyObject* used_fonts_in_page_range(PDFDoc *self, PyObject *args);
 PyObject* remove_fonts(PDFDoc *self, PyObject *args);
+PyObject* remove_unused_fonts(PDFDoc *self, PyObject *args);
 PyObject* merge_fonts(PDFDoc *self, PyObject *args);
 }
 }