diff --git a/setup/extensions.json b/setup/extensions.json index b64e8b12ae..90f4e82cfd 100644 --- a/setup/extensions.json +++ b/setup/extensions.json @@ -116,7 +116,7 @@ }, { "name": "podofo", - "sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/podofo.cpp", + "sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/images.cpp calibre/utils/podofo/podofo.cpp", "headers": "calibre/utils/podofo/global.h", "libraries": "podofo", "lib_dirs": "!podofo_lib", diff --git a/src/calibre/ebooks/pdf/html_writer.py b/src/calibre/ebooks/pdf/html_writer.py index 167ad77bcc..8889e2fdf1 100644 --- a/src/calibre/ebooks/pdf/html_writer.py +++ b/src/calibre/ebooks/pdf/html_writer.py @@ -1066,13 +1066,16 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co if num_removed: log('Removed', num_removed, 'duplicated Type3 glyphs') - # TODO: dedup images # TODO: Support for mathematics num_removed = remove_unused_fonts(pdf_doc) if num_removed: log('Removed', num_removed, 'unused fonts') + num_removed = pdf_doc.dedup_images() + if num_removed: + log('Removed', num_removed, 'duplicate images') + if cover_data: add_cover(pdf_doc, cover_data, page_layout, opts) diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp index 075ba80729..c169f80719 100644 --- a/src/calibre/utils/podofo/doc.cpp +++ b/src/calibre/utils/podofo/doc.cpp @@ -759,6 +759,9 @@ static PyMethodDef PDFDoc_methods[] = { {"impose", (PyCFunction)py_impose, METH_VARARGS, "impose() -> impose pages onto each other" }, + {"dedup_images", (PyCFunction)py_dedup_images, METH_VARARGS, + "dedup_images() -> Remove duplicated images" + }, {"delete_pages", 
(PyCFunction)PDFDoc_delete_pages, METH_VARARGS, "delete_page(page_num, count=1) -> Delete the specified pages from the pdf." }, diff --git a/src/calibre/utils/podofo/fonts.cpp b/src/calibre/utils/podofo/fonts.cpp index daa9734b5e..dcb1d4564e 100644 --- a/src/calibre/utils/podofo/fonts.cpp +++ b/src/calibre/utils/podofo/fonts.cpp @@ -51,6 +51,7 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde if (f && f->IsDictionary()) { const PdfDictionary &font = f->GetDictionary(); PdfDictionary new_font = PdfDictionary(font); + bool changed = false; for (auto &k : font.GetKeys()) { if (k.second->IsReference()) { uint64_t key = ref_as_integer(k.second->GetReference()), r; @@ -59,9 +60,10 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde } catch (const std::out_of_range &err) { continue; } PdfReference new_ref(static_cast(r & 0xffffffff), r >> 32); new_font.AddKey(k.first.GetName(), new_ref); + changed = true; } } - resources.AddKey("Font", new_font); + if (changed) resources.AddKey("Font", new_font); } } diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h index b32fbd95e5..bbe04d9ab3 100644 --- a/src/calibre/utils/podofo/global.h +++ b/src/calibre/utils/podofo/global.h @@ -101,6 +101,7 @@ PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args); PyObject* py_merge_fonts(PDFDoc *self, PyObject *args); PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args); PyObject* py_impose(PDFDoc *self, PyObject *args); +PyObject* py_dedup_images(PDFDoc *self, PyObject *args); } } diff --git a/src/calibre/utils/podofo/images.cpp b/src/calibre/utils/podofo/images.cpp new file mode 100644 index 0000000000..fddb21b4b7 --- /dev/null +++ b/src/calibre/utils/podofo/images.cpp @@ -0,0 +1,112 @@ +/* + * images.cpp + * Copyright (C) 2019 Kovid Goyal + * + * Distributed under terms of the GPL3 license. 
+ */
+
+#include "global.h"
+
+using namespace pdf;
+
+// Holder for one image XObject: its reference, the Width/Height read from its
+// dictionary and the bytes returned by GetFilteredCopy(). Two images compare
+// equal when the dimensions and those bytes match. Copying is disabled (the
+// copy members are private and undefined); only moves are supported.
+// NOTE(review): other dictionary entries (e.g. ColorSpace, SMask) are not
+// compared -- confirm that is acceptable for the PDFs this runs on.
+class Image {
+    char *buf; pdf_long sz;
+    pdf_int64 width, height;
+    PdfReference ref;
+    Image( const Image & ) ;
+    Image & operator=( const Image & ) ;
+
+    public:
+        Image(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), width(0), height(0), ref(reference) {
+            // GetStream() on a const object can return NULL when the object
+            // has no stream; keep the empty buffer in that case.
+            const PdfStream *stream = o->GetStream();
+            if (stream) stream->GetFilteredCopy(&buf, &sz);
+            const PdfDictionary &dict = o->GetDictionary();
+            if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
+            if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
+        }
+        Image(Image &&other) noexcept :
+            buf(other.buf), sz(other.sz), width(other.width), height(other.height), ref(other.ref) {
+            // The moved-from image owns nothing, so its size must be zero too.
+            other.buf = NULL; other.sz = 0;
+        }
+        Image& operator=(Image &&other) noexcept {
+            // A self-move must not free the buffer that is about to be kept.
+            if (this == &other) return *this;
+            if (buf) podofo_free(buf);
+            buf = other.buf; other.buf = NULL;
+            sz = other.sz; other.sz = 0;
+            ref = other.ref;
+            width = other.width; height = other.height;
+            return *this;
+        }
+        ~Image() noexcept { if (buf) podofo_free(buf); buf = NULL; }
+        bool operator==(const Image &other) const noexcept {
+            if (other.sz != sz || other.width != width || other.height != height) return false;
+            // Empty buffers may be NULL, which must never reach memcmp().
+            return sz == 0 || memcmp(buf, other.buf, sz) == 0;
+        }
+        // Only the byte count is hashed; operator== settles collisions.
+        std::size_t hash() const noexcept { return sz; }
+        const PdfReference& reference() const noexcept { return ref; }
+};
+
+// Adapter that lets Image be the key of an unordered_map.
+struct ImageHasher {
+    std::size_t operator()(const Image& k) const { return k.hash(); }
+};
+
+// Hash functor so that a PdfReference can be the key of an unordered_map.
+// Only the object number is hashed; key equality still compares the whole
+// reference.
+struct ImageRefHasher {
+    std::size_t operator()(const PdfReference &r) const noexcept {
+        return static_cast<std::size_t>(r.ObjectNumber());
+    }
+};
+
+// First image seen with some content -> references of later identical images.
+typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
+
+
+// Remove image XObjects whose content duplicates an earlier image and repoint
+// XObject resource entries at the surviving copy.
+// Returns the number of removed objects as a Python integer. args is unused.
+// NOTE(review): only an XObject dictionary stored inline in an inline
+// Resources dictionary is rewritten -- confirm nothing else can still refer
+// to a removed image.
+static PyObject*
+dedup_images(PDFDoc *self, PyObject *args) {
+    unsigned long count = 0;
+    PdfVecObjects &objects = self->doc->GetObjects();
+    image_reference_map image_map;
+
+    // Pass 1: group images by content; the first one seen becomes canonical.
+    for (auto &k : objects) {
+        if (!k->IsDictionary()) continue;
+        const PdfDictionary &dict = k->GetDictionary();
+        if (!dictionary_has_key_name(dict, PdfName::KeyType, "XObject")) continue;
+        if (!dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) continue;
+        // Without stream data there is nothing to compare, leave it alone.
+        if (!k->HasStream()) continue;
+        Image img(k->Reference(), k);
+        auto it = image_map.find(img);
+        if (it == image_map.end()) image_map.emplace(std::move(img), std::vector<PdfReference>());
+        else it->second.push_back(img.reference());
+    }
+
+    // Pass 2: delete the duplicates, remembering duplicate -> canonical.
+    std::unordered_map<PdfReference, PdfReference, ImageRefHasher> ref_map;
+    for (auto &x : image_map) {
+        const PdfReference &canonical_ref = x.first.reference();
+        for (auto &ref : x.second) {
+            if (ref == canonical_ref) continue;
+            ref_map[ref] = canonical_ref;
+            delete objects.RemoveObject(ref);
+            count++;
+        }
+    }
+    if (count == 0) return Py_BuildValue("k", count);
+
+    // Pass 3: rewrite resource entries that still name a removed image.
+    for (auto &k : objects) {
+        if (!k->IsDictionary()) continue;
+        PdfDictionary &dict = k->GetDictionary();
+        if (!dict.HasKey("Resources") || !dict.GetKey("Resources")->IsDictionary()) continue;
+        PdfDictionary &resources = dict.GetKey("Resources")->GetDictionary();
+        if (!resources.HasKey("XObject") || !resources.GetKey("XObject")->IsDictionary()) continue;
+        const PdfDictionary &xobject = resources.GetKey("XObject")->GetDictionary();
+        // Edit a copy, because the keys of xobject are being iterated.
+        PdfDictionary new_xobject = PdfDictionary(xobject);
+        bool changed = false;
+        for (auto &x : xobject.GetKeys()) {
+            if (!x.second->IsReference()) continue;
+            auto hit = ref_map.find(x.second->GetReference());
+            if (hit == ref_map.end()) continue;
+            new_xobject.AddKey(x.first.GetName(), hit->second);
+            changed = true;
+        }
+        if (changed) resources.AddKey("XObject", new_xobject);
+    }
+
+    return Py_BuildValue("k", count);
+}
+
+// PYWRAP is defined outside this file; presumably it emits the
+// py_dedup_images entry point declared in global.h -- TODO confirm.
+PYWRAP(dedup_images)