From d91dd6c39b24b55ffa20dc08055137ee0515146d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 15 Sep 2023 14:17:22 +0530 Subject: [PATCH] PDF Output: Run dedup in multiple passes. This catches images that have identical duplicated soft masks that are different images --- src/calibre/utils/podofo/images.cpp | 39 +++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/calibre/utils/podofo/images.cpp b/src/calibre/utils/podofo/images.cpp index 64258265bb..6337deecd9 100644 --- a/src/calibre/utils/podofo/images.cpp +++ b/src/calibre/utils/podofo/images.cpp @@ -11,18 +11,21 @@ using namespace pdf; +typedef std::unordered_map hash_cache_map; + class Image { charbuff buf; int64_t width, height; PdfReference ref; PdfReference smask; bool is_valid; + size_t content_hash, overall_hash; Image( const Image & ) ; Image & operator=( const Image & ) ; public: - Image(const PdfReference &reference, const PdfObject *o) : buf(), width(0), height(0), ref(reference) { + Image(const PdfReference &reference, const PdfObject *o, hash_cache_map &hash_cache) : buf(), width(0), height(0), ref(reference) { const PdfObjectStream *stream = o->GetStream(); try { buf = stream->GetCopySafe(); @@ -35,25 +38,33 @@ class Image { if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber(); if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber(); if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference(); + std::hash s; + auto it = hash_cache.find(reference); + if (it == hash_cache.end()) { + content_hash = s(buf); + hash_cache.insert(std::make_pair(reference, content_hash)); + } else { + content_hash = it->second; + } + overall_hash = s(std::to_string(width) + " " + std::to_string(height) + " " + smask.ToString() + " " + std::to_string(content_hash)); } Image(Image &&other) noexcept : - buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask) { + buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask), content_hash(other.content_hash), overall_hash(other.overall_hash) { other.buf = charbuff(); is_valid = other.is_valid; } Image& operator=(Image &&other) noexcept { buf = std::move(other.buf); other.buf = charbuff(); ref = other.ref; width = other.width; height = other.height; is_valid = other.is_valid; - smask = other.smask; + smask = other.smask; content_hash = other.content_hash; overall_hash = other.overall_hash; return *this; } bool operator==(const Image &other) const noexcept { return other.width == width && is_valid && other.is_valid && other.height == height && other.smask == smask && other.buf == buf; } - std::size_t hash() const noexcept { return buf.size(); } + std::size_t hash() const noexcept { return overall_hash; } const PdfReference& reference() const noexcept { return ref; } std::string ToString() const { - std::hash s; - return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(s(buf)) + ")"; + return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(content_hash) + ")"; } }; @@ -63,9 +74,8 @@ struct ImageHasher { typedef std::unordered_map, ImageHasher> image_reference_map; - -static PyObject* -dedup_images(PDFDoc *self, PyObject *args) { +static unsigned long +run_one_dedup_pass(PDFDoc *self, hash_cache_map &hash_cache) { unsigned long count = 0; PdfIndirectObjectList &objects = self->doc->GetObjects(); image_reference_map image_map; @@ -74,7 +84,7 @@ dedup_images(PDFDoc *self, PyObject *args) { if (!k->IsDictionary()) continue; const PdfDictionary &dict = k->GetDictionary(); if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) { - Image img(object_as_reference(k), k); + Image img(object_as_reference(k), k, hash_cache); auto it = image_map.find(img); if (it == image_map.end()) { std::vector vals; @@ -124,6 +134,15 @@ dedup_images(PDFDoc *self, PyObject *args) { } } } + return count; +} + +static PyObject* +dedup_images(PDFDoc *self, PyObject *args) { + unsigned long count = 0; + hash_cache_map hash_cache; + count += run_one_dedup_pass(self, hash_cache); + count += run_one_dedup_pass(self, hash_cache); return Py_BuildValue("k", count); }