PDF Output: Run image dedup in multiple passes. This catches images that are identical except that their soft masks are separate image objects which are themselves duplicates of each other

This commit is contained in:
Kovid Goyal 2023-09-15 14:17:22 +05:30
parent 93bec9e28b
commit d91dd6c39b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -11,18 +11,21 @@
using namespace pdf; using namespace pdf;
typedef std::unordered_map<PdfReference, size_t, PdfReferenceHasher> hash_cache_map;
class Image { class Image {
charbuff buf; charbuff buf;
int64_t width, height; int64_t width, height;
PdfReference ref; PdfReference ref;
PdfReference smask; PdfReference smask;
bool is_valid; bool is_valid;
size_t content_hash, overall_hash;
Image( const Image & ) ; Image( const Image & ) ;
Image & operator=( const Image & ) ; Image & operator=( const Image & ) ;
public: public:
Image(const PdfReference &reference, const PdfObject *o) : buf(), width(0), height(0), ref(reference) { Image(const PdfReference &reference, const PdfObject *o, hash_cache_map &hash_cache) : buf(), width(0), height(0), ref(reference) {
const PdfObjectStream *stream = o->GetStream(); const PdfObjectStream *stream = o->GetStream();
try { try {
buf = stream->GetCopySafe(); buf = stream->GetCopySafe();
@ -35,25 +38,33 @@ class Image {
if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber(); if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber(); if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference(); if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference();
std::hash<std::string> s;
auto it = hash_cache.find(reference);
if (it == hash_cache.end()) {
content_hash = s(buf);
hash_cache.insert(std::make_pair(reference, content_hash));
} else {
content_hash = it->second;
}
overall_hash = s(std::to_string(width) + " " + std::to_string(height) + " " + smask.ToString() + " " + std::to_string(content_hash));
} }
Image(Image &&other) noexcept : Image(Image &&other) noexcept :
buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask) { buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask), content_hash(other.content_hash), overall_hash(other.overall_hash) {
other.buf = charbuff(); is_valid = other.is_valid; other.buf = charbuff(); is_valid = other.is_valid;
} }
Image& operator=(Image &&other) noexcept { Image& operator=(Image &&other) noexcept {
buf = std::move(other.buf); other.buf = charbuff(); ref = other.ref; buf = std::move(other.buf); other.buf = charbuff(); ref = other.ref;
width = other.width; height = other.height; is_valid = other.is_valid; width = other.width; height = other.height; is_valid = other.is_valid;
smask = other.smask; smask = other.smask; content_hash = other.content_hash; overall_hash = other.overall_hash;
return *this; return *this;
} }
bool operator==(const Image &other) const noexcept { bool operator==(const Image &other) const noexcept {
return other.width == width && is_valid && other.is_valid && other.height == height && other.smask == smask && other.buf == buf; return other.width == width && is_valid && other.is_valid && other.height == height && other.smask == smask && other.buf == buf;
} }
std::size_t hash() const noexcept { return buf.size(); } std::size_t hash() const noexcept { return overall_hash; }
const PdfReference& reference() const noexcept { return ref; } const PdfReference& reference() const noexcept { return ref; }
std::string ToString() const { std::string ToString() const {
std::hash<std::string> s; return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(content_hash) + ")";
return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(s(buf)) + ")";
} }
}; };
@ -63,9 +74,8 @@ struct ImageHasher {
typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map; typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
static unsigned long
static PyObject* run_one_dedup_pass(PDFDoc *self, hash_cache_map &hash_cache) {
dedup_images(PDFDoc *self, PyObject *args) {
unsigned long count = 0; unsigned long count = 0;
PdfIndirectObjectList &objects = self->doc->GetObjects(); PdfIndirectObjectList &objects = self->doc->GetObjects();
image_reference_map image_map; image_reference_map image_map;
@ -74,7 +84,7 @@ dedup_images(PDFDoc *self, PyObject *args) {
if (!k->IsDictionary()) continue; if (!k->IsDictionary()) continue;
const PdfDictionary &dict = k->GetDictionary(); const PdfDictionary &dict = k->GetDictionary();
if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) { if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
Image img(object_as_reference(k), k); Image img(object_as_reference(k), k, hash_cache);
auto it = image_map.find(img); auto it = image_map.find(img);
if (it == image_map.end()) { if (it == image_map.end()) {
std::vector<PdfReference> vals; std::vector<PdfReference> vals;
@ -124,6 +134,15 @@ dedup_images(PDFDoc *self, PyObject *args) {
} }
} }
} }
return count;
}
static PyObject*
dedup_images(PDFDoc *self, PyObject *args) {
unsigned long count = 0;
hash_cache_map hash_cache;
count += run_one_dedup_pass(self, hash_cache);
count += run_one_dedup_pass(self, hash_cache);
return Py_BuildValue("k", count); return Py_BuildValue("k", count);
} }