mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Output: Run dedup in multiple passes. This catches identical images whose soft masks are themselves duplicates stored as separate image objects
This commit is contained in:
parent
93bec9e28b
commit
d91dd6c39b
@ -11,18 +11,21 @@
|
|||||||
|
|
||||||
using namespace pdf;
|
using namespace pdf;
|
||||||
|
|
||||||
|
typedef std::unordered_map<PdfReference, size_t, PdfReferenceHasher> hash_cache_map;
|
||||||
|
|
||||||
class Image {
|
class Image {
|
||||||
charbuff buf;
|
charbuff buf;
|
||||||
int64_t width, height;
|
int64_t width, height;
|
||||||
PdfReference ref;
|
PdfReference ref;
|
||||||
PdfReference smask;
|
PdfReference smask;
|
||||||
bool is_valid;
|
bool is_valid;
|
||||||
|
size_t content_hash, overall_hash;
|
||||||
|
|
||||||
Image( const Image & ) ;
|
Image( const Image & ) ;
|
||||||
Image & operator=( const Image & ) ;
|
Image & operator=( const Image & ) ;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Image(const PdfReference &reference, const PdfObject *o) : buf(), width(0), height(0), ref(reference) {
|
Image(const PdfReference &reference, const PdfObject *o, hash_cache_map &hash_cache) : buf(), width(0), height(0), ref(reference) {
|
||||||
const PdfObjectStream *stream = o->GetStream();
|
const PdfObjectStream *stream = o->GetStream();
|
||||||
try {
|
try {
|
||||||
buf = stream->GetCopySafe();
|
buf = stream->GetCopySafe();
|
||||||
@ -35,25 +38,33 @@ class Image {
|
|||||||
if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
|
if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
|
||||||
if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
|
if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
|
||||||
if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference();
|
if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference();
|
||||||
|
std::hash<std::string> s;
|
||||||
|
auto it = hash_cache.find(reference);
|
||||||
|
if (it == hash_cache.end()) {
|
||||||
|
content_hash = s(buf);
|
||||||
|
hash_cache.insert(std::make_pair(reference, content_hash));
|
||||||
|
} else {
|
||||||
|
content_hash = it->second;
|
||||||
|
}
|
||||||
|
overall_hash = s(std::to_string(width) + " " + std::to_string(height) + " " + smask.ToString() + " " + std::to_string(content_hash));
|
||||||
}
|
}
|
||||||
Image(Image &&other) noexcept :
|
Image(Image &&other) noexcept :
|
||||||
buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask) {
|
buf(std::move(other.buf)), width(other.width), height(other.height), ref(other.ref), smask(other.smask), content_hash(other.content_hash), overall_hash(other.overall_hash) {
|
||||||
other.buf = charbuff(); is_valid = other.is_valid;
|
other.buf = charbuff(); is_valid = other.is_valid;
|
||||||
}
|
}
|
||||||
Image& operator=(Image &&other) noexcept {
|
Image& operator=(Image &&other) noexcept {
|
||||||
buf = std::move(other.buf); other.buf = charbuff(); ref = other.ref;
|
buf = std::move(other.buf); other.buf = charbuff(); ref = other.ref;
|
||||||
width = other.width; height = other.height; is_valid = other.is_valid;
|
width = other.width; height = other.height; is_valid = other.is_valid;
|
||||||
smask = other.smask;
|
smask = other.smask; content_hash = other.content_hash; overall_hash = other.overall_hash;
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
bool operator==(const Image &other) const noexcept {
|
bool operator==(const Image &other) const noexcept {
|
||||||
return other.width == width && is_valid && other.is_valid && other.height == height && other.smask == smask && other.buf == buf;
|
return other.width == width && is_valid && other.is_valid && other.height == height && other.smask == smask && other.buf == buf;
|
||||||
}
|
}
|
||||||
std::size_t hash() const noexcept { return buf.size(); }
|
std::size_t hash() const noexcept { return overall_hash; }
|
||||||
const PdfReference& reference() const noexcept { return ref; }
|
const PdfReference& reference() const noexcept { return ref; }
|
||||||
std::string ToString() const {
|
std::string ToString() const {
|
||||||
std::hash<std::string> s;
|
return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(content_hash) + ")";
|
||||||
return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(s(buf)) + ")";
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -63,9 +74,8 @@ struct ImageHasher {
|
|||||||
|
|
||||||
typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
|
typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
|
||||||
|
|
||||||
|
static unsigned long
|
||||||
static PyObject*
|
run_one_dedup_pass(PDFDoc *self, hash_cache_map &hash_cache) {
|
||||||
dedup_images(PDFDoc *self, PyObject *args) {
|
|
||||||
unsigned long count = 0;
|
unsigned long count = 0;
|
||||||
PdfIndirectObjectList &objects = self->doc->GetObjects();
|
PdfIndirectObjectList &objects = self->doc->GetObjects();
|
||||||
image_reference_map image_map;
|
image_reference_map image_map;
|
||||||
@ -74,7 +84,7 @@ dedup_images(PDFDoc *self, PyObject *args) {
|
|||||||
if (!k->IsDictionary()) continue;
|
if (!k->IsDictionary()) continue;
|
||||||
const PdfDictionary &dict = k->GetDictionary();
|
const PdfDictionary &dict = k->GetDictionary();
|
||||||
if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
|
if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
|
||||||
Image img(object_as_reference(k), k);
|
Image img(object_as_reference(k), k, hash_cache);
|
||||||
auto it = image_map.find(img);
|
auto it = image_map.find(img);
|
||||||
if (it == image_map.end()) {
|
if (it == image_map.end()) {
|
||||||
std::vector<PdfReference> vals;
|
std::vector<PdfReference> vals;
|
||||||
@ -124,6 +134,15 @@ dedup_images(PDFDoc *self, PyObject *args) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
dedup_images(PDFDoc *self, PyObject *args) {
|
||||||
|
unsigned long count = 0;
|
||||||
|
hash_cache_map hash_cache;
|
||||||
|
count += run_one_dedup_pass(self, hash_cache);
|
||||||
|
count += run_one_dedup_pass(self, hash_cache);
|
||||||
return Py_BuildValue("k", count);
|
return Py_BuildValue("k", count);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user