mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
PDF Output: Run dedup in multiple passes. This catches images whose soft masks are identical duplicates stored as separate image objects
This commit is contained in:
parent
93bec9e28b
commit
d91dd6c39b
@ -11,18 +11,21 @@
|
||||
|
||||
using namespace pdf;
|
||||
|
||||
// Cache mapping an image object's PdfReference to the hash of its stream
// buffer, so an Image built again for the same reference reuses the stored
// hash instead of re-hashing the buffer.
typedef std::unordered_map<PdfReference, size_t, PdfReferenceHasher> hash_cache_map;
|
||||
|
||||
class Image {
|
||||
charbuff buf;
|
||||
int64_t width, height;
|
||||
PdfReference ref;
|
||||
PdfReference smask;
|
||||
bool is_valid;
|
||||
size_t content_hash, overall_hash;
|
||||
|
||||
Image( const Image & ) ;
|
||||
Image & operator=( const Image & ) ;
|
||||
|
||||
public:
|
||||
Image(const PdfReference &reference, const PdfObject *o) : buf(), width(0), height(0), ref(reference) {
|
||||
Image(const PdfReference &reference, const PdfObject *o, hash_cache_map &hash_cache) : buf(), width(0), height(0), ref(reference) {
|
||||
const PdfObjectStream *stream = o->GetStream();
|
||||
try {
|
||||
buf = stream->GetCopySafe();
|
||||
@ -35,25 +38,33 @@ class Image {
|
||||
if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
|
||||
if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
|
||||
if (dict.HasKey("SMask") && dict.GetKey("SMask")->IsReference()) smask = dict.GetKey("SMask")->GetReference();
|
||||
std::hash<std::string> s;
|
||||
auto it = hash_cache.find(reference);
|
||||
if (it == hash_cache.end()) {
|
||||
content_hash = s(buf);
|
||||
hash_cache.insert(std::make_pair(reference, content_hash));
|
||||
} else {
|
||||
content_hash = it->second;
|
||||
}
|
||||
overall_hash = s(std::to_string(width) + " " + std::to_string(height) + " " + smask.ToString() + " " + std::to_string(content_hash));
|
||||
}
|
||||
// Move constructor: takes over other's stream buffer and copies its
// dimensions, references, validity flag and both cached hashes.
// Members are initialised in declaration order; the source buffer is then
// explicitly reset to an empty charbuff.
Image(Image &&other) noexcept :
    buf(std::move(other.buf)), width(other.width), height(other.height),
    ref(other.ref), smask(other.smask), is_valid(other.is_valid),
    content_hash(other.content_hash), overall_hash(other.overall_hash)
{
    other.buf = charbuff();
}
|
||||
// Move assignment: takes over other's stream buffer (resetting the source
// buffer to an empty charbuff) and copies its references, dimensions,
// validity flag and both cached hashes. Returns *this.
Image& operator=(Image &&other) noexcept {
    // Self-assignment guard: without it, `img = std::move(img)` would reset
    // buf below while the cached hashes still described the old contents.
    if (this == &other) return *this;
    buf = std::move(other.buf);
    other.buf = charbuff();
    ref = other.ref;
    smask = other.smask;
    width = other.width;
    height = other.height;
    is_valid = other.is_valid;
    content_hash = other.content_hash;
    overall_hash = other.overall_hash;
    return *this;
}
|
||||
// Two images compare equal only when both are flagged valid and their width,
// height, soft-mask reference and stream buffer all match. The buffer
// comparison is performed last, after every other check has passed.
bool operator==(const Image &other) const noexcept {
    if (other.width != width) return false;
    if (!is_valid) return false;
    if (!other.is_valid) return false;
    if (other.height != height) return false;
    if (!(other.smask == smask)) return false;
    return other.buf == buf;
}
|
||||
std::size_t hash() const noexcept { return buf.size(); }
|
||||
std::size_t hash() const noexcept { return overall_hash; }
|
||||
// Returns the PdfReference stored in this Image (the ref member).
const PdfReference& reference() const noexcept { return ref; }
|
||||
std::string ToString() const {
|
||||
std::hash<std::string> s;
|
||||
return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(s(buf)) + ")";
|
||||
return "Image(ref=" + ref.ToString() + ", width="s + std::to_string(width) + ", height="s + std::to_string(height) + ", smask="s + smask.ToString() + ", digest=" + std::to_string(content_hash) + ")";
|
||||
}
|
||||
};
|
||||
|
||||
@ -63,9 +74,8 @@ struct ImageHasher {
|
||||
|
||||
// Map keyed by Image (hashed through ImageHasher) whose values are lists of
// PdfReferences. run_one_dedup_pass looks each image up in this map;
// presumably each list collects the references of duplicates of the key
// image -- confirm against the full function.
typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
|
||||
|
||||
|
||||
static PyObject*
|
||||
dedup_images(PDFDoc *self, PyObject *args) {
|
||||
static unsigned long
|
||||
run_one_dedup_pass(PDFDoc *self, hash_cache_map &hash_cache) {
|
||||
unsigned long count = 0;
|
||||
PdfIndirectObjectList &objects = self->doc->GetObjects();
|
||||
image_reference_map image_map;
|
||||
@ -74,7 +84,7 @@ dedup_images(PDFDoc *self, PyObject *args) {
|
||||
if (!k->IsDictionary()) continue;
|
||||
const PdfDictionary &dict = k->GetDictionary();
|
||||
if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
|
||||
Image img(object_as_reference(k), k);
|
||||
Image img(object_as_reference(k), k, hash_cache);
|
||||
auto it = image_map.find(img);
|
||||
if (it == image_map.end()) {
|
||||
std::vector<PdfReference> vals;
|
||||
@ -124,6 +134,15 @@ dedup_images(PDFDoc *self, PyObject *args) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// Runs the image dedup pass twice over the document and returns the sum of
// the counts reported by both passes as a Python object (format "k",
// i.e. unsigned long).
// Both passes share one hash_cache, so content hashes stored during the
// first pass are looked up rather than recomputed in the second.
// The second pass is presumably what lets images match once their duplicated
// soft masks have been merged by the first -- confirm against
// run_one_dedup_pass.
static PyObject*
dedup_images(PDFDoc *self, PyObject *args) {
    hash_cache_map hash_cache;
    unsigned long total = 0;
    for (int pass = 0; pass < 2; ++pass) {
        total += run_one_dedup_pass(self, hash_cache);
    }
    return Py_BuildValue("k", total);
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user