De-dup images in the PDF

This commit is contained in:
Kovid Goyal 2019-07-29 17:49:09 +05:30
parent bdebe91156
commit 0f02312390
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 124 additions and 3 deletions

View File

@ -116,7 +116,7 @@
},
{
"name": "podofo",
"sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/podofo.cpp",
"sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/images.cpp calibre/utils/podofo/podofo.cpp",
"headers": "calibre/utils/podofo/global.h",
"libraries": "podofo",
"lib_dirs": "!podofo_lib",

View File

@ -1066,13 +1066,16 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
if num_removed:
log('Removed', num_removed, 'duplicated Type3 glyphs')
# TODO: dedup images
# TODO: Support for mathematics
num_removed = remove_unused_fonts(pdf_doc)
if num_removed:
log('Removed', num_removed, 'unused fonts')
num_removed = pdf_doc.dedup_images()
if num_removed:
log('Removed', num_removed, 'duplicate images')
if cover_data:
add_cover(pdf_doc, cover_data, page_layout, opts)

View File

@ -759,6 +759,9 @@ static PyMethodDef PDFDoc_methods[] = {
{"impose", (PyCFunction)py_impose, METH_VARARGS,
"impose() -> impose pages onto each other"
},
{"dedup_images", (PyCFunction)py_dedup_images, METH_VARARGS,
"dedup_images() -> Remove duplicated images"
},
{"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
"delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
},

View File

@ -51,6 +51,7 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde
if (f && f->IsDictionary()) {
const PdfDictionary &font = f->GetDictionary();
PdfDictionary new_font = PdfDictionary(font);
bool changed = false;
for (auto &k : font.GetKeys()) {
if (k.second->IsReference()) {
uint64_t key = ref_as_integer(k.second->GetReference()), r;
@ -59,9 +60,10 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde
} catch (const std::out_of_range &err) { continue; }
PdfReference new_ref(static_cast<uint32_t>(r & 0xffffffff), r >> 32);
new_font.AddKey(k.first.GetName(), new_ref);
changed = true;
}
}
resources.AddKey("Font", new_font);
if (changed) resources.AddKey("Font", new_font);
}
}

View File

@ -101,6 +101,7 @@ PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args);
PyObject* py_merge_fonts(PDFDoc *self, PyObject *args);
PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
PyObject* py_impose(PDFDoc *self, PyObject *args);
PyObject* py_dedup_images(PDFDoc *self, PyObject *args);
}
}

View File

@ -0,0 +1,112 @@
/*
* images.cpp
* Copyright (C) 2019 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#include "global.h"
using namespace pdf;
// Move-only snapshot of one image XObject, used as the key when looking for
// duplicate images.
//
// It owns a buffer filled in by PdfStream::GetFilteredCopy() (released with
// podofo_free()) and remembers the /Width and /Height numbers plus the
// reference of the object the data came from.
//
// Two Images compare equal when their buffer sizes, widths, heights and
// buffer bytes all match. No other dictionary entry (colour space, mask
// references, ...) takes part in the comparison.
class Image {
    char *buf; pdf_long sz;
    pdf_int64 width, height;
    PdfReference ref;
    // The buffer has a single owner, so copying is disabled.
    Image( const Image & ) = delete;
    Image & operator=( const Image & ) = delete;

public:
    // Capture the stream data and dimensions of the image object `o`, whose
    // indirect reference is `reference`. Missing or non-numeric /Width and
    // /Height entries are recorded as 0.
    Image(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), width(0), height(0), ref(reference) {
        const PdfStream *stream = o->GetStream();
        // An image dictionary that carries no stream is treated as empty
        // data instead of dereferencing a null stream pointer.
        if (stream) stream->GetFilteredCopy(&buf, &sz);
        const PdfDictionary &dict = o->GetDictionary();
        if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
        if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
    }

    // Take over the buffer of `other`, leaving it empty (NULL buffer, size 0).
    Image(Image &&other) noexcept :
        buf(other.buf), sz(other.sz), width(other.width), height(other.height), ref(other.ref) {
        other.buf = NULL;
        other.sz = 0;
    }

    // Release the current buffer and take over the one owned by `other`.
    Image& operator=(Image &&other) noexcept {
        // Without this guard a self-move would free the buffer we are about
        // to keep and then null it out, silently discarding the image data.
        if (this != &other) {
            if (buf) podofo_free(buf);
            buf = other.buf; sz = other.sz; ref = other.ref;
            width = other.width; height = other.height;
            other.buf = NULL;
            other.sz = 0;
        }
        return *this;
    }

    ~Image() noexcept { if (buf) podofo_free(buf); buf = NULL; }

    bool operator==(const Image &other) const noexcept {
        if (other.sz != sz || other.width != width || other.height != height) return false;
        // Empty buffers may be NULL, which must not be handed to memcmp.
        if (sz == 0 || buf == NULL || other.buf == NULL) return buf == other.buf || sz == 0;
        return memcmp(buf, other.buf, sz) == 0;
    }

    // Cheap hash: only the buffer size. Equal images always hash equally;
    // operator== does the byte-wise comparison.
    std::size_t hash() const noexcept { return sz; }

    // Reference of the PDF object this image data was read from.
    const PdfReference& reference() const noexcept { return ref; }
};
struct ImageHasher {
std::size_t operator()(const Image& k) const { return k.hash(); }
};
// Maps an image (keyed by its content) to a list of PDF object references.
using image_reference_map = std::unordered_map<Image, std::vector<PdfReference>, ImageHasher>;
// Remove duplicated image XObjects from the document.
//
// Pass 1 groups every object whose dictionary has /Type /XObject and
//        /Subtype /Image by content; the first object seen with a given
//        content becomes the canonical copy.
// Pass 2 removes every later duplicate from the document's object list and
//        records a "removed reference -> canonical reference" mapping.
// Pass 3 rewrites references to removed objects in the remaining
//        dictionaries: entries of inline /Resources -> /XObject
//        dictionaries, and /SMask and /Mask entries.
//
// `args` is unused. Returns a Python integer (built with format "k") holding
// the number of objects removed.
//
// NOTE(review): a /Resources or /XObject entry that is itself an indirect
// reference (rather than an inline dictionary) is not followed here —
// confirm the documents being processed never use that form.
static PyObject*
dedup_images(PDFDoc *self, PyObject *args) {
    unsigned long count = 0;
    PdfVecObjects &objects = self->doc->GetObjects();

    // Pass 1: bucket image objects by content.
    image_reference_map image_map;
    for (auto &k : objects) {
        if (!k->IsDictionary()) continue;
        const PdfDictionary &dict = k->GetDictionary();
        if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
            Image img(k->Reference(), k);
            auto it = image_map.find(img);
            if (it == image_map.end()) {
                std::vector<PdfReference> vals;
                image_map.insert(std::make_pair(std::move(img), std::move(vals)));
            } else (*it).second.push_back(img.reference());
        }
    }

    // Pass 2: drop the duplicates, remembering what replaces each of them.
    std::unordered_map<PdfReference, PdfReference, PdfReferenceHasher> ref_map;
    for (auto &x : image_map) {
        if (x.second.empty()) continue;
        const PdfReference &canonical_ref = x.first.reference();
        for (auto &ref : x.second) {
            if (ref != canonical_ref) {
                ref_map[ref] = canonical_ref;
                delete objects.RemoveObject(ref);
                count++;
            }
        }
    }

    // Pass 3: repoint everything that still refers to a removed object.
    if (count > 0) {
        static const char* const mask_keys[] = {"SMask", "Mask"};
        for (auto &k : objects) {
            if (!k->IsDictionary()) continue;
            PdfDictionary &dict = k->GetDictionary();

            // Mask images are image XObjects too, so they can be removed as
            // duplicates. Any dictionary still pointing at a removed one
            // through /SMask or /Mask must be redirected to the surviving
            // copy, otherwise it is left with a dangling reference.
            for (const char *mask_key : mask_keys) {
                if (!dict.HasKey(mask_key)) continue;
                const PdfObject *mask = dict.GetKey(mask_key);
                if (!mask->IsReference()) continue;
                auto found = ref_map.find(mask->GetReference());
                if (found != ref_map.end()) dict.AddKey(mask_key, found->second);
            }

            if (!dict.HasKey("Resources") || !dict.GetKey("Resources")->IsDictionary()) continue;
            PdfDictionary &resources = dict.GetKey("Resources")->GetDictionary();
            if (!resources.HasKey("XObject") || !resources.GetKey("XObject")->IsDictionary()) continue;
            const PdfDictionary &xobject = resources.GetKey("XObject")->GetDictionary();
            // Edit a copy so the dictionary being iterated is never modified.
            PdfDictionary new_xobject = PdfDictionary(xobject);
            bool changed = false;
            for (auto &x : xobject.GetKeys()) {
                if (!x.second->IsReference()) continue;
                // A miss is the common case, so use find() instead of at()
                // and an exception handler.
                auto found = ref_map.find(x.second->GetReference());
                if (found == ref_map.end()) continue;
                new_xobject.AddKey(x.first.GetName(), found->second);
                changed = true;
            }
            if (changed) resources.AddKey("XObject", new_xobject);
        }
    }
    return Py_BuildValue("k", count);
}
// NOTE(review): PYWRAP is defined outside this file; presumably it generates
// the py_dedup_images entry point (declared in global.h and registered in
// PDFDoc_methods) as a wrapper around dedup_images — confirm against the
// macro definition.
PYWRAP(dedup_images)