mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-07 09:01:38 -04:00
De-dup images in the PDF
This commit is contained in:
parent
bdebe91156
commit
0f02312390
@ -116,7 +116,7 @@
|
||||
},
|
||||
{
|
||||
"name": "podofo",
|
||||
"sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/podofo.cpp",
|
||||
"sources": "calibre/utils/podofo/utils.cpp calibre/utils/podofo/output.cpp calibre/utils/podofo/doc.cpp calibre/utils/podofo/outline.cpp calibre/utils/podofo/fonts.cpp calibre/utils/podofo/impose.cpp calibre/utils/podofo/images.cpp calibre/utils/podofo/podofo.cpp",
|
||||
"headers": "calibre/utils/podofo/global.h",
|
||||
"libraries": "podofo",
|
||||
"lib_dirs": "!podofo_lib",
|
||||
|
@ -1066,13 +1066,16 @@ def convert(opf_path, opts, metadata=None, output_path=None, log=default_log, co
|
||||
if num_removed:
|
||||
log('Removed', num_removed, 'duplicated Type3 glyphs')
|
||||
|
||||
# TODO: dedup images
|
||||
# TODO: Support for mathematics
|
||||
|
||||
num_removed = remove_unused_fonts(pdf_doc)
|
||||
if num_removed:
|
||||
log('Removed', num_removed, 'unused fonts')
|
||||
|
||||
num_removed = pdf_doc.dedup_images()
|
||||
if num_removed:
|
||||
log('Removed', num_removed, 'duplicate images')
|
||||
|
||||
if cover_data:
|
||||
add_cover(pdf_doc, cover_data, page_layout, opts)
|
||||
|
||||
|
@ -759,6 +759,9 @@ static PyMethodDef PDFDoc_methods[] = {
|
||||
{"impose", (PyCFunction)py_impose, METH_VARARGS,
|
||||
"impose() -> impose pages onto each other"
|
||||
},
|
||||
{"dedup_images", (PyCFunction)py_dedup_images, METH_VARARGS,
|
||||
"dedup_images() -> Remove duplicated images"
|
||||
},
|
||||
{"delete_pages", (PyCFunction)PDFDoc_delete_pages, METH_VARARGS,
|
||||
"delete_page(page_num, count=1) -> Delete the specified pages from the pdf."
|
||||
},
|
||||
|
@ -51,6 +51,7 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde
|
||||
if (f && f->IsDictionary()) {
|
||||
const PdfDictionary &font = f->GetDictionary();
|
||||
PdfDictionary new_font = PdfDictionary(font);
|
||||
bool changed = false;
|
||||
for (auto &k : font.GetKeys()) {
|
||||
if (k.second->IsReference()) {
|
||||
uint64_t key = ref_as_integer(k.second->GetReference()), r;
|
||||
@ -59,9 +60,10 @@ replace_font_references_in_resources(PdfDictionary &resources, const std::unorde
|
||||
} catch (const std::out_of_range &err) { continue; }
|
||||
PdfReference new_ref(static_cast<uint32_t>(r & 0xffffffff), r >> 32);
|
||||
new_font.AddKey(k.first.GetName(), new_ref);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
resources.AddKey("Font", new_font);
|
||||
if (changed) resources.AddKey("Font", new_font);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -101,6 +101,7 @@ PyObject* py_remove_unused_fonts(PDFDoc *self, PyObject *args);
|
||||
PyObject* py_merge_fonts(PDFDoc *self, PyObject *args);
|
||||
PyObject* py_dedup_type3_fonts(PDFDoc *self, PyObject *args);
|
||||
PyObject* py_impose(PDFDoc *self, PyObject *args);
|
||||
PyObject* py_dedup_images(PDFDoc *self, PyObject *args);
|
||||
}
|
||||
}
|
||||
|
||||
|
112
src/calibre/utils/podofo/images.cpp
Normal file
112
src/calibre/utils/podofo/images.cpp
Normal file
@ -0,0 +1,112 @@
|
||||
/*
|
||||
* images.cpp
|
||||
* Copyright (C) 2019 Kovid Goyal <kovid at kovidgoyal.net>
|
||||
*
|
||||
* Distributed under terms of the GPL3 license.
|
||||
*/
|
||||
|
||||
#include "global.h"
|
||||
|
||||
using namespace pdf;
|
||||
|
||||
class Image {
|
||||
char *buf; pdf_long sz;
|
||||
pdf_int64 width, height;
|
||||
PdfReference ref;
|
||||
Image( const Image & ) ;
|
||||
Image & operator=( const Image & ) ;
|
||||
|
||||
public:
|
||||
Image(const PdfReference &reference, const PdfObject *o) : buf(NULL), sz(0), width(0), height(0), ref(reference) {
|
||||
const PdfStream *stream = o->GetStream();
|
||||
stream->GetFilteredCopy(&buf, &sz);
|
||||
const PdfDictionary &dict = o->GetDictionary();
|
||||
if (dict.HasKey("Width") && dict.GetKey("Width")->IsNumber()) width = dict.GetKey("Width")->GetNumber();
|
||||
if (dict.HasKey("Height") && dict.GetKey("Height")->IsNumber()) height = dict.GetKey("Height")->GetNumber();
|
||||
}
|
||||
Image(Image &&other) noexcept :
|
||||
buf(other.buf), sz(other.sz), width(other.width), height(other.height), ref(other.ref) {
|
||||
other.buf = NULL;
|
||||
}
|
||||
Image& operator=(Image &&other) noexcept {
|
||||
if (buf) podofo_free(buf);
|
||||
buf = other.buf; other.buf = NULL; sz = other.sz; ref = other.ref;
|
||||
width = other.width; height = other.height;
|
||||
return *this;
|
||||
}
|
||||
~Image() noexcept { if (buf) podofo_free(buf); buf = NULL; }
|
||||
bool operator==(const Image &other) const noexcept {
|
||||
return other.sz == sz && other.width == width && other.height == height && memcmp(buf, other.buf, sz) == 0;
|
||||
}
|
||||
std::size_t hash() const noexcept { return sz; }
|
||||
const PdfReference& reference() const noexcept { return ref; }
|
||||
};
|
||||
|
||||
struct ImageHasher {
|
||||
std::size_t operator()(const Image& k) const { return k.hash(); }
|
||||
};
|
||||
|
||||
// Keyed by image content (see Image::operator==); the mapped vector collects
// the references of further objects whose image compares equal to the key.
typedef std::unordered_map<Image, std::vector<PdfReference>, ImageHasher> image_reference_map;
|
||||
|
||||
|
||||
// Remove duplicated image XObjects from the document.
//
// Works in three passes:
//   1. Group every object whose dictionary has /Type /XObject and
//      /Subtype /Image by content; the first image seen in a group is the
//      canonical one, later equal images are recorded as duplicates.
//   2. Delete each duplicate object and remember duplicate -> canonical.
//   3. Rewrite the entries of /Resources /XObject dictionaries that still
//      point at a deleted duplicate so they point at the canonical image.
//
// `args` is unused. Returns a Python integer (format "k"): the number of image
// objects that were removed.
//
// NOTE(review): pass 3 only rewrites /Resources and /XObject dictionaries that
// are stored inline in their parent. If either is held as an indirect
// reference it is skipped and would keep pointing at a deleted object —
// confirm the PDFs this runs on never do that.
static PyObject*
dedup_images(PDFDoc *self, PyObject *args) {
    unsigned long count = 0;
    PdfVecObjects &objects = self->doc->GetObjects();
    image_reference_map image_map;

    // Pass 1: bucket images by content.
    for (auto &k : objects) {
        if (!k->IsDictionary()) continue;
        const PdfDictionary &dict = k->GetDictionary();
        if (dictionary_has_key_name(dict, PdfName::KeyType, "XObject") && dictionary_has_key_name(dict, PdfName::KeySubtype, "Image")) {
            Image img(k->Reference(), k);
            auto it = image_map.find(img);
            if (it == image_map.end()) {
                // First time this content is seen: it becomes the canonical copy.
                std::vector<PdfReference> vals;
                image_map.insert(std::make_pair(std::move(img), std::move(vals)));
            } else (*it).second.push_back(img.reference());
        }
    }

    // Pass 2: drop the duplicates and record where each one should now point.
    std::unordered_map<PdfReference, PdfReference, PdfReferenceHasher> ref_map;
    for (auto &x : image_map) {
        if (x.second.empty()) continue;
        const PdfReference &canonical_ref = x.first.reference();
        for (auto &ref : x.second) {
            if (ref != canonical_ref) {
                ref_map[ref] = canonical_ref;
                delete objects.RemoveObject(ref);
                count++;
            }
        }
    }

    // Pass 3: re-point XObject resource entries at the surviving images.
    if (count > 0) {
        for (auto &k : objects) {
            if (!k->IsDictionary()) continue;
            PdfDictionary &dict = k->GetDictionary();
            if (!dict.HasKey("Resources") || !dict.GetKey("Resources")->IsDictionary()) continue;
            PdfDictionary &resources = dict.GetKey("Resources")->GetDictionary();
            if (!resources.HasKey("XObject") || !resources.GetKey("XObject")->IsDictionary()) continue;
            const PdfDictionary &xobject = resources.GetKey("XObject")->GetDictionary();
            // Edit a copy so the dictionary being iterated is never modified.
            PdfDictionary new_xobject = PdfDictionary(xobject);
            bool changed = false;
            for (auto &x : xobject.GetKeys()) {
                if (!x.second->IsReference()) continue;
                // Most entries are not duplicates, so look them up with find()
                // rather than paying for a thrown exception on every miss.
                auto found = ref_map.find(x.second->GetReference());
                if (found == ref_map.end()) continue;
                const PdfReference &r = found->second;
                new_xobject.AddKey(x.first.GetName(), r);
                changed = true;
            }
            if (changed) resources.AddKey("XObject", new_xobject);
        }
    }

    return Py_BuildValue("k", count);

}
|
||||
|
||||
// PYWRAP is defined outside this file; presumably it emits the
// py_dedup_images() entry point that is declared in global.h and registered
// in the PDFDoc method table — TODO confirm against the macro definition.
PYWRAP(dedup_images)
|
Loading…
x
Reference in New Issue
Block a user