From 0517092f52180e4aaaaba3723b187c2a10345fd1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 14 May 2023 07:35:01 +0530 Subject: [PATCH] Get PDF metadata setting working again Bypass the new podofo GetMetadata machinery as it completely clobbers custom XML metadata. --- src/calibre/utils/podofo/__init__.py | 27 ++++---- src/calibre/utils/podofo/doc.cpp | 100 ++++++++++++++------------- src/calibre/utils/podofo/global.h | 2 + src/calibre/utils/podofo/output.cpp | 2 +- 4 files changed, 69 insertions(+), 62 deletions(-) diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py index ae6e5992eb..aae0c9ba82 100644 --- a/src/calibre/utils/podofo/__init__.py +++ b/src/calibre/utils/podofo/__init__.py @@ -71,7 +71,7 @@ def set_metadata_implementation(pdf_doc, title, authors, bkp, tags, xmp_packet): touched = True try: - tags = prep(', '.join([x.strip() for x in tags if x.strip()])) + tags = prep(', '.join(x.strip() for x in tags if x.strip())) if tags != pdf_doc.keywords: pdf_doc.keywords = tags touched = True @@ -194,12 +194,17 @@ def test_podofo(): # {{{ raw = b"%PDF-1.1\n%\xe2\xe3\xcf\xd3\n1 0 obj<>\nendobj\n2 0 obj<>\nendobj\n3 0 obj<>>>>>>>\nendobj\n4 0 obj<>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n5 0 obj<>\nendobj\n6 0 obj<>\nstream\nx\x9c\xed\x98\xcd\xb2\x930\x14\xc7\xf7}\n&.\x1d\x1ahoGa\x80\x8e\xb6\xe3x\x17ua\xaf\xe3\xd2\t\xc9i\x1b\x0b\x81&a\xc0\xfbj.|$_\xc1\xd0r\xe9\xb7V\x9d\xbb\x83\x15\x9c\x9c\xff\xff\x97\x8fs\xb2 
\x18W9\xa1k\xd0V\x0cK.B\xf4\xf3\xfb\x0fdq\x16\xa2\xcf\xa3\x993\xcb'\xb0\xe2\xef\x1f%\xcc\x1f?<\xd0\xc75\xf5\x18\x1aG\xbd\xa0\xf2\xab4OA\x13\xabJ\x13\xa1\xfc*D\x84e1\xf8\xe6\xbd\x0ec\x14\xf5,+\x90l\xe1\x7f\x9c\xbek\x92\xccW\x88VZ\xe7>\xc6eY\xf6\xcba?\x93K\xecz\x9e\x87\x9d\x01\x1e\x0cl\x93a\xaboB\x93\xca\x16\xea\xc5\xd6\xa3q\x99\x82\xa2\x92\xe7\x9ag\xa2qc\xb45\xcb\x0b\x99l\xad\x18\xc5\x90@\nB+\xec\xf6]\x8c\xacZK\xe2\xac\xd0!j\xec\x8c!\xa3>\xdb\xfb=\x85\x1b\xd2\x9bD\xef#M,\xe15\xd4O\x88X\x86\xa8\xb2\x19,H\x91h\x14\x05x7z`\x81O<\x02|\x99VOBs\x9d\xc0\x7f\xe0\x05\x94\xfa\xd6)\x1c\xb1jx^\xc4\tW+\x90'\x13xK\x96\xf8Hy\x96X\xabU\x11\x7f\x05\xaa\xff\xa4=I\xab\x95T\x02\xd1\xd9)u\x0e\x9b\x0b\xcb\x8e>\x89\xb5\xc8Jqm\x91\x07\xaa-\xee\xc8{\x972=\xdd\xfa+\xe5d\xea\xb9\xad'\xa1\xfa\xdbj\xee\xd3,\xc5\x15\xc9M-9\xa6\x96\xdaD\xce6Wr\xd3\x1c\xdf3S~|\xc1A\xe2MA\x92F{\xb1\x0eM\xba?3\xdd\xc2\x88&S\xa2!\x1a8\xee\x9d\xedx\xb6\xeb=\xb8C\xff\xce\xf1\x87\xaf\xfb\xde\xe0\xd5\xc8\xf3^:#\x7f\xe8\x04\xf8L\xf2\x0fK\xcd%W\xe9\xbey\xea/\xa5\x89`D\xb2m\x17\t\x92\x822\xb7\x02(\x1c\x13\xc5)\x1e\x9c-\x01\xff\x1e\xc0\x16\xd5\xe5\r\xaaG\xcc\x8e\x0c\xff\xca\x8e\x92\x84\xc7\x12&\x93\xd6\xb3\x89\xd8\x10g\xd9\xfai\xe7\xedv\xde6-\x94\xceR\x9bfI\x91\n\x85\x8e}nu9\x91\xcd\xefo\xc6+\x90\x1c\x94\xcd\x05\x83\xea\xca\xd17\x16\xbb\xb6\xfc\xa22\xa9\x9bn\xbe0p\xfd\x88wAs\xc3\x9a+\x19\xb7w\xf2a#=\xdf\xd3A:H\x07\xe9 \x1d\xa4\x83t\x90\x0e\xd2A:H\x07yNH/h\x7f\xd6\x80`!*\xd18\xfa\x05\x94\x80P\xb0\nendstream\nendobj\nxref\n0 7\n0000000000 65535 f \n0000000015 00000 n \n0000000074 00000 n \n0000000148 00000 n \n0000000280 00000 n \n0000000382 00000 n \n0000000522 00000 n \ntrailer\n<<4D028D512DEBEFD964756764AD8FF726>]/Info 5 0 R/Root 1 0 R/Size 7>>\nstartxref\n1199\n%%EOF\n" # noqa # }}} - mi = Metadata('title1', ['author1']) + mi = Metadata('title1', ['xmp_author']) podofo = get_podofo() p = podofo.PDFDoc() p.load(raw) - p.title = mi.title - p.author = mi.authors[0] + p.title = 'info title' + p.author = 'info author' + 
p.keywords = 'a, b' + xmp_packet = metadata_to_xmp_packet(mi) + # print(p.get_xmp_metadata().decode()) + p.set_xmp_metadata(xmp_packet) + # print(p.get_xmp_metadata().decode()) with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f: p.save_to_fileobj(f) f.seek(0) @@ -210,18 +215,12 @@ def test_podofo(): try: p = podofo.PDFDoc() p.open(f.name) - if (p.title, p.author) != (mi.title, mi.authors[0]): + if (p.title, p.author, p.keywords) != ('info title', 'info author', 'a, b'): raise ValueError('podofo failed to set title and author in Info dict {} != {}'.format( - (p.title, p.author), (mi.title, mi.authors[0]))) - xmp_packet = metadata_to_xmp_packet(mi) - xmp_packet = xmp_packet.replace(b'author1', b'changed_author') - p.set_xmp_metadata(xmp_packet) - raw = p.write() - p = podofo.PDFDoc() - p.load(raw) + (p.title, p.author, p.keywords), ('info title', 'info author', 'a, b'))) xmp = p.get_xmp_metadata().decode() - if 'changed_author' not in xmp: - raise ValueError('Failed to set XML block, received:', xmp) + if 'xmp_author' not in xmp: + raise ValueError('Failed to set XML block, received:\n' + xmp) del p finally: os.remove(f.name) diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp index c38dbaa668..1f6afacbe2 100644 --- a/src/calibre/utils/podofo/doc.cpp +++ b/src/calibre/utils/podofo/doc.cpp @@ -83,7 +83,7 @@ PDFDoc_save(PDFDoc *self, PyObject *args) { if (PyArg_ParseTuple(args, "s", &buffer)) { try { - self->doc->Save(buffer); + self->doc->Save(buffer, save_options); } catch(const PdfError & err) { podofo_set_exception(err); return NULL; @@ -139,7 +139,7 @@ PDFDoc_write(PDFDoc *self, PyObject *args) { BytesOutputDevice d; try { - self->doc->Save(d); + self->doc->Save(d, save_options); return d.Release(); } catch(const PdfError &err) { podofo_set_exception(err); @@ -391,7 +391,13 @@ PDFDoc_set_box(PDFDoc *self, PyObject *args) { static PyObject * PDFDoc_get_xmp_metadata(PDFDoc *self, PyObject *args) { try { - auto s = 
self->doc->GetCatalog().GetMetadataStreamValue(); + auto obj = self->doc->GetCatalog().GetDictionary().FindKey("Metadata"); + if (obj == nullptr) Py_RETURN_NONE; + auto stream = obj->GetStream(); + if (stream == nullptr) Py_RETURN_NONE; + std::string s; + StringStreamDevice output(s); + stream->CopyTo(output); return PyBytes_FromStringAndSize(s.data(), s.size()); } catch(const PdfError & err) { podofo_set_exception(err); return NULL; @@ -408,7 +414,10 @@ PDFDoc_set_xmp_metadata(PDFDoc *self, PyObject *args) { Py_ssize_t len = 0; if (!PyArg_ParseTuple(args, "y#", &raw, &len)) return NULL; try { - self->doc->GetCatalog().SetMetadataStreamValue(std::string_view(raw, len)); + auto& metadata = self->doc->GetCatalog().GetOrCreateMetadataObject(); + auto& stream = metadata.GetOrCreateStream(); + stream.SetData(std::string_view(raw, len), true); + metadata.GetDictionary().RemoveKey(PdfName::KeyFilter); } catch(const PdfError & err) { podofo_set_exception(err); return NULL; } catch (...) { @@ -582,97 +591,94 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) { return PyUnicode_FromString(""); } -static inline PyObject* -string_metadata_getter(const nullable& t) { - if (t.has_value()) return podofo_convert_pdfstring(t.value()); - return PyUnicode_FromString(""); +static PdfDictionary& +get_or_create_info(PDFDoc *self) { /* Return the trailer's "Info" dictionary; when the key is missing or is not a dictionary, create a new dictionary object, register it under "Info" and return that. */ + PdfObject *info = self->doc->GetTrailer().GetDictionary().FindKey("Info"); + if (info && info->IsDictionary()) return info->GetDictionary(); + auto& ninfo = self->doc->GetObjects().CreateDictionaryObject(); /* bind by reference: a by-value copy is a function-local object, so the dictionary reference returned below would dangle */ + self->doc->GetTrailer().GetDictionary().AddKeyIndirect("Info", ninfo); + return ninfo.GetDictionary(); } +static inline PyObject* +string_metadata_getter(PDFDoc *self, const std::string_view name) { /* Return the Info entry `name` as a Python str, or "" when the entry is absent or not a string. */ + auto& info = get_or_create_info(self); /* reference, not a copy of the whole dictionary */ + auto obj = info.FindKey(name); + const PdfString* str; + return (obj == nullptr || !obj->TryGetString(str)) ? 
PyUnicode_FromString("") : podofo_convert_pdfstring(*str); +} + + static PyObject * PDFDoc_title_getter(PDFDoc *self, void *closure) { - return string_metadata_getter(self->doc->GetMetadata().GetTitle()); + return string_metadata_getter(self, "Title"); } static PyObject * PDFDoc_author_getter(PDFDoc *self, void *closure) { - return string_metadata_getter(self->doc->GetMetadata().GetAuthor()); + return string_metadata_getter(self, "Author"); } static PyObject * PDFDoc_subject_getter(PDFDoc *self, void *closure) { - return string_metadata_getter(self->doc->GetMetadata().GetSubject()); + return string_metadata_getter(self, "Subject"); } static PyObject * PDFDoc_keywords_getter(PDFDoc *self, void *closure) { - auto kw = self->doc->GetMetadata().GetKeywords(); - pyunique_ptr ans(PyTuple_New(kw.size())); - if (!ans) return NULL; - for (size_t i = 0; i < kw.size(); i++) { - pyunique_ptr t(PyUnicode_FromString(kw[i].c_str())); - if (!t) return NULL; - PyTuple_SET_ITEM(ans.get(), i, t.release()); - } - return ans.release(); + return string_metadata_getter(self, "Keywords"); } static PyObject * PDFDoc_creator_getter(PDFDoc *self, void *closure) { - return string_metadata_getter(self->doc->GetMetadata().GetCreator()); + return string_metadata_getter(self, "Creator"); } static PyObject * PDFDoc_producer_getter(PDFDoc *self, void *closure) { - return string_metadata_getter(self->doc->GetMetadata().GetProducer()); + return string_metadata_getter(self, "Producer"); } +static inline int +string_metadata_setter(PDFDoc *self, const std::string_view name, PyObject *val) { /* Store the Python str `val` under the Info key `name`; an empty string removes the key. Returns 0 on success, -1 with a Python exception set on failure. */ + if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } + Py_ssize_t sz = 0; + const char *raw = PyUnicode_AsUTF8AndSize(val, &sz); if (raw == NULL) return -1; /* UTF-8 conversion failed; CPython has already set the exception */ + auto& info = get_or_create_info(self); /* looked up only after the conversion succeeded, so a failed conversion never creates an Info dictionary */ + if (sz == 0) info.RemoveKey(name); + else info.AddKey(name, PdfString(std::string_view(raw, sz))); + return 0; +} + + static int PDFDoc_title_setter(PDFDoc *self, PyObject *val, void 
*closure) { - if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } - self->doc->GetMetadata().SetTitle(podofo_convert_pystring(val)); - return 0; + return string_metadata_setter(self, "Title", val); } static int PDFDoc_author_setter(PDFDoc *self, PyObject *val, void *closure) { - if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } - self->doc->GetMetadata().SetAuthor(podofo_convert_pystring(val)); - return 0; + return string_metadata_setter(self, "Author", val); } static int PDFDoc_subject_setter(PDFDoc *self, PyObject *val, void *closure) { - if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } - self->doc->GetMetadata().SetSubject(podofo_convert_pystring(val)); - return 0; + return string_metadata_setter(self, "Subject", val); } static int PDFDoc_keywords_setter(PDFDoc *self, PyObject *val, void *closure) { - pyunique_ptr f(PySequence_Fast(val, "Need a sequence to set keywords")); - if (!f) return -1; - std::vector keywords(PySequence_Fast_GET_SIZE(f.get())); - for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(f.get()); i++) { - PyObject *x = PySequence_Fast_GET_ITEM(f.get(), i); - if (!PyUnicode_Check(x)) { PyErr_SetString(PyExc_TypeError, "keywords sequence must contain only unicode objects"); return -1; } - keywords.emplace_back(podofo_convert_pystring(x)); - } - self->doc->GetMetadata().SetKeywords(keywords); - return 0; + return string_metadata_setter(self, "Keywords", val); } static int PDFDoc_creator_setter(PDFDoc *self, PyObject *val, void *closure) { - if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } - self->doc->GetMetadata().SetCreator(podofo_convert_pystring(val)); - return 0; + return string_metadata_setter(self, "Creator", val); } static int PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) { - 
if (!PyUnicode_Check(val)) { PyErr_SetString(PyExc_TypeError, "Must use unicode to set metadata"); return -1; } - self->doc->GetMetadata().SetProducer(podofo_convert_pystring(val)); - return 0; + return string_metadata_setter(self, "Producer", val); } static PyGetSetDef PDFDoc_getsetters[] = { diff --git a/src/calibre/utils/podofo/global.h b/src/calibre/utils/podofo/global.h index 17cf64a8f9..3743f7640b 100644 --- a/src/calibre/utils/podofo/global.h +++ b/src/calibre/utils/podofo/global.h @@ -129,6 +129,8 @@ object_as_reference(const PdfObject *o) { return o->IsReference() ? o->GetReference() : o->GetIndirectReference(); } +// Needed to avoid PoDoFo clobbering the /Info and XMP metadata with its own nonsense +static const PdfSaveOptions save_options = PdfSaveOptions::NoModifyDateUpdate; class PdfReferenceHasher { public: diff --git a/src/calibre/utils/podofo/output.cpp b/src/calibre/utils/podofo/output.cpp index a70c8bc26d..ede83b9d6a 100644 --- a/src/calibre/utils/podofo/output.cpp +++ b/src/calibre/utils/podofo/output.cpp @@ -182,7 +182,7 @@ PyObject* pdf::write_doc(PdfMemDocument *doc, PyObject *f) { MyOutputDevice d(f); try { - doc->Save(d); + doc->Save(d, save_options); d.Flush(); } catch(const PdfError & err) { podofo_set_exception(err); return NULL;