From c24f507cc233db31ac1ff523283ad789a269b110 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 11:27:53 -0600 Subject: [PATCH] IGN:Restore PoDoFo based set pdf metadata functionality --- src/calibre/ebooks/metadata/pdf.py | 88 ++---------------------------- src/calibre/ebooks/pdf/main.cpp | 48 ++++++++++++++++ src/calibre/ebooks/pdf/reflow.cpp | 65 +++++++++++++++++++++- src/calibre/ebooks/pdf/reflow.h | 3 + 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index e11197e4fe..147e3d2504 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -5,8 +5,9 @@ __copyright__ = '2008, Kovid Goyal ' from functools import partial -from calibre import plugins, prints -from calibre.ebooks.metadata import MetaInformation, string_to_authors#, authors_to_string +from calibre import prints +from calibre.constants import plugins +from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string pdfreflow, pdfreflow_error = plugins['pdfreflow'] @@ -44,64 +45,13 @@ def get_metadata(stream, cover=True): return mi - - get_quick_metadata = partial(get_metadata, cover=False) -''' -import sys, os, cStringIO +import cStringIO from threading import Thread -from calibre import StreamReadWrapper -from calibre.ptempfile import TemporaryDirectory -try: - from calibre.utils.PythonMagickWand import \ - NewMagickWand, MagickReadImage, MagickSetImageFormat, \ - MagickWriteImage, ImageMagick - _imagemagick_loaded = True -except: - _imagemagick_loaded = False -from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string from calibre.utils.pdftk import set_metadata as pdftk_set_metadata -from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ - set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick -from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable - -def get_quick_metadata(stream): - try: - return get_metadata_poppler(stream, False) - except NotAvailable: - pass - - return get_metadata_pypdf(stream) - raw = stream.read() - mi = get_metadata_quick(raw) - if mi.title == '_': - mi.title = getattr(stream, 'name', _('Unknown')) - mi.title = mi.title.rpartition('.')[0] - return mi - - -def get_metadata(stream, extract_cover=True): - try: - return get_metadata_poppler(stream, extract_cover) - except NotAvailable: - pass - try: - with TemporaryDirectory('_pdfmeta') as tdir: - cpath = os.path.join(tdir, 'cover.pdf') - if not extract_cover: - cpath = None - mi = podofo_get_metadata(stream, cpath=cpath) - if mi.cover is not None: - cdata = get_cover(mi.cover) - mi.cover = None - if cdata is not None: - mi.cover_data = ('jpg', cdata) - except Unavailable: - mi = get_metadata_pypdf(stream) - return mi - +from calibre.utils.podofo import set_metadata as podofo_set_metadata, Unavailable def set_metadata(stream, mi): stream.seek(0) @@ -116,25 +66,6 @@ def set_metadata(stream, mi): set_metadata_pypdf(stream, mi) -def get_metadata_pypdf(stream): - """ Return metadata as a L{MetaInfo} object """ - from pyPdf import PdfFileReader - mi = MetaInformation(_('Unknown'), [_('Unknown')]) - try: - with StreamReadWrapper(stream) as stream: - info = PdfFileReader(stream).getDocumentInfo() - if info.title: - mi.title = info.title - if info.author: - mi.author = info.author - mi.authors = string_to_authors(info.author) - if info.subject: - mi.category = info.subject - except Exception, err: - msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err)) - print >>sys.stderr, msg.encode('utf8') - return mi - class MetadataWriter(Thread): def __init__(self, out_pdf, buf): @@ -178,13 +109,4 @@ def set_metadata_pypdf(stream, mi): stream.write(out_str.read()) stream.seek(0) -def get_cover(cover_path): - with ImageMagick(): - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - return open('%s.jpg' % cover_path, 'rb').read() -''' - diff --git a/src/calibre/ebooks/pdf/main.cpp b/src/calibre/ebooks/pdf/main.cpp index 358f344c09..96bb5ed853 100644 --- a/src/calibre/ebooks/pdf/main.cpp +++ b/src/calibre/ebooks/pdf/main.cpp @@ -79,6 +79,50 @@ extern "C" { return ans; } + static PyObject * + pdfreflow_set_metadata(PyObject *self, PyObject *args) { + char *pdfdata; + Py_ssize_t size; + PyObject *info; + + if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info)) + return NULL; + + if (!PyDict_Check(info)) { + PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary."); + return NULL; + } + + char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords"; + char *keys[3] = { Title, Author, Keywords }; + map pinfo; + PyObject *val = NULL, *utf8 = NULL; + + for (int i = 0; i < 3; i++) { + val = PyDict_GetItemString(info, keys[i]); + if (!val || !PyUnicode_Check(val)) continue; + utf8 = PyUnicode_AsUTF8String(val); + if (!utf8) continue; + pinfo[keys[i]] = PyString_AS_STRING(utf8); + } + + PyObject *ans = NULL; + try { + Reflow reflow(pdfdata, static_cast(size)); + if (reflow.is_locked()) { + PyErr_SetString(PyExc_ValueError, "Setting metadata not possible in encrypeted PDFs"); + return NULL; + } + string result = reflow.set_info(pinfo); + ans = PyString_FromStringAndSize(result.c_str(), result.size()); + } catch (std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, + "Unknown exception raised while getting metadata from PDF"); return NULL; + } + return ans; + } static PyMethodDef pdfreflow_methods[] = { @@ -90,6 +134,10 @@ extern "C" { "get_metadata(pdf_data, cover)\n\n" "Get metadata and (optionally) cover from the specified PDF." }, + {"set_metadata", pdfreflow_set_metadata, METH_VARARGS, + "get_metadata(info_dict)\n\n" + "Set metadata in the specified PDF. Currently broken." + }, {NULL, NULL, 0, NULL} }; diff --git a/src/calibre/ebooks/pdf/reflow.cpp b/src/calibre/ebooks/pdf/reflow.cpp index 0181194ea2..a494887bca 100644 --- a/src/calibre/ebooks/pdf/reflow.cpp +++ b/src/calibre/ebooks/pdf/reflow.cpp @@ -680,6 +680,16 @@ void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, colorMap, interpolate, maskColors, inlineImg); } +static char stream_pdf[15] = "stream.pdf"; + +class MemInStream : public MemStream { + public: + MemInStream(char *buf, size_t st, size_t sz, Object *obj) : + MemStream(buf, st, sz, obj) {} + ~MemInStream() {} + GooString *getFileName() { return new GooString(stream_pdf); } +}; + Reflow::Reflow(char *pdfdata, size_t sz) : pdfdata(pdfdata), current_font_size(-1), doc(NULL) { @@ -690,7 +700,7 @@ Reflow::Reflow(char *pdfdata, size_t sz) : if (!globalParams) throw ReflowException("Failed to allocate Globalparams"); } - MemStream *str = new MemStream(pdfdata, 0, sz, &obj); + MemInStream *str = new MemInStream(pdfdata, 0, sz, &obj); this->doc = new PDFDoc(str, NULL, NULL); if (!this->doc->isOk()) { @@ -909,3 +919,56 @@ char* Reflow::render_first_page(size_t *data_size, } return buffer; } + +class MemOutStream : public OutStream { + private: + ostringstream out; + + public: + MemOutStream() :OutStream() {} + ~MemOutStream() {} + void close() {} + int getPos() { return out.tellp(); } + void put(char c) { out.put(c); } + void printf (const char *format, ...) { + vector buf; + size_t written = strlen(format)*5; + va_list ap; + do { + buf.reserve(written + 20); + va_start(ap, format); + written = vsnprintf(&buf[0], buf.capacity(), format, ap); + va_end(ap); + } while (written >= buf.capacity()); + out.write(&buf[0], written); + } +}; + +string Reflow::set_info(map sinfo) { + XRef *xref = this->doc->getXRef(); + if (!xref) throw ReflowException("No XRef table"); + Object *trailer_dict = xref->getTrailerDict(); + if (!trailer_dict || !trailer_dict->isDict()) throw ReflowException("No trailer dictionary"); + Object tmp; + char INFO[5] = "Info"; + Object *info = trailer_dict->dictLookup(INFO, &tmp); + if (!info) { + info = new Object(); + info->initDict(xref); + } + if (!info->isDict()) throw ReflowException("Invalid info object"); + + for (map::iterator it = sinfo.begin(); it != sinfo.end(); it++) { + Object *tmp = new Object(); + tmp->initString(new GooString((*it).second)); + info->dictSet((*it).first, tmp); + } + + trailer_dict->dictSet(INFO, info); + char out[20] = "/t/out.pdf"; + this->doc->saveAs(new GooString(out), writeForceRewrite); + string ans; + return ans; +} + + diff --git a/src/calibre/ebooks/pdf/reflow.h b/src/calibre/ebooks/pdf/reflow.h index 2a672c6661..cf17cd15ae 100644 --- a/src/calibre/ebooks/pdf/reflow.h +++ b/src/calibre/ebooks/pdf/reflow.h @@ -74,6 +74,9 @@ class Reflow { /* Dump the PDF outline as the file outline.xml in the current directory */ void dump_outline(); + + /* Set the info dictionary. Currently broken. */ + string set_info(map info); }; class XMLString {