IGN:Restore PoDoFo based set pdf metadata functionality

This commit is contained in:
Kovid Goyal 2009-09-22 11:27:53 -06:00
parent 4be28fb1fa
commit c24f507cc2
4 changed files with 120 additions and 84 deletions

View File

@ -5,8 +5,9 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
from functools import partial from functools import partial
from calibre import plugins, prints from calibre import prints
from calibre.ebooks.metadata import MetaInformation, string_to_authors#, authors_to_string from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
pdfreflow, pdfreflow_error = plugins['pdfreflow'] pdfreflow, pdfreflow_error = plugins['pdfreflow']
@ -44,64 +45,13 @@ def get_metadata(stream, cover=True):
return mi return mi
get_quick_metadata = partial(get_metadata, cover=False) get_quick_metadata = partial(get_metadata, cover=False)
''' import cStringIO
import sys, os, cStringIO
from threading import Thread from threading import Thread
from calibre import StreamReadWrapper
from calibre.ptempfile import TemporaryDirectory
try:
from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True
except:
_imagemagick_loaded = False
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ from calibre.utils.podofo import set_metadata as podofo_set_metadata, Unavailable
set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable
def get_quick_metadata(stream):
    '''
    Read metadata from the PDF in ``stream`` without asking for a cover.

    The poppler based reader is tried first (with cover extraction turned
    off). If it raises ``NotAvailable`` the pyPdf based reader is used
    instead.

    :param stream: the PDF to read; it is passed unchanged to the readers.
    :return: whatever the reader that handled the request returns.
    '''
    try:
        return get_metadata_poppler(stream, False)
    except NotAvailable:
        pass
    # The statements that used to follow this return (reading the raw stream
    # and calling get_metadata_quick) could never execute and were removed.
    return get_metadata_pypdf(stream)
def get_metadata(stream, extract_cover=True):
    '''
    Read metadata (and optionally a cover) from the PDF in ``stream``.

    Readers are tried in order: poppler, then PoDoFo, then pyPdf. A reader
    is skipped when it signals that it is unavailable (``NotAvailable`` for
    poppler, ``Unavailable`` for PoDoFo).

    :param stream: the PDF to read.
    :param extract_cover: when false, no cover path is handed to PoDoFo and
        poppler is called with cover extraction disabled.
    :return: the metadata object produced by the first reader that worked.
    '''
    try:
        return get_metadata_poppler(stream, extract_cover)
    except NotAvailable:
        pass

    try:
        with TemporaryDirectory('_pdfmeta') as workdir:
            cover_pdf = os.path.join(workdir, 'cover.pdf')
            if not extract_cover:
                cover_pdf = None
            mi = podofo_get_metadata(stream, cpath=cover_pdf)
            if mi.cover is not None:
                # Convert while the temporary directory still exists.
                jpeg_bytes = get_cover(mi.cover)
                mi.cover = None
                if jpeg_bytes is not None:
                    mi.cover_data = ('jpg', jpeg_bytes)
    except Unavailable:
        mi = get_metadata_pypdf(stream)
    return mi
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
@ -116,25 +66,6 @@ def set_metadata(stream, mi):
set_metadata_pypdf(stream, mi) set_metadata_pypdf(stream, mi)
def get_metadata_pypdf(stream):
""" Return metadata as a L{MetaInfo} object """
from pyPdf import PdfFileReader
mi = MetaInformation(_('Unknown'), [_('Unknown')])
try:
with StreamReadWrapper(stream) as stream:
info = PdfFileReader(stream).getDocumentInfo()
if info.title:
mi.title = info.title
if info.author:
mi.author = info.author
mi.authors = string_to_authors(info.author)
if info.subject:
mi.category = info.subject
except Exception, err:
msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread): class MetadataWriter(Thread):
def __init__(self, out_pdf, buf): def __init__(self, out_pdf, buf):
@ -178,13 +109,4 @@ def set_metadata_pypdf(stream, mi):
stream.write(out_str.read()) stream.write(out_str.read())
stream.seek(0) stream.seek(0)
def get_cover(cover_path):
    '''
    Convert the image at ``cover_path`` to JPEG and return the JPEG bytes.

    The converted image is written next to the input as
    ``<cover_path>.jpg`` and then read back.

    :param cover_path: path of the image file to convert.
    :return: the raw contents of the generated JPEG file.
    '''
    jpeg_path = '%s.jpg' % cover_path
    with ImageMagick():
        wand = NewMagickWand()
        MagickReadImage(wand, cover_path)
        MagickSetImageFormat(wand, 'JPEG')
        MagickWriteImage(wand, jpeg_path)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(jpeg_path, 'rb') as f:
        return f.read()
'''

View File

@ -79,6 +79,50 @@ extern "C" {
return ans; return ans;
} }
/*
 * set_metadata(pdf_data, info_dict)
 *
 * Python entry point. Parses a byte string holding the PDF and a dictionary,
 * copies the unicode values stored under "Title", "Author" and "Keywords"
 * (UTF-8 encoded) into a map and passes that map to Reflow::set_info().
 *
 * Returns a new Python string built from the result of set_info(), or NULL
 * with a Python exception set:
 *   ValueError   - info is not a dict, or the PDF is encrypted
 *   RuntimeError - a C++ exception was raised while processing the PDF
 * Keys that are missing, not unicode, or that cannot be encoded are skipped.
 */
static PyObject *
pdfreflow_set_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    PyObject *info;

    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info))
        return NULL;

    if (!PyDict_Check(info)) {
        PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary.");
        return NULL;
    }

    char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords";
    char *keys[3] = { Title, Author, Keywords };
    map<char *, char *> pinfo;
    /* The map stores pointers into these objects' buffers, so they must stay
     * alive until set_info() has returned; they are released below. */
    PyObject *encoded[3] = { NULL, NULL, NULL };

    for (int i = 0; i < 3; i++) {
        /* Borrowed reference, no DECREF needed. */
        PyObject *val = PyDict_GetItemString(info, keys[i]);
        if (!val || !PyUnicode_Check(val)) continue;
        encoded[i] = PyUnicode_AsUTF8String(val);
        if (!encoded[i]) {
            /* Skip this key, but do not leave an exception pending while
             * the function goes on to return a value. */
            PyErr_Clear();
            continue;
        }
        pinfo[keys[i]] = PyString_AS_STRING(encoded[i]);
    }

    PyObject *ans = NULL;
    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        if (reflow.is_locked()) {
            PyErr_SetString(PyExc_ValueError, "Setting metadata not possible in encrypted PDFs");
        } else {
            string result = reflow.set_info(pinfo);
            ans = PyString_FromStringAndSize(result.c_str(), result.size());
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while setting metadata in PDF");
    }

    /* Single exit point: release the UTF-8 copies on success and on error. */
    for (int i = 0; i < 3; i++) Py_XDECREF(encoded[i]);
    return ans;
}
static static
PyMethodDef pdfreflow_methods[] = { PyMethodDef pdfreflow_methods[] = {
@ -90,6 +134,10 @@ extern "C" {
"get_metadata(pdf_data, cover)\n\n" "get_metadata(pdf_data, cover)\n\n"
"Get metadata and (optionally) cover from the specified PDF." "Get metadata and (optionally) cover from the specified PDF."
}, },
/* Method table entry for pdfreflow_set_metadata. */
{"set_metadata", pdfreflow_set_metadata, METH_VARARGS,
    "set_metadata(pdf_data, info_dict)\n\n"
    "Set metadata in the specified PDF. Currently broken."
},
{NULL, NULL, 0, NULL} {NULL, NULL, 0, NULL}
}; };

View File

@ -680,6 +680,16 @@ void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
colorMap, interpolate, maskColors, inlineImg); colorMap, interpolate, maskColors, inlineImg);
} }
static char stream_pdf[15] = "stream.pdf";
class MemInStream : public MemStream {
public:
MemInStream(char *buf, size_t st, size_t sz, Object *obj) :
MemStream(buf, st, sz, obj) {}
~MemInStream() {}
GooString *getFileName() { return new GooString(stream_pdf); }
};
Reflow::Reflow(char *pdfdata, size_t sz) : Reflow::Reflow(char *pdfdata, size_t sz) :
pdfdata(pdfdata), current_font_size(-1), doc(NULL) pdfdata(pdfdata), current_font_size(-1), doc(NULL)
{ {
@ -690,7 +700,7 @@ Reflow::Reflow(char *pdfdata, size_t sz) :
if (!globalParams) if (!globalParams)
throw ReflowException("Failed to allocate Globalparams"); throw ReflowException("Failed to allocate Globalparams");
} }
MemStream *str = new MemStream(pdfdata, 0, sz, &obj); MemInStream *str = new MemInStream(pdfdata, 0, sz, &obj);
this->doc = new PDFDoc(str, NULL, NULL); this->doc = new PDFDoc(str, NULL, NULL);
if (!this->doc->isOk()) { if (!this->doc->isOk()) {
@ -909,3 +919,56 @@ char* Reflow::render_first_page(size_t *data_size,
} }
return buffer; return buffer;
} }
/*
 * An OutStream that accumulates everything written to it in an in-memory
 * string stream instead of a file.
 */
class MemOutStream : public OutStream {
    private:
        ostringstream out;

    public:
        MemOutStream() :OutStream() {}
        ~MemOutStream() {}
        /* Nothing to release: the data lives in the string stream. */
        void close() {}
        /* Current write position in the buffer. */
        int getPos() { return out.tellp(); }
        void put(char c) { out.put(c); }

        /*
         * Format into a temporary buffer and append the result.
         *
         * The buffer is sized with resize() so that vsnprintf writes into
         * elements that exist (reserve() alone leaves the vector empty).
         * The result of vsnprintf is kept as an int so that a negative
         * return is not mistaken for a huge length.
         */
        void printf (const char *format, ...) {
            const size_t max_size = 16 * 1024 * 1024;
            vector<char> buf(strlen(format)*5 + 20);
            for (;;) {
                va_list ap;
                va_start(ap, format);
                int written = vsnprintf(&buf[0], buf.size(), format, ap);
                va_end(ap);
                if (written < 0) {
                    /* Either an encoding error or a C library that reports
                     * truncation as a negative value: retry with a larger
                     * buffer, and give up once it becomes unreasonable. */
                    if (buf.size() >= max_size) return;
                    buf.resize(buf.size() * 2);
                    continue;
                }
                if (static_cast<size_t>(written) < buf.size()) {
                    out.write(&buf[0], written);
                    return;
                }
                /* Truncated: the needed size is now known (+1 for the NUL). */
                buf.resize(static_cast<size_t>(written) + 1);
            }
        }
};
/*
 * Store the key/value pairs in sinfo as strings in the document's Info
 * dictionary (creating the dictionary when the trailer has none) and save
 * the document.
 *
 * Currently broken: the document is written to a hard-coded path and the
 * returned string is always empty.
 *
 * Throws ReflowException when there is no XRef table or trailer dictionary,
 * when the existing Info entry is not a dictionary, or when saving fails.
 */
string Reflow::set_info(map<char *, char *> sinfo) {
    XRef *xref = this->doc->getXRef();
    if (!xref) throw ReflowException("No XRef table");
    Object *trailer_dict = xref->getTrailerDict();
    if (!trailer_dict || !trailer_dict->isDict()) throw ReflowException("No trailer dictionary");

    char INFO[5] = "Info";
    Object info;
    /* dictLookup() fills the Object it is given, so a missing entry shows up
     * as a null Object rather than a NULL pointer. */
    trailer_dict->dictLookup(INFO, &info);
    if (info.isNull()) info.initDict(xref);
    if (!info.isDict()) {
        info.free();
        throw ReflowException("Invalid info object");
    }

    for (map<char *, char *>::iterator it = sinfo.begin(); it != sinfo.end(); ++it) {
        /* A stack Object is enough here and avoids leaking a heap wrapper.
         * NOTE(review): assumes dictSet() copies the Object it is given --
         * confirm against the poppler version in use. */
        Object val;
        val.initString(new GooString(it->second));
        info.dictSet(it->first, &val);
    }
    trailer_dict->dictSet(INFO, &info);

    char out[20] = "/t/out.pdf";
    GooString out_path(out);
    /* NOTE(review): assumes saveAs() returns 0 on success -- confirm. */
    if (this->doc->saveAs(&out_path, writeForceRewrite) != 0)
        throw ReflowException("Failed to save PDF with updated info dictionary");

    string ans;
    return ans;
}

View File

@ -74,6 +74,9 @@ class Reflow {
/* Dump the PDF outline as the file outline.xml in the current directory */ /* Dump the PDF outline as the file outline.xml in the current directory */
void dump_outline(); void dump_outline();
/* Set the info dictionary. Currently broken. */
string set_info(map<char *, char *> info);
}; };
class XMLString { class XMLString {