Move the PDF metadata reader and writer into a separate process, since PoDoFo crashes the Python interpreter alarmingly often

This commit is contained in:
Kovid Goyal 2009-05-24 18:30:28 -07:00
parent 5b4d17c783
commit ac1e73174a
3 changed files with 86 additions and 19 deletions

View File

@ -30,6 +30,13 @@ PARALLEL_FUNCS = {
'read_metadata' : 'read_metadata' :
('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'), ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
'read_pdf_metadata' :
('calibre.utils.podofo.__init__', 'get_metadata_', None),
'write_pdf_metadata' :
('calibre.utils.podofo.__init__', 'set_metadata_', None),
'save_book' : 'save_book' :
('calibre.ebooks.metadata.worker', 'save_book', 'notification'), ('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
} }

View File

@ -6,11 +6,14 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os import os, time
from calibre.constants import plugins, preferred_encoding from calibre.constants import plugins, preferred_encoding
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \ from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
authors_to_string authors_to_string
from calibre.utils.ipc.job import ParallelJob
from calibre.utils.ipc.server import Server
from calibre.ptempfile import PersistentTemporaryFile
podofo, podofo_err = plugins['podofo'] podofo, podofo_err = plugins['podofo']
@ -19,21 +22,40 @@ class Unavailable(Exception): pass
def get_metadata(stream):
    """Read title, authors and producer from a PDF using a worker process.

    The contents of ``stream`` are copied to a temporary file and the
    ``read_pdf_metadata`` job is run in a one-worker ``Server`` so that a
    PoDoFo crash takes down the worker rather than this interpreter.

    :param stream: File-like object; it is read from its current position
        and is not rewound afterwards.
    :return: A ``MetaInformation`` built from the job result, with
        ``book_producer`` set when the PDF reports a creator.
    :raises Unavailable: If the podofo plugin could not be loaded.
    :raises ValueError: If the job finished without a result.
    """
    if not podofo:
        raise Unavailable(podofo_err)

    pt = PersistentTemporaryFile('_podofo.pdf')
    try:
        try:
            pt.write(stream.read())
        finally:
            pt.close()

        server = Server(pool_size=1)
        try:
            # The third argument is presumably the job's "done" callback;
            # it is a no-op here because we poll the job ourselves.
            job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
                    lambda x, y: x, args=[pt.name])
            server.add_job(job)
            while not job.is_finished:
                time.sleep(0.1)
                job.update()
            job.update()
        finally:
            # Always shut the worker down, even if polling raised.
            server.close()
    finally:
        # Remove the temporary copy on every path, including failures,
        # so crashed jobs do not leave PDFs behind.
        if os.path.exists(pt.name):
            os.remove(pt.name)

    if job.result is None:
        raise ValueError('Failed to read metadata: PoDoFo crashed')

    title, authors, creator = job.result
    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator
    return mi
def get_metadata_(path):
    """Worker-side half of ``get_metadata``: read metadata from a PDF file.

    :param path: Path of the PDF file to open with PoDoFo.
    :return: A ``(title, authors, creator)`` tuple. ``authors`` is a list;
        ``creator`` is whatever ``PDFDoc.creator`` reports and may be empty.
    """
    p = podofo.PDFDoc()
    p.open(path)

    title = p.title
    if not title:
        # No stream object exists in the worker, so derive the fallback
        # title from the file name of ``path``. The previous code read an
        # undefined ``stream`` here and raised NameError.
        title = os.path.splitext(os.path.basename(path))[0]

    author = p.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = p.creator
    return (title, authors, creator)
def prep(val): def prep(val):
if not val: if not val:
@ -45,21 +67,43 @@ def prep(val):
def set_metadata(stream, mi):
    """Write title, authors and producer into a PDF using a worker process.

    The contents of ``stream`` are copied to a temporary file and the
    ``write_pdf_metadata`` job is run in a one-worker ``Server``. When the
    job returns data, ``stream`` is truncated and overwritten with it and
    then rewound to the start.

    :param stream: Readable and writable file-like object holding the PDF.
    :param mi: Metadata object; ``title``, ``authors`` and
        ``book_producer`` are passed to the worker.
    :raises Unavailable: If the podofo plugin could not be loaded.

    NOTE(review): a ``None`` job result leaves ``stream`` untouched without
    raising. That presumably covers both "nothing to change" and a worker
    crash; confirm against the worker function.
    """
    if not podofo:
        raise Unavailable(podofo_err)

    pt = PersistentTemporaryFile('_podofo.pdf')
    try:
        try:
            pt.write(stream.read())
        finally:
            pt.close()

        server = Server(pool_size=1)
        try:
            job = ParallelJob('write_pdf_metadata', 'Write pdf metadata',
                    lambda x, y: x,
                    args=[pt.name, mi.title, mi.authors, mi.book_producer])
            server.add_job(job)
            while not job.is_finished:
                time.sleep(0.1)
                job.update()
            job.update()
        finally:
            # Always shut the worker down, even if polling raised.
            server.close()
    finally:
        # The temporary copy was previously never deleted; remove it on
        # every path.
        if os.path.exists(pt.name):
            os.remove(pt.name)

    if job.result is not None:
        stream.seek(0)
        stream.truncate()
        stream.write(job.result)
        stream.flush()
        stream.seek(0)
def set_metadata_(path, title, authors, bkp):
p = podofo.PDFDoc() p = podofo.PDFDoc()
p.load(raw) p.open(path)
title = prep(mi.title) title = prep(title)
touched = False touched = False
if title: if title:
p.title = title p.title = title
touched = True touched = True
author = prep(authors_to_string(mi.authors)) author = prep(authors_to_string(authors))
if author: if author:
p.author = author p.author = author
touched = True touched = True
bkp = prep(mi.book_producer) bkp = prep(bkp)
if bkp: if bkp:
p.creator = bkp p.creator = bkp
touched = True touched = True
@ -68,12 +112,7 @@ def set_metadata(stream, mi):
from calibre.ptempfile import TemporaryFile from calibre.ptempfile import TemporaryFile
with TemporaryFile('_pdf_set_metadata.pdf') as f: with TemporaryFile('_pdf_set_metadata.pdf') as f:
p.save(f) p.save(f)
raw = open(f, 'rb').read() return open(f, 'rb').read()
stream.seek(0)
stream.truncate()
stream.write(raw)
stream.flush()
stream.seek(0)
if __name__ == '__main__': if __name__ == '__main__':
f = '/tmp/t.pdf' f = '/tmp/t.pdf'

View File

@ -64,6 +64,24 @@ podofo_PDFDoc_load(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
return Py_None; return Py_None;
} }
/*
 * PDFDoc.open(path): load a PDF document from a file on disk.
 *
 * Expects a single string argument. Returns None on success; on failure
 * returns NULL with a Python exception set, either by argument parsing or
 * by podofo_set_exception() for a PdfError thrown by Load().
 */
static PyObject *
podofo_PDFDoc_open(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    char *path = NULL;

    /* PyArg_ParseTuple has already set the exception when it fails. */
    if (!PyArg_ParseTuple(args, "s", &path))
        return NULL;

    try {
        self->doc->Load(path);
    } catch (const PdfError &err) {
        podofo_set_exception(err);
        return NULL;
    }

    Py_RETURN_NONE;
}
static PyObject * static PyObject *
podofo_PDFDoc_save(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { podofo_PDFDoc_save(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
char *buffer; char *buffer;
@ -232,6 +250,9 @@ static PyMethodDef podofo_PDFDoc_methods[] = {
{"load", (PyCFunction)podofo_PDFDoc_load, METH_VARARGS, {"load", (PyCFunction)podofo_PDFDoc_load, METH_VARARGS,
"Load a PDF document from a byte buffer (string)" "Load a PDF document from a byte buffer (string)"
}, },
{"open", (PyCFunction)podofo_PDFDoc_open, METH_VARARGS,
"Load a PDF document from a file path (string)"
},
{"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS, {"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
"Save the PDF document to a path on disk" "Save the PDF document to a path on disk"
}, },