More podofo refactoring

This commit is contained in:
Kovid Goyal 2012-08-26 12:17:04 +05:30
parent 1136f26186
commit 238bc24cf2
4 changed files with 139 additions and 216 deletions

View File

@ -148,7 +148,6 @@ extensions = [
libraries=['podofo'], libraries=['podofo'],
lib_dirs=[podofo_lib], lib_dirs=[podofo_lib],
inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)], inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)],
optional=True,
error=podofo_error), error=podofo_error),
Extension('pictureflow', Extension('pictureflow',

View File

@ -43,12 +43,6 @@ PARALLEL_FUNCS = {
'read_metadata' : 'read_metadata' :
('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'), ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
'read_pdf_metadata' :
('calibre.utils.podofo.__init__', 'get_metadata_', None),
'write_pdf_metadata' :
('calibre.utils.podofo.__init__', 'set_metadata_', None),
'save_book' : 'save_book' :
('calibre.ebooks.metadata.worker', 'save_book', 'notification'), ('calibre.ebooks.metadata.worker', 'save_book', 'notification'),

View File

@ -6,109 +6,12 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, time, shutil import os, shutil
from calibre.constants import plugins, preferred_encoding from calibre.constants import plugins, preferred_encoding
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \ from calibre.ebooks.metadata import authors_to_string
authors_to_string from calibre.ptempfile import TemporaryDirectory
from calibre.utils.ipc.job import ParallelJob from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.utils.ipc.server import Server
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
from calibre import prints
podofo, podofo_err = plugins['podofo']
class Unavailable(Exception): pass
def get_metadata(stream, cpath=None):
    '''
    Read metadata from the PDF in ``stream`` by running a
    ``read_pdf_metadata`` job on a single-worker job server.

    :param stream: File-like object whose remaining contents are the PDF.
        Its ``name`` attribute, when present, is used to derive a title if
        the PDF has none.
    :param cpath: Optional path passed to the job as the location to write
        the cover to. It is stored as ``mi.cover`` only when the job
        reports that cover extraction succeeded.
    :return: A MetaInformation object.
    :raises Unavailable: If the podofo plugin could not be loaded.
    :raises ValueError: If the job returned no result.
    '''
    if not podofo:
        raise Unavailable(podofo_err)
    pt = PersistentTemporaryFile('_podofo.pdf')
    try:
        try:
            pt.write(stream.read())
        finally:
            pt.close()

        server = Server(pool_size=1)
        try:
            job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
                    lambda x, y: x, args=[pt.name, cpath])
            server.add_job(job)
            while not job.is_finished:
                time.sleep(0.1)
                job.update()
            job.update()
        finally:
            # Close the server even if polling the job raised.
            server.close()

        if job.result is None:
            raise ValueError('Failed to read metadata: ' + job.details)
        title, authors, creator, tags, ok = job.result
        if not ok:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2.
            print('Failed to extract cover:')
            print(job.details)
        if title == '_':
            # '_' is the placeholder the job returns for a missing title;
            # fall back to the stream's file name without its extension.
            title = getattr(stream, 'name', _('Unknown'))
            title = os.path.splitext(title)[0]
        mi = MetaInformation(title, authors)
        if creator:
            mi.book_producer = creator
        if tags:
            mi.tags = tags
        if ok:
            mi.cover = cpath
        return mi
    finally:
        # Always remove the temporary copy, including on the error paths
        # above (previously it was leaked when an exception was raised).
        if os.path.exists(pt.name):
            os.remove(pt.name)
def get_metadata_quick(raw):
    '''
    Read metadata from in-memory PDF data, in the current process.

    :param raw: The PDF data, passed to ``PDFDoc.load()``.
    :return: A MetaInformation object. The title is ``'_'`` when the PDF
        has no title, and the authors default to ``[_('Unknown')]`` when
        the PDF has no author.
    '''
    p = podofo.PDFDoc()
    p.load(raw)
    title = p.title or '_'
    author = p.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = p.creator
    try:
        # Keywords are treated as a comma separated list of tags.
        tags = [x.strip() for x in p.keywords.split(u',')]
        tags = [x for x in tags if x]
    except Exception:
        # Best effort: missing or unreadable keywords mean no tags. Only
        # Exception is caught so KeyboardInterrupt/SystemExit propagate.
        tags = []

    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator
    if tags:
        mi.tags = tags
    return mi
def get_metadata_(path, cpath=None):
    '''
    Read metadata from the PDF file at ``path``.

    :param path: Path to the PDF file, opened with ``PDFDoc.open()``.
    :param cpath: Optional destination path for the cover. When given, the
        whole PDF is copied there.
    :return: The tuple ``(title, authors, creator, tags, ok)``. ``title``
        is ``'_'`` when the PDF has no title. ``ok`` is False when writing
        the cover failed, in which case the traceback is printed.
    '''
    p = podofo.PDFDoc()
    p.open(path)
    title = p.title or '_'
    author = p.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = p.creator
    try:
        # Keywords are treated as a comma separated list of tags.
        tags = [x.strip() for x in p.keywords.split(u',')]
        tags = [x for x in tags if x]
    except Exception:
        # Best effort: missing or unreadable keywords mean no tags.
        tags = []

    ok = True
    if cpath is not None:
        try:
            if p.pages < 1:
                raise ValueError('PDF has no pages')
            # The complete PDF is copied as the cover. The alternative of
            # extracting only the first page was unreachable (guarded by
            # ``if True or ...``) and has been dropped.
            shutil.copyfile(path, cpath)
        except Exception:
            import traceback
            traceback.print_exc()
            ok = False
    return (title, authors, creator, tags, ok)
def prep(val): def prep(val):
if not val: if not val:
@ -118,27 +21,16 @@ def prep(val):
return val.strip() return val.strip()
def set_metadata(stream, mi): def set_metadata(stream, mi):
if not podofo: with TemporaryDirectory(u'_podofo_set_metadata') as tdir:
raise Unavailable(podofo_err) with open(os.path.join(tdir, u'input.pdf'), 'wb') as f:
with TemporaryFile('_podofo_read.pdf') as inputf, \
TemporaryFile('_podofo_write.pdf') as outputf:
server = Server(pool_size=1)
with open(inputf, 'wb') as f:
shutil.copyfileobj(stream, f) shutil.copyfileobj(stream, f)
job = ParallelJob('write_pdf_metadata', 'Write pdf metadata', try:
lambda x,y:x, args=[inputf, outputf, mi.title, mi.authors, touched = fork_job('calibre.utils.podofo', 'set_metadata_', (tdir,
mi.book_producer, mi.tags]) mi.title, mi.authors, mi.book_producer, mi.tags))
server.add_job(job) except WorkerError as e:
while not job.is_finished: raise Exception('Failed to set PDF metadata: %s'%e.orig_tb)
time.sleep(0.1) if touched:
job.update() with open(os.path.join(tdir, u'output.pdf'), 'rb') as f:
job.update()
server.close()
if job.failed:
prints(job.details)
elif job.result:
with open(outputf, 'rb') as f:
f.seek(0, 2) f.seek(0, 2)
if f.tell() > 100: if f.tell() > 100:
f.seek(0) f.seek(0)
@ -148,10 +40,14 @@ def set_metadata(stream, mi):
stream.flush() stream.flush()
stream.seek(0) stream.seek(0)
def set_metadata_(tdir, title, authors, bkp, tags):
podofo, podofo_err = plugins['podofo']
if podofo is None:
raise RuntimeError('Failed to load podofo: %s'%podofo_err)
def set_metadata_(path, opath, title, authors, bkp, tags): os.chdir(tdir)
p = podofo.PDFDoc() p = podofo.PDFDoc()
p.open(path) p.open(u'input.pdf')
title = prep(title) title = prep(title)
touched = False touched = False
if title and title != p.title: if title and title != p.title:
@ -177,27 +73,32 @@ def set_metadata_(path, opath, title, authors, bkp, tags):
pass pass
if touched: if touched:
p.save(opath) p.save(u'output.pdf')
return True
return False return touched
def delete_all_but(path, pages): def delete_all_but(path, pages):
''' Delete all the pages in the pdf except for the specified ones. Negative ''' Delete all the pages in the pdf except for the specified ones. Negative
numbers are counted from the end of the PDF. ''' numbers are counted from the end of the PDF. '''
with TemporaryFile('_podofo_in.pdf') as of: podofo, podofo_err = plugins['podofo']
shutil.copyfile(path, of) if podofo is None:
raise RuntimeError('Failed to load podofo: %s'%podofo_err)
p = podofo.PDFDoc() p = podofo.PDFDoc()
p.open(of) with open(path, 'rb') as f:
raw = f.read()
p.load(raw)
total = p.page_count() total = p.page_count()
pages = { total + x if x < 0 else x for x in pages } pages = { total + x if x < 0 else x for x in pages }
for page in xrange(total-1, -1, -1): for page in xrange(total-1, -1, -1):
if page not in pages: if page not in pages:
p.delete_page(page) p.delete_page(page)
os.remove(path)
p.save(path) raw = p.write()
with open(path, 'wb') as f:
f.write(raw)
if __name__ == '__main__': if __name__ == '__main__':
f = '/tmp/t.pdf' f = u'/tmp/t.pdf'
delete_all_but(f, [0, 1, -2, -1]) delete_all_but(f, [0, 1, -2, -1])

View File

@ -77,11 +77,72 @@ PDFDoc_save(PDFDoc *self, PyObject *args, PyObject *kwargs) {
} }
} else return NULL; } else return NULL;
Py_RETURN_NONE;
Py_INCREF(Py_None);
return Py_None;
} }
// write() -> Serialize the document into memory and return it to Python as
// a bytestring. Returns NULL with a Python exception set on failure.
static PyObject *
PDFDoc_write(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    PyObject *ans = NULL;

    try {
        // The buffer and output device are created inside the try block so
        // that an exception thrown while constructing them is translated
        // into a Python exception instead of escaping into the interpreter.
        PdfRefCountedBuffer buffer(1*1024*1024);
        PdfOutputDevice out(&buffer);
        self->doc->Write(&out);
        // The bytes object must be created while the buffer is still alive.
        // On failure PyBytes_FromStringAndSize returns NULL with the
        // exception already set, so nothing more is needed here.
        ans = PyBytes_FromStringAndSize(buffer.GetBuffer(), out.Tell());
    } catch(const PdfError &err) {
        podofo_set_exception(err);
        return NULL;
    } catch(...) {
        // Never let a foreign C++ exception unwind through Python frames.
        PyErr_SetString(PyExc_RuntimeError,
                "An unknown error occurred while writing the PDF document");
        return NULL;
    }

    return ans;
}
// extract_first_page() -> Keep deleting the page at index 1 until the
// document has at most one page left. Returns None.
static PyObject *
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    try {
        for (;;) {
            if (self->doc->GetPageCount() <= 1) break;
            self->doc->GetPagesTree()->DeletePage(1);
        }
    } catch(const PdfError &pdf_err) {
        // Translate the podofo error into a Python exception.
        podofo_set_exception(pdf_err);
        return NULL;
    }

    Py_RETURN_NONE;
}
// page_count() -> Return the number of pages in the document as a Python
// integer, or NULL with an exception set if podofo reports an error.
static PyObject *
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    int num_pages;

    try {
        num_pages = self->doc->GetPageCount();
    } catch(const PdfError &pdf_err) {
        podofo_set_exception(pdf_err);
        return NULL;
    }

    return Py_BuildValue("i", num_pages);
}
// delete_page(page_num) -> Delete the page with the given number from the
// document. Returns None, or NULL with an exception set on failure.
static PyObject *
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    int page_num = 0;

    // Bail out early if the argument is not a single integer; the parser
    // has already set the Python exception.
    if (!PyArg_ParseTuple(args, "i", &page_num)) return NULL;

    try {
        self->doc->DeletePages(page_num, 1);
    } catch(const PdfError &pdf_err) {
        podofo_set_exception(pdf_err);
        return NULL;
    }

    Py_RETURN_NONE;
}
// append(doc) -> Placeholder: appending is not implemented. Raise
// NotImplementedError rather than returning None, so that callers cannot
// mistake the no-op for a successful append.
static PyObject *
PDFDoc_append(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    PyErr_SetString(PyExc_NotImplementedError,
            "PDFDoc.append() is not implemented");
    return NULL;
}
// Properties {{{
static PyObject * static PyObject *
PDFDoc_pages_getter(PDFDoc *self, void *closure) { PDFDoc_pages_getter(PDFDoc *self, void *closure) {
int pages = self->doc->GetPageCount(); int pages = self->doc->GetPageCount();
@ -123,46 +184,6 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) {
} }
// extract_first_page() -> Keep deleting the page at index 1 until the
// document has at most one page left. Returns None.
static PyObject *
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    try {
        for (;;) {
            if (self->doc->GetPageCount() <= 1) break;
            self->doc->GetPagesTree()->DeletePage(1);
        }
    } catch(const PdfError &pdf_err) {
        // Translate the podofo error into a Python exception.
        podofo_set_exception(pdf_err);
        return NULL;
    }

    Py_RETURN_NONE;
}
// page_count() -> Return the number of pages in the document as a Python
// integer, or NULL with an exception set if podofo reports an error.
static PyObject *
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    int num_pages;

    try {
        num_pages = self->doc->GetPageCount();
    } catch(const PdfError &pdf_err) {
        podofo_set_exception(pdf_err);
        return NULL;
    }

    return Py_BuildValue("i", num_pages);
}
// delete_page(page_num) -> Delete the page with the given number from the
// document. Returns None, or NULL with an exception set on failure.
static PyObject *
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
    int page_num = 0;

    // Bail out early if the argument is not a single integer; the parser
    // has already set the Python exception.
    if (!PyArg_ParseTuple(args, "i", &page_num)) return NULL;

    try {
        self->doc->DeletePages(page_num, 1);
    } catch(const PdfError &pdf_err) {
        podofo_set_exception(pdf_err);
        return NULL;
    }

    Py_RETURN_NONE;
}
static PyObject * static PyObject *
PDFDoc_getter(PDFDoc *self, int field) PDFDoc_getter(PDFDoc *self, int field)
{ {
@ -288,31 +309,6 @@ PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) {
return PDFDoc_setter(self, val, 5); return PDFDoc_setter(self, val, 5);
} }
// Methods exposed on the Python PDFDoc type. Every entry is registered
// with METH_VARARGS; the table ends with an all-zero sentinel entry.
static PyMethodDef PDFDoc_methods[] = {
    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
     "Load a PDF document from a byte buffer (string)"},

    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
     "Load a PDF document from a file path (string)"},

    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
     "Save the PDF document to a path on disk"},

    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
     "extract_first_page() -> Remove all but the first page."},

    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
     "page_count() -> Number of pages in the PDF."},

    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."},

    {NULL, NULL, 0, NULL}  /* Sentinel */
};
static PyGetSetDef PDFDoc_getsetters[] = { static PyGetSetDef PDFDoc_getsetters[] = {
{(char *)"title", {(char *)"title",
(getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter, (getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter,
@ -350,6 +346,39 @@ static PyGetSetDef PDFDoc_getsetters[] = {
{NULL} /* Sentinel */ {NULL} /* Sentinel */
}; };
// }}}
// Methods exposed on the Python PDFDoc type. Every entry is registered
// with METH_VARARGS; the table ends with an all-zero sentinel entry.
static PyMethodDef PDFDoc_methods[] = {
    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
     "Load a PDF document from a byte buffer (string)"},

    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
     "Load a PDF document from a file path (string)"},

    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
     "Save the PDF document to a path on disk"},

    {"write", (PyCFunction)PDFDoc_write, METH_VARARGS,
     "Return the PDF document as a bytestring."},

    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
     "extract_first_page() -> Remove all but the first page."},

    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
     "page_count() -> Number of pages in the PDF."},

    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."},

    {"append", (PyCFunction)PDFDoc_append, METH_VARARGS,
     "append(doc) -> Append doc (which must be a PDFDoc) to this document."},

    {NULL, NULL, 0, NULL}  /* Sentinel */
};
PyTypeObject pdf::PDFDocType = { PyTypeObject pdf::PDFDocType = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, /*ob_size*/ 0, /*ob_size*/