From 238bc24cf21f08830d5ccbec2cb464a9929ca8af Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 26 Aug 2012 12:17:04 +0530 Subject: [PATCH] More podofo refactoring --- setup/extensions.py | 1 - src/calibre/utils/ipc/worker.py | 6 - src/calibre/utils/podofo/__init__.py | 179 ++++++--------------------- src/calibre/utils/podofo/doc.cpp | 169 ++++++++++++++----------- 4 files changed, 139 insertions(+), 216 deletions(-) diff --git a/setup/extensions.py b/setup/extensions.py index a7b01bcd19..d6052125e5 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -148,7 +148,6 @@ extensions = [ libraries=['podofo'], lib_dirs=[podofo_lib], inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)], - optional=True, error=podofo_error), Extension('pictureflow', diff --git a/src/calibre/utils/ipc/worker.py b/src/calibre/utils/ipc/worker.py index 08374400ac..7d13a76cf0 100644 --- a/src/calibre/utils/ipc/worker.py +++ b/src/calibre/utils/ipc/worker.py @@ -43,12 +43,6 @@ PARALLEL_FUNCS = { 'read_metadata' : ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'), - 'read_pdf_metadata' : - ('calibre.utils.podofo.__init__', 'get_metadata_', None), - - 'write_pdf_metadata' : - ('calibre.utils.podofo.__init__', 'set_metadata_', None), - 'save_book' : ('calibre.ebooks.metadata.worker', 'save_book', 'notification'), diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py index 948962f438..232b6536af 100644 --- a/src/calibre/utils/podofo/__init__.py +++ b/src/calibre/utils/podofo/__init__.py @@ -6,109 +6,12 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, time, shutil +import os, shutil from calibre.constants import plugins, preferred_encoding -from calibre.ebooks.metadata import MetaInformation, string_to_authors, \ - authors_to_string -from calibre.utils.ipc.job import ParallelJob -from calibre.utils.ipc.server import Server -from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile -from calibre import prints - -podofo, podofo_err = plugins['podofo'] - -class Unavailable(Exception): pass - -def get_metadata(stream, cpath=None): - if not podofo: - raise Unavailable(podofo_err) - pt = PersistentTemporaryFile('_podofo.pdf') - pt.write(stream.read()) - pt.close() - server = Server(pool_size=1) - job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', - lambda x,y:x, args=[pt.name, cpath]) - server.add_job(job) - while not job.is_finished: - time.sleep(0.1) - job.update() - - job.update() - server.close() - if job.result is None: - raise ValueError('Failed to read metadata: ' + job.details) - title, authors, creator, tags, ok = job.result - if not ok: - print 'Failed to extract cover:' - print job.details - if title == '_': - title = getattr(stream, 'name', _('Unknown')) - title = os.path.splitext(title)[0] - - mi = MetaInformation(title, authors) - if creator: - mi.book_producer = creator - if tags: - mi.tags = tags - if os.path.exists(pt.name): os.remove(pt.name) - if ok: - mi.cover = cpath - return mi - -def get_metadata_quick(raw): - p = podofo.PDFDoc() - p.load(raw) - title = p.title - if not title: - title = '_' - author = p.author - authors = string_to_authors(author) if author else [_('Unknown')] - creator = p.creator - try: - tags = [x.strip() for x in p.keywords.split(u',')] - tags = [x for x in tags if x] - except: - tags = [] - - mi = MetaInformation(title, authors) - if creator: - mi.book_producer = creator - if tags: - mi.tags = tags - return mi - -def get_metadata_(path, cpath=None): - p = podofo.PDFDoc() - p.open(path) - title = p.title - if not title: - title = '_' - author = p.author - authors = string_to_authors(author) if author else [_('Unknown')] - creator = p.creator - try: - tags = [x.strip() for x in p.keywords.split(u',')] - tags = [x for x in tags if x] - except: - tags = [] - ok = True - try: - if cpath is not None: - pages = p.pages - if pages < 1: - raise ValueError('PDF has no pages') - if True or pages == 1: - shutil.copyfile(path, cpath) - else: - p.extract_first_page() - p.save(cpath) - except: - import traceback - traceback.print_exc() - ok = False - - return (title, authors, creator, tags, ok) +from calibre.ebooks.metadata import authors_to_string +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.ipc.simple_worker import fork_job, WorkerError def prep(val): if not val: @@ -118,27 +21,16 @@ def prep(val): return val.strip() def set_metadata(stream, mi): - if not podofo: - raise Unavailable(podofo_err) - with TemporaryFile('_podofo_read.pdf') as inputf, \ - TemporaryFile('_podofo_write.pdf') as outputf: - server = Server(pool_size=1) - with open(inputf, 'wb') as f: + with TemporaryDirectory(u'_podofo_set_metadata') as tdir: + with open(os.path.join(tdir, u'input.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) - job = ParallelJob('write_pdf_metadata', 'Write pdf metadata', - lambda x,y:x, args=[inputf, outputf, mi.title, mi.authors, - mi.book_producer, mi.tags]) - server.add_job(job) - while not job.is_finished: - time.sleep(0.1) - job.update() - - job.update() - server.close() - if job.failed: - prints(job.details) - elif job.result: - with open(outputf, 'rb') as f: + try: + touched = fork_job('calibre.utils.podofo', 'set_metadata_', (tdir, + mi.title, mi.authors, mi.book_producer, mi.tags)) + except WorkerError as e: + raise Exception('Failed to set PDF metadata: %s'%e.orig_tb) + if touched: + with open(os.path.join(tdir, u'output.pdf'), 'rb') as f: f.seek(0, 2) if f.tell() > 100: f.seek(0) @@ -148,10 +40,14 @@ def set_metadata(stream, mi): stream.flush() stream.seek(0) +def set_metadata_(tdir, title, authors, bkp, tags): + podofo, podofo_err = plugins['podofo'] + if podofo is None: + raise RuntimeError('Failed to load podofo: %s'%podofo_err) -def set_metadata_(path, opath, title, authors, bkp, tags): + os.chdir(tdir) p = podofo.PDFDoc() - p.open(path) + p.open(u'input.pdf') title = prep(title) touched = False if title and title != p.title: @@ -177,27 +73,32 @@ def set_metadata_(path, opath, title, authors, bkp, tags): pass if touched: - p.save(opath) - return True - return False + p.save(u'output.pdf') + + return touched def delete_all_but(path, pages): ''' Delete all the pages in the pdf except for the specified ones. Negative - numbers are counted from the end of the PDF.''' - with TemporaryFile('_podofo_in.pdf') as of: - shutil.copyfile(path, of) + numbers are counted from the end of the PDF. ''' + podofo, podofo_err = plugins['podofo'] + if podofo is None: + raise RuntimeError('Failed to load podofo: %s'%podofo_err) - p = podofo.PDFDoc() - p.open(of) - total = p.page_count() - pages = { total + x if x < 0 else x for x in pages } - for page in xrange(total-1, -1, -1): - if page not in pages: - p.delete_page(page) - os.remove(path) - p.save(path) + p = podofo.PDFDoc() + with open(path, 'rb') as f: + raw = f.read() + p.load(raw) + total = p.page_count() + pages = { total + x if x < 0 else x for x in pages } + for page in xrange(total-1, -1, -1): + if page not in pages: + p.delete_page(page) + + raw = p.write() + with open(path, 'wb') as f: + f.write(raw) if __name__ == '__main__': - f = '/tmp/t.pdf' + f = u'/tmp/t.pdf' delete_all_but(f, [0, 1, -2, -1]) diff --git a/src/calibre/utils/podofo/doc.cpp b/src/calibre/utils/podofo/doc.cpp index d591d6fc65..bf3135c7ee 100644 --- a/src/calibre/utils/podofo/doc.cpp +++ b/src/calibre/utils/podofo/doc.cpp @@ -57,8 +57,8 @@ PDFDoc_open(PDFDoc *self, PyObject *args, PyObject *kwargs) { } catch(const PdfError & err) { podofo_set_exception(err); return NULL; - } -} else return NULL; + } + } else return NULL; Py_RETURN_NONE; @@ -77,11 +77,72 @@ PDFDoc_save(PDFDoc *self, PyObject *args, PyObject *kwargs) { } } else return NULL; - - Py_INCREF(Py_None); - return Py_None; + Py_RETURN_NONE; } +static PyObject * +PDFDoc_write(PDFDoc *self, PyObject *args, PyObject *kwargs) { + PyObject *ans; + PdfRefCountedBuffer buffer(1*1024*1024); + PdfOutputDevice out(&buffer); + + try { + self->doc->Write(&out); + } catch(const PdfError &err) { + podofo_set_exception(err); + return NULL; + } + + ans = PyBytes_FromStringAndSize(buffer.GetBuffer(), out.Tell()); + if (ans == NULL) PyErr_NoMemory(); + return ans; +} + +static PyObject * +PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) { + try { + while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1); + } catch(const PdfError & err) { + podofo_set_exception(err); + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject * +PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) { + int count; + try { + count = self->doc->GetPageCount(); + } catch(const PdfError & err) { + podofo_set_exception(err); + return NULL; + } + return Py_BuildValue("i", count); +} + +static PyObject * +PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) { + int num = 0; + if (PyArg_ParseTuple(args, "i", &num)) { + try { + self->doc->DeletePages(num, 1); + } catch(const PdfError & err) { + podofo_set_exception(err); + return NULL; + } + } else return NULL; + + Py_RETURN_NONE; +} + +static PyObject * +PDFDoc_append(PDFDoc *self, PyObject *args, PyObject *kwargs) { + Py_RETURN_NONE; +} + +// Properties {{{ + static PyObject * PDFDoc_pages_getter(PDFDoc *self, void *closure) { int pages = self->doc->GetPageCount(); @@ -123,46 +184,6 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) { } - -static PyObject * -PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) { - try { - while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1); - } catch(const PdfError & err) { - podofo_set_exception(err); - return NULL; - } - Py_RETURN_NONE; -} - -static PyObject * -PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) { - int count; - try { - count = self->doc->GetPageCount(); - } catch(const PdfError & err) { - podofo_set_exception(err); - return NULL; - } - return Py_BuildValue("i", count); -} - -static PyObject * -PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) { - int num = 0; - if (PyArg_ParseTuple(args, "i", &num)) { - try { - self->doc->DeletePages(num, 1); - } catch(const PdfError & err) { - podofo_set_exception(err); - return NULL; - } - } else return NULL; - - Py_RETURN_NONE; -} - - static PyObject * PDFDoc_getter(PDFDoc *self, int field) { @@ -288,31 +309,6 @@ PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) { return PDFDoc_setter(self, val, 5); } - -static PyMethodDef PDFDoc_methods[] = { - {"load", (PyCFunction)PDFDoc_load, METH_VARARGS, - "Load a PDF document from a byte buffer (string)" - }, - {"open", (PyCFunction)PDFDoc_open, METH_VARARGS, - "Load a PDF document from a file path (string)" - }, - {"save", (PyCFunction)PDFDoc_save, METH_VARARGS, - "Save the PDF document to a path on disk" - }, - {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS, - "extract_first_page() -> Remove all but the first page." - }, - {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS, - "page_count() -> Number of pages in the PDF." - }, - {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS, - "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)." - }, - - - {NULL} /* Sentinel */ -}; - static PyGetSetDef PDFDoc_getsetters[] = { {(char *)"title", (getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter, @@ -350,6 +346,39 @@ static PyGetSetDef PDFDoc_getsetters[] = { {NULL} /* Sentinel */ }; + +// }}} + +static PyMethodDef PDFDoc_methods[] = { + {"load", (PyCFunction)PDFDoc_load, METH_VARARGS, + "Load a PDF document from a byte buffer (string)" + }, + {"open", (PyCFunction)PDFDoc_open, METH_VARARGS, + "Load a PDF document from a file path (string)" + }, + {"save", (PyCFunction)PDFDoc_save, METH_VARARGS, + "Save the PDF document to a path on disk" + }, + {"write", (PyCFunction)PDFDoc_write, METH_VARARGS, + "Return the PDF document as a bytestring." + }, + {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS, + "extract_first_page() -> Remove all but the first page." + }, + {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS, + "page_count() -> Number of pages in the PDF." + }, + {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS, + "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)." + }, + {"append", (PyCFunction)PDFDoc_append, METH_VARARGS, + "append(doc) -> Append doc (which must be a PDFDoc) to this document." + }, + + + {NULL} /* Sentinel */ +}; + PyTypeObject pdf::PDFDocType = { PyObject_HEAD_INIT(NULL) 0, /*ob_size*/