More podofo refactoring

2026-06-05 21:45:19 -04:00 · 2012-08-26 12:17:04 +05:30
parent 1136f26186
commit 238bc24cf2
4 changed files with 139 additions and 216 deletions
@@ -148,7 +148,6 @@ extensions = [
                    libraries=['podofo'],
                    lib_dirs=[podofo_lib],
                    inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)],
-                    optional=True,
                    error=podofo_error),

    Extension('pictureflow',
@@ -43,12 +43,6 @@ PARALLEL_FUNCS = {
      'read_metadata' :
      ('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),

-      'read_pdf_metadata' :
-      ('calibre.utils.podofo.__init__', 'get_metadata_', None),
-
-      'write_pdf_metadata' :
-      ('calibre.utils.podofo.__init__', 'set_metadata_', None),
-
      'save_book' :
      ('calibre.ebooks.metadata.worker', 'save_book', 'notification'),

@@ -6,109 +6,12 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import os, time, shutil
+import os, shutil

 from calibre.constants import plugins, preferred_encoding
-from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
-    authors_to_string
-from calibre.utils.ipc.job import ParallelJob
-from calibre.utils.ipc.server import Server
-from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
-from calibre import prints
-
-podofo, podofo_err = plugins['podofo']
-
-class Unavailable(Exception): pass
-
-def get_metadata(stream, cpath=None):
-    if not podofo:
-        raise Unavailable(podofo_err)
-    pt = PersistentTemporaryFile('_podofo.pdf')
-    pt.write(stream.read())
-    pt.close()
-    server = Server(pool_size=1)
-    job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
-        lambda x,y:x,  args=[pt.name, cpath])
-    server.add_job(job)
-    while not job.is_finished:
-        time.sleep(0.1)
-        job.update()
-
-    job.update()
-    server.close()
-    if job.result is None:
-        raise ValueError('Failed to read metadata: ' + job.details)
-    title, authors, creator, tags, ok = job.result
-    if not ok:
-        print 'Failed to extract cover:'
-        print job.details
-    if title == '_':
-        title = getattr(stream, 'name', _('Unknown'))
-        title = os.path.splitext(title)[0]
-
-    mi = MetaInformation(title, authors)
-    if creator:
-        mi.book_producer = creator
-    if tags:
-        mi.tags = tags
-    if os.path.exists(pt.name): os.remove(pt.name)
-    if ok:
-        mi.cover = cpath
-    return mi
-
-def get_metadata_quick(raw):
-    p = podofo.PDFDoc()
-    p.load(raw)
-    title = p.title
-    if not title:
-        title = '_'
-    author = p.author
-    authors = string_to_authors(author) if author else  [_('Unknown')]
-    creator = p.creator
-    try:
-        tags = [x.strip() for x in p.keywords.split(u',')]
-        tags = [x for x in tags if x]
-    except:
-        tags = []
-
-    mi = MetaInformation(title, authors)
-    if creator:
-        mi.book_producer = creator
-    if tags:
-        mi.tags = tags
-    return mi
-
-def get_metadata_(path, cpath=None):
-    p = podofo.PDFDoc()
-    p.open(path)
-    title = p.title
-    if not title:
-        title = '_'
-    author = p.author
-    authors = string_to_authors(author) if author else  [_('Unknown')]
-    creator = p.creator
-    try:
-        tags = [x.strip() for x in p.keywords.split(u',')]
-        tags = [x for x in tags if x]
-    except:
-        tags = []
-    ok = True
-    try:
-        if cpath is not None:
-            pages = p.pages
-            if pages < 1:
-                raise ValueError('PDF has no pages')
-            if True or pages == 1:
-                shutil.copyfile(path, cpath)
-            else:
-                p.extract_first_page()
-                p.save(cpath)
-    except:
-        import traceback
-        traceback.print_exc()
-        ok = False
-
-    return (title, authors, creator, tags, ok)
+from calibre.ebooks.metadata import authors_to_string
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.ipc.simple_worker import fork_job, WorkerError

 def prep(val):
    if not val:
@@ -118,27 +21,16 @@ def prep(val):
    return val.strip()

 def set_metadata(stream, mi):
-    if not podofo:
-        raise Unavailable(podofo_err)
-    with TemporaryFile('_podofo_read.pdf') as inputf, \
-            TemporaryFile('_podofo_write.pdf') as outputf:
-        server = Server(pool_size=1)
-        with open(inputf, 'wb') as f:
+    with TemporaryDirectory(u'_podofo_set_metadata') as tdir:
+        with open(os.path.join(tdir, u'input.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
-        job = ParallelJob('write_pdf_metadata', 'Write pdf metadata',
-            lambda x,y:x,  args=[inputf, outputf, mi.title, mi.authors,
-                mi.book_producer, mi.tags])
-        server.add_job(job)
-        while not job.is_finished:
-            time.sleep(0.1)
-            job.update()
-
-        job.update()
-        server.close()
-        if job.failed:
-            prints(job.details)
-        elif job.result:
-            with open(outputf, 'rb') as f:
+        try:
+            touched = fork_job('calibre.utils.podofo', 'set_metadata_', (tdir,
+                mi.title, mi.authors, mi.book_producer, mi.tags))
+        except WorkerError as e:
+            raise Exception('Failed to set PDF metadata: %s'%e.orig_tb)
+        if touched:
+            with open(os.path.join(tdir, u'output.pdf'), 'rb') as f:
                f.seek(0, 2)
                if f.tell() > 100:
                    f.seek(0)
@@ -148,10 +40,14 @@ def set_metadata(stream, mi):
                    stream.flush()
    stream.seek(0)

+def set_metadata_(tdir, title, authors, bkp, tags):
+    podofo, podofo_err = plugins['podofo']
+    if podofo is None:
+        raise RuntimeError('Failed to load podofo: %s'%podofo_err)

-def set_metadata_(path, opath, title, authors, bkp, tags):
+    os.chdir(tdir)
    p = podofo.PDFDoc()
-    p.open(path)
+    p.open(u'input.pdf')
    title = prep(title)
    touched = False
    if title and title != p.title:
@@ -177,27 +73,32 @@ def set_metadata_(path, opath, title, authors, bkp, tags):
        pass

    if touched:
-        p.save(opath)
-        return True
-    return False
+        p.save(u'output.pdf')
+
+    return touched

 def delete_all_but(path, pages):
    ''' Delete all the pages in the pdf except for the specified ones. Negative
-    numbers are counted from the end of the PDF.'''
-    with TemporaryFile('_podofo_in.pdf') as of:
-        shutil.copyfile(path, of)
+    numbers are counted from the end of the PDF. '''
+    podofo, podofo_err = plugins['podofo']  # plugin entry is a (module, error message) pair
+    if podofo is None:  # the extension failed to load; report why instead of failing later on attribute access
+        raise RuntimeError('Failed to load podofo: %s'%podofo_err)

-        p = podofo.PDFDoc()
-        p.open(of)
-        total = p.page_count()
-        pages = { total + x if x < 0 else x for x in pages }
-        for page in xrange(total-1, -1, -1):
-            if page not in pages:
-                p.delete_page(page)
-        os.remove(path)
-        p.save(path)
+    p = podofo.PDFDoc()
+    with open(path, 'rb') as f:  # read the whole PDF into memory; no temporary copy on disk is made any more
+        raw = f.read()
+    p.load(raw)  # parse the document from the in-memory byte string
+    total = p.page_count()
+    pages = { total + x if x < 0 else x for x in pages }  # convert negative indices to absolute ones; set gives fast membership tests
+    for page in xrange(total-1, -1, -1):  # walk from the last page to the first so a deletion never shifts an index still to be visited
+        if page not in pages:
+            p.delete_page(page)
+
+    raw = p.write()  # serialize the modified document to a byte string before the original file is touched
+    with open(path, 'wb') as f:  # overwrite the original file in place
+        f.write(raw)

 if __name__ == '__main__':
-    f = '/tmp/t.pdf'
+    f = u'/tmp/t.pdf'  # hard-coded scratch file used for manual testing only
    delete_all_but(f, [0, 1, -2, -1])  # keep pages 0, 1 and the last two (negative indices count from the end)

@@ -57,8 +57,8 @@ PDFDoc_open(PDFDoc *self, PyObject *args, PyObject *kwargs) {
        } catch(const PdfError & err) {
            podofo_set_exception(err);
            return NULL;
-    }
-} else return NULL;
+        }
+    } else return NULL;


    Py_RETURN_NONE;
@@ -77,11 +77,72 @@ PDFDoc_save(PDFDoc *self, PyObject *args, PyObject *kwargs) {
        }
    } else return NULL;

-
-    Py_INCREF(Py_None);
-    return Py_None;
+    Py_RETURN_NONE;
 }

+static PyObject *
+PDFDoc_write(PDFDoc *self, PyObject *args, PyObject *kwargs) {  // write(): serialize the document in memory and return it as a bytes object; args/kwargs are not read
+    PyObject *ans;
+    PdfRefCountedBuffer buffer(1*1024*1024);  // constructed with 1 MiB; presumably an initial capacity that grows on demand -- TODO confirm against PoDoFo docs
+    PdfOutputDevice out(&buffer);  // output device that writes into `buffer` rather than to a file
+    
+    try {
+        self->doc->Write(&out);
+    } catch(const PdfError &err) {
+        podofo_set_exception(err);  // surface the PoDoFo error as a Python exception
+        return NULL;
+    }
+
+    ans = PyBytes_FromStringAndSize(buffer.GetBuffer(), out.Tell());  // copy out.Tell() bytes (presumably the number written so far) into a new bytes object
+    if (ans == NULL) PyErr_NoMemory();  // creation failed: make sure a MemoryError is set before returning NULL
+    return ans;
+}
+
+static PyObject *
+PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {  // extract_first_page(): trim the document in place until only one page is left; args/kwargs are not read
+    try {
+        while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);  // keep deleting the page at index 1 (the one after the first) until a single page remains
+    } catch(const PdfError & err) {
+        podofo_set_exception(err);  // surface the PoDoFo error as a Python exception
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {  // page_count(): return the number of pages as a Python integer; args/kwargs are not read
+    int count;
+    try {
+        count = self->doc->GetPageCount();
+    } catch(const PdfError & err) {
+        podofo_set_exception(err);  // surface the PoDoFo error as a Python exception
+        return NULL;
+    }
+    return Py_BuildValue("i", count);  // "i" builds a Python integer from the C int
+}
+
+static PyObject *
+PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {  // delete_page(page_num): remove a single page from the document; returns None
+    int num = 0;
+    if (PyArg_ParseTuple(args, "i", &num)) {  // expects exactly one integer argument: the page index
+        try {
+            self->doc->DeletePages(num, 1);  // delete a run of one page starting at index num; NOTE(review): num is not range-checked here, so out-of-range handling is left to PoDoFo -- confirm
+        } catch(const PdfError & err) {
+            podofo_set_exception(err);  // surface the PoDoFo error as a Python exception
+            return NULL;
+        }
+    } else return NULL;  // argument parsing failed; PyArg_ParseTuple has already set the exception
+
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+PDFDoc_append(PDFDoc *self, PyObject *args, PyObject *kwargs) {
+    // append(doc) is registered in the method table but has no implementation
+    // yet. Returning None here would make callers believe the pages had been
+    // appended, so fail loudly instead until the real implementation lands.
+    PyErr_SetString(PyExc_NotImplementedError,
+            "PDFDoc.append() is not implemented yet");
+    return NULL;
+}
+
+// Properties {{{
+
 static PyObject *
 PDFDoc_pages_getter(PDFDoc *self, void *closure) {
    int pages = self->doc->GetPageCount();
@@ -123,46 +184,6 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) {
 }


- 
-static PyObject *
-PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
-    try {
-        while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
-    } catch(const PdfError & err) {
-        podofo_set_exception(err);
-        return NULL;
-    }
-    Py_RETURN_NONE;
-}
-
-static PyObject *
-PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
-    int count;
-    try {
-        count = self->doc->GetPageCount();
-    } catch(const PdfError & err) {
-        podofo_set_exception(err);
-        return NULL;
-    }
-    return Py_BuildValue("i", count);
-}
-
-static PyObject *
-PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
-    int num = 0;
-    if (PyArg_ParseTuple(args, "i", &num)) {
-        try {
-            self->doc->DeletePages(num, 1);
-        } catch(const PdfError & err) {
-            podofo_set_exception(err);
-            return NULL;
-        }
-    } else return NULL;
-
-    Py_RETURN_NONE;
-}
-
-
 static PyObject *
 PDFDoc_getter(PDFDoc *self, int field)
 {
@@ -288,31 +309,6 @@ PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) {
    return  PDFDoc_setter(self, val, 5);
 }

-
-static PyMethodDef PDFDoc_methods[] = {
-    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
-     "Load a PDF document from a byte buffer (string)"
-    },
-    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
-     "Load a PDF document from a file path (string)"
-    },
-    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
-     "Save the PDF document to a path on disk"
-    },
-    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
-     "extract_first_page() -> Remove all but the first page."
-    },
-    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
-     "page_count() -> Number of pages in the PDF."
-    },
-    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
-     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
-    },
-
-
-    {NULL}  /* Sentinel */
-};
-
 static PyGetSetDef PDFDoc_getsetters[] = {
    {(char *)"title", 
     (getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter,
@@ -350,6 +346,39 @@ static PyGetSetDef PDFDoc_getsetters[] = {
    {NULL}  /* Sentinel */
 };

+
+// }}}
+
+static PyMethodDef PDFDoc_methods[] = {  // Python-visible methods of PDFDoc; each entry is {name, C function, calling-convention flags, docstring}
+    {"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
+     "Load a PDF document from a byte buffer (string)"
+    },
+    {"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
+     "Load a PDF document from a file path (string)"
+    },
+    {"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
+     "Save the PDF document to a path on disk"
+    },
+    {"write", (PyCFunction)PDFDoc_write, METH_VARARGS,
+     "Return the PDF document as a bytestring."
+    },
+    {"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
+     "extract_first_page() -> Remove all but the first page."
+    },
+    {"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
+     "page_count() -> Number of pages in the PDF."
+    },
+    {"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
+     "delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
+    },
+    {"append", (PyCFunction)PDFDoc_append, METH_VARARGS,
+     "append(doc) -> Append doc (which must be a PDFDoc) to this document."
+    },
+
+
+    {NULL}  /* Sentinel */
+};
+
 PyTypeObject pdf::PDFDocType = {
    PyObject_HEAD_INIT(NULL)
    0,                         /*ob_size*/