From 496e0dccd085cac8b84364b1a6dc04187b0232cb Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 27 May 2009 09:43:22 -0700
Subject: [PATCH] Use podofo to get PDF cover image

---
 src/calibre/ebooks/metadata/pdf.py   | 44 ++++++++++------------
 src/calibre/utils/podofo/__init__.py | 54 ++++++++++++++++++++++++++-
 src/calibre/utils/podofo/podofo.cpp  | 63 ++++++++++++++++++++++++-------
 3 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py
index 0ffa0fb5a3..1fca98fc4c 100644
--- a/src/calibre/ebooks/metadata/pdf.py
+++ b/src/calibre/ebooks/metadata/pdf.py
@@ -18,7 +18,7 @@ except:
 from calibre.ebooks.metadata import MetaInformation, authors_to_string
 from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
 from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
-    set_metadata as podofo_set_metadata, Unavailable
+    set_metadata as podofo_set_metadata, Unavailable, write_first_page
 
 
 def get_metadata(stream, extract_cover=True):
@@ -119,29 +119,25 @@ def set_metadata_pypdf(stream, mi):
     stream.seek(0)
 
 def get_cover(stream):
-    from pyPdf import PdfFileReader, PdfFileWriter
+    '''
+    Return the first page of the PDF in `stream` rendered as JPEG data.
+
+    The first page is extracted with podofo into a temporary directory and
+    converted to JPEG with ImageMagick. Unlike the pyPdf based code this
+    replaces, errors are not caught here and propagate to the caller.
+    '''
+    stream.seek(0)
+    with TemporaryDirectory('_pdfmeta') as tdir:
+        cover_path = os.path.join(tdir, 'cover.pdf')
+        write_first_page(stream, cover_path)
+        with ImageMagick():
+            wand = NewMagickWand()
+            MagickReadImage(wand, cover_path)
+            MagickSetImageFormat(wand, 'JPEG')
+            MagickWriteImage(wand, '%s.jpg' % cover_path)
+            # Close the file before the temporary directory is cleaned up
+            with open('%s.jpg' % cover_path, 'rb') as f:
+                return f.read()
 
-    try:
-        with StreamReadWrapper(stream) as stream:
-            pdf = PdfFileReader(stream)
-            output = PdfFileWriter()
-            if len(pdf.pages) >= 1:
-                output.addPage(pdf.getPage(0))
-            with TemporaryDirectory('_pdfmeta') as tdir:
-                cover_path = os.path.join(tdir, 'cover.pdf')
-
-                with open(cover_path, "wb") as outputStream:
-                    output.write(outputStream)
-                with ImageMagick():
-                    wand = NewMagickWand()
-                    MagickReadImage(wand, cover_path)
-                    MagickSetImageFormat(wand, 'JPEG')
-                    MagickWriteImage(wand, '%s.jpg' % cover_path)
-                    return open('%s.jpg' % cover_path, 'rb').read()
-    except:
-        import traceback
-        traceback.print_exc()
-
-    return ''
 
 
diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py
index 8654a95c04..7a5cd9bf87 100644
--- a/src/calibre/utils/podofo/__init__.py
+++ b/src/calibre/utils/podofo/__init__.py
@@ -6,7 +6,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import os, time
+import os, time, shutil
 
 from calibre.constants import plugins, preferred_encoding
 from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
@@ -19,6 +19,58 @@ podofo, podofo_err = plugins['podofo']
 
 class Unavailable(Exception): pass
 
+def write_first_page(stream, opath):
+    '''
+    Write the first page of the PDF read from `stream` to the file `opath`.
+
+    The PDF data is copied to a temporary file and the extraction is run as
+    a `write_pdf_first_page` job on a `Server` created with pool_size=1.
+
+    Raises Unavailable if the podofo plugin is not available and ValueError
+    if the finished job has no result.
+    '''
+    if not podofo:
+        raise Unavailable(podofo_err)
+    pt = PersistentTemporaryFile('_podofo.pdf')
+    pt.write(stream.read())
+    pt.close()
+    server = Server(pool_size=1)
+    try:
+        job = ParallelJob('write_pdf_first_page', 'Extract first page of pdf',
+            lambda x,y:x, args=[pt.name, opath])
+        server.add_job(job)
+        while not job.is_finished:
+            time.sleep(0.1)
+            job.update()
+
+        job.update()
+    finally:
+        # Shut the worker down even if submitting or polling the job raised
+        server.close()
+    if not job.result:
+        raise ValueError('Failed to extract first page: ' + job.details)
+
+def write_first_page_(inpath, outpath):
+    '''
+    Save a copy of the PDF at `inpath` that contains only its first page to
+    `outpath`. A PDF with exactly one page is copied unchanged.
+
+    Returns True on success and raises ValueError if the PDF has no pages.
+    '''
+    p = podofo.PDFDoc()
+    p.open(inpath)
+    pages = p.pages
+    if pages < 1:
+        raise ValueError('PDF has no pages')
+    if pages == 1:
+        shutil.copyfile(inpath, outpath)
+        return True
+    # Delete the pages-1 pages that follow the first one (the start index of
+    # 1 presumes zero based page numbering in podofo)
+    p.delete_pages(1, pages-1)
+    p.save(outpath)
+    return True
+
 def get_metadata(stream):
     if not podofo:
         raise Unavailable(podofo_err)
diff --git a/src/calibre/utils/podofo/podofo.cpp b/src/calibre/utils/podofo/podofo.cpp
index e81cf0b475..e9c7bb4346 100644
--- a/src/calibre/utils/podofo/podofo.cpp
+++ b/src/calibre/utils/podofo/podofo.cpp
@@ -100,6 +100,33 @@ podofo_PDFDoc_save(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
     return Py_None;
 }
 
+// Getter for the read only "pages" attribute: the page count reported by
+// GetPageCount() as a Python int.
+static PyObject *
+podofo_PDFDoc_pages_getter(podofo_PDFDoc *self, void *closure) {
+    int pages = self->doc->GetPageCount();
+    // PyInt_FromLong() returns a new reference (or NULL on failure) that is
+    // handed straight to the caller; an extra Py_INCREF here would leak it.
+    return PyInt_FromLong(static_cast<long>(pages));
+}
+
+// delete_pages(first_page, num_pages): calls DeletePages() on the document.
+// A PdfError is reported through podofo_set_exception() and NULL is returned.
+static PyObject *
+podofo_PDFDoc_delete_pages(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
+    int first_page, num_pages;
+    if (PyArg_ParseTuple(args, "ii", &first_page, &num_pages)) {
+        try {
+            self->doc->DeletePages(first_page, num_pages);
+        } catch(const PdfError & err) {
+            podofo_set_exception(err);
+            return NULL;
+        }
+    } else return NULL;
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+
 static PyObject *
 podofo_convert_pdfstring(const PdfString &s) {
     std::string raw = s.GetStringUtf8();
@@ -256,33 +283,41 @@ static PyMethodDef podofo_PDFDoc_methods[] = {
     {"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
      "Save the PDF document to a path on disk"
     },
+    {"delete_pages", (PyCFunction)podofo_PDFDoc_delete_pages, METH_VARARGS,
+     "delete_pages(first_page, num_pages)\nDelete num_pages pages from the PDF document, starting at first_page."
+    },
+
     {NULL}  /* Sentinel */
 };
 
-static PyGetSetDef podofo_PDFDoc_getseters[] = {
-    {"title",
+static PyGetSetDef podofo_PDFDoc_getsetters[] = {
+    {(char *)"title",
      (getter)podofo_PDFDoc_title_getter, (setter)podofo_PDFDoc_title_setter,
-     "Document title",
+     (char *)"Document title",
      NULL},
-    {"author",
+    {(char *)"author",
      (getter)podofo_PDFDoc_author_getter, (setter)podofo_PDFDoc_author_setter,
-     "Document author",
+     (char *)"Document author",
      NULL},
-    {"subject",
+    {(char *)"subject",
      (getter)podofo_PDFDoc_subject_getter, (setter)podofo_PDFDoc_subject_setter,
-     "Document subject",
+     (char *)"Document subject",
      NULL},
-    {"keywords",
+    {(char *)"keywords",
      (getter)podofo_PDFDoc_keywords_getter, (setter)podofo_PDFDoc_keywords_setter,
-     "Document keywords",
+     (char *)"Document keywords",
      NULL},
-    {"creator",
+    {(char *)"creator",
      (getter)podofo_PDFDoc_creator_getter, (setter)podofo_PDFDoc_creator_setter,
-     "Document creator",
+     (char *)"Document creator",
      NULL},
-    {"producer",
+    {(char *)"producer",
      (getter)podofo_PDFDoc_producer_getter, (setter)podofo_PDFDoc_producer_setter,
-     "Document producer",
+     (char *)"Document producer",
+     NULL},
+    {(char *)"pages",
+     (getter)podofo_PDFDoc_pages_getter, NULL,
+     (char *)"Number of pages in document (read only)",
      NULL},
 
     {NULL}  /* Sentinel */
@@ -319,7 +354,7 @@ static PyTypeObject podofo_PDFDocType = {
     0,                         /* tp_iternext */
     podofo_PDFDoc_methods,     /* tp_methods */
     0,                         /* tp_members */
-    podofo_PDFDoc_getseters,   /* tp_getset */
+    podofo_PDFDoc_getsetters,  /* tp_getset */
     0,                         /* tp_base */
     0,                         /* tp_dict */
     0,                         /* tp_descr_get */