From 6c0b18461cf89c19d36344d65be58a482f30f85a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 6 Jun 2009 11:15:48 -0700 Subject: [PATCH] Speed up PDF cover extraction --- src/calibre/ebooks/metadata/pdf.py | 42 ++++++++----------- src/calibre/utils/ipc/worker.py | 3 -- src/calibre/utils/podofo/__init__.py | 63 ++++++++++++---------------- src/calibre/utils/podofo/podofo.cpp | 25 +++++------ 4 files changed, 55 insertions(+), 78 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 0f7c1e4a89..0dcecac740 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -18,8 +18,7 @@ except: from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.utils.pdftk import set_metadata as pdftk_set_metadata from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ - set_metadata as podofo_set_metadata, Unavailable, write_first_page, \ - get_metadata_quick + set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick def get_quick_metadata(stream): raw = stream.read() @@ -32,19 +31,18 @@ def get_quick_metadata(stream): def get_metadata(stream, extract_cover=True): try: - mi = podofo_get_metadata(stream) + with TemporaryDirectory('_pdfmeta') as tdir: + cpath = os.path.join(tdir, 'cover.pdf') + if not extract_cover: + cpath = None + mi = podofo_get_metadata(stream, cpath=cpath) + if mi.cover is not None: + cdata = get_cover(mi.cover) + mi.cover = None + if cdata is not None: + mi.cover_data = ('jpg', cdata) except Unavailable: mi = get_metadata_pypdf(stream) - stream.seek(0) - - if extract_cover and _imagemagick_loaded: - try: - cdata = get_cover(stream) - if cdata is not None: - mi.cover_data = ('jpg', cdata) - except: - import traceback - traceback.print_exc() return mi @@ -127,17 +125,13 @@ def set_metadata_pypdf(stream, mi): stream.write(out_str.read()) stream.seek(0) -def get_cover(stream): - stream.seek(0) - with TemporaryDirectory('_pdfmeta') as tdir: - cover_path = os.path.join(tdir, 'cover.pdf') - write_first_page(stream, cover_path) - with ImageMagick(): - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - return open('%s.jpg' % cover_path, 'rb').read() +def get_cover(cover_path): + with ImageMagick(): + wand = NewMagickWand() + MagickReadImage(wand, cover_path) + MagickSetImageFormat(wand, 'JPEG') + MagickWriteImage(wand, '%s.jpg' % cover_path) + return open('%s.jpg' % cover_path, 'rb').read() diff --git a/src/calibre/utils/ipc/worker.py b/src/calibre/utils/ipc/worker.py index 0e637a6b55..a53d1818ba 100644 --- a/src/calibre/utils/ipc/worker.py +++ b/src/calibre/utils/ipc/worker.py @@ -39,9 +39,6 @@ PARALLEL_FUNCS = { 'write_pdf_metadata' : ('calibre.utils.podofo.__init__', 'set_metadata_', None), - 'write_pdf_first_page' : - ('calibre.utils.podofo.__init__', 'write_first_page_', None), - 'save_book' : ('calibre.ebooks.metadata.worker', 'save_book', 'notification'), } diff --git a/src/calibre/utils/podofo/__init__.py b/src/calibre/utils/podofo/__init__.py index d52e1b5658..095160a639 100644 --- a/src/calibre/utils/podofo/__init__.py +++ b/src/calibre/utils/podofo/__init__.py @@ -19,39 +19,7 @@ podofo, podofo_err = plugins['podofo'] class Unavailable(Exception): pass -def write_first_page(stream, opath): - if not podofo: - raise Unavailable(podofo_err) - pt = PersistentTemporaryFile('_podofo.pdf') - pt.write(stream.read()) - pt.close() - server = Server(pool_size=1) - job = ParallelJob('write_pdf_first_page', 'Extract first page of pdf', - lambda x,y:x, args=[pt.name, opath]) - server.add_job(job) - while not job.is_finished: - time.sleep(0.1) - job.update() - - job.update() - server.close() - if not job.result: - raise ValueError('Failed to extract first page: ' + job.details) - -def write_first_page_(inpath, outpath): - p = podofo.PDFDoc() - p.open(inpath) - pages = p.pages - if pages < 1: - raise ValueError('PDF has no pages') - if pages == 1: - shutil.copyfile(inpath, outpath) - return True - p.delete_pages(1, pages-1) - p.save(outpath) - return True - -def get_metadata(stream): +def get_metadata(stream, cpath=None): if not podofo: raise Unavailable(podofo_err) pt = PersistentTemporaryFile('_podofo.pdf') @@ -59,7 +27,7 @@ def get_metadata(stream): pt.close() server = Server(pool_size=1) job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', - lambda x,y:x, args=[pt.name]) + lambda x,y:x, args=[pt.name, cpath]) server.add_job(job) while not job.is_finished: time.sleep(0.1) @@ -69,7 +37,10 @@ def get_metadata(stream): server.close() if job.result is None: raise ValueError('Failed to read metadata: ' + job.details) - title, authors, creator = job.result + title, authors, creator, ok = job.result + if not ok: + print 'Failed to extract cover:' + print job.details if title == '_': title = getattr(stream, 'name', _('Unknown')) title = os.path.splitext(title)[0] @@ -78,6 +49,8 @@ def get_metadata(stream): if creator: mi.book_producer = creator if os.path.exists(pt.name): os.remove(pt.name) + if ok: + mi.cover = cpath return mi def get_metadata_quick(raw): @@ -95,7 +68,7 @@ def get_metadata_quick(raw): return mi -def get_metadata_(path): +def get_metadata_(path, cpath=None): p = podofo.PDFDoc() p.open(path) title = p.title @@ -104,7 +77,23 @@ def get_metadata_(path): author = p.author authors = string_to_authors(author) if author else [_('Unknown')] creator = p.creator - return (title, authors, creator) + ok = True + try: + if cpath is not None: + pages = p.pages + if pages < 1: + raise ValueError('PDF has no pages') + if True or pages == 1: + shutil.copyfile(path, cpath) + else: + p.extract_first_page() + p.save(cpath) + except: + import traceback + traceback.print_exc() + ok = False + + return (title, authors, creator, ok) def prep(val): if not val: diff --git a/src/calibre/utils/podofo/podofo.cpp b/src/calibre/utils/podofo/podofo.cpp index e64faa53e2..188ed0c5dc 100644 --- a/src/calibre/utils/podofo/podofo.cpp +++ b/src/calibre/utils/podofo/podofo.cpp @@ -143,18 +143,15 @@ podofo_PDFDoc_version_getter(podofo_PDFDoc *self, void *closure) { static PyObject * -podofo_PDFDoc_delete_pages(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { - int first_page, num_pages; - if (PyArg_ParseTuple(args, "ii", &first_page, &num_pages)) { - try { - self->doc->DeletePages(first_page, num_pages); - } catch(const PdfError & err) { - podofo_set_exception(err); - return NULL; - } - } else return NULL; - Py_INCREF(Py_None); - return Py_None; +podofo_PDFDoc_extract_first_page(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { + int i, num_pages; + try { + while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1); + } catch(const PdfError & err) { + podofo_set_exception(err); + return NULL; + } + Py_RETURN_NONE; } static PyObject * @@ -313,8 +310,8 @@ static PyMethodDef podofo_PDFDoc_methods[] = { {"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS, "Save the PDF document to a path on disk" }, - {"delete_pages", (PyCFunction)podofo_PDFDoc_delete_pages, METH_VARARGS, - "delete_pages(start_page, num_pages) -> int, int\nDelete pages from the PDF document." + {"extract_first_page", (PyCFunction)podofo_PDFDoc_extract_first_page, METH_VARARGS, + "extract_first_page() -> Remove all but the first page." }, {NULL} /* Sentinel */