Speed up PDF cover extraction

This commit is contained in:
Kovid Goyal 2009-06-06 11:15:48 -07:00
parent d6e4ce275f
commit 6c0b18461c
4 changed files with 55 additions and 78 deletions

View File

@ -18,8 +18,7 @@ except:
from calibre.ebooks.metadata import MetaInformation, authors_to_string from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
set_metadata as podofo_set_metadata, Unavailable, write_first_page, \ set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
get_metadata_quick
def get_quick_metadata(stream): def get_quick_metadata(stream):
raw = stream.read() raw = stream.read()
@ -32,19 +31,18 @@ def get_quick_metadata(stream):
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
try: try:
mi = podofo_get_metadata(stream) with TemporaryDirectory('_pdfmeta') as tdir:
cpath = os.path.join(tdir, 'cover.pdf')
if not extract_cover:
cpath = None
mi = podofo_get_metadata(stream, cpath=cpath)
if mi.cover is not None:
cdata = get_cover(mi.cover)
mi.cover = None
if cdata is not None:
mi.cover_data = ('jpg', cdata)
except Unavailable: except Unavailable:
mi = get_metadata_pypdf(stream) mi = get_metadata_pypdf(stream)
stream.seek(0)
if extract_cover and _imagemagick_loaded:
try:
cdata = get_cover(stream)
if cdata is not None:
mi.cover_data = ('jpg', cdata)
except:
import traceback
traceback.print_exc()
return mi return mi
@ -127,17 +125,13 @@ def set_metadata_pypdf(stream, mi):
stream.write(out_str.read()) stream.write(out_str.read())
stream.seek(0) stream.seek(0)
def get_cover(stream): def get_cover(cover_path):
stream.seek(0) with ImageMagick():
with TemporaryDirectory('_pdfmeta') as tdir: wand = NewMagickWand()
cover_path = os.path.join(tdir, 'cover.pdf') MagickReadImage(wand, cover_path)
write_first_page(stream, cover_path) MagickSetImageFormat(wand, 'JPEG')
with ImageMagick(): MagickWriteImage(wand, '%s.jpg' % cover_path)
wand = NewMagickWand() return open('%s.jpg' % cover_path, 'rb').read()
MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path)
return open('%s.jpg' % cover_path, 'rb').read()

View File

@ -39,9 +39,6 @@ PARALLEL_FUNCS = {
'write_pdf_metadata' : 'write_pdf_metadata' :
('calibre.utils.podofo.__init__', 'set_metadata_', None), ('calibre.utils.podofo.__init__', 'set_metadata_', None),
'write_pdf_first_page' :
('calibre.utils.podofo.__init__', 'write_first_page_', None),
'save_book' : 'save_book' :
('calibre.ebooks.metadata.worker', 'save_book', 'notification'), ('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
} }

View File

@ -19,39 +19,7 @@ podofo, podofo_err = plugins['podofo']
class Unavailable(Exception): pass class Unavailable(Exception): pass
def write_first_page(stream, opath): def get_metadata(stream, cpath=None):
if not podofo:
raise Unavailable(podofo_err)
pt = PersistentTemporaryFile('_podofo.pdf')
pt.write(stream.read())
pt.close()
server = Server(pool_size=1)
job = ParallelJob('write_pdf_first_page', 'Extract first page of pdf',
lambda x,y:x, args=[pt.name, opath])
server.add_job(job)
while not job.is_finished:
time.sleep(0.1)
job.update()
job.update()
server.close()
if not job.result:
raise ValueError('Failed to extract first page: ' + job.details)
def write_first_page_(inpath, outpath):
p = podofo.PDFDoc()
p.open(inpath)
pages = p.pages
if pages < 1:
raise ValueError('PDF has no pages')
if pages == 1:
shutil.copyfile(inpath, outpath)
return True
p.delete_pages(1, pages-1)
p.save(outpath)
return True
def get_metadata(stream):
if not podofo: if not podofo:
raise Unavailable(podofo_err) raise Unavailable(podofo_err)
pt = PersistentTemporaryFile('_podofo.pdf') pt = PersistentTemporaryFile('_podofo.pdf')
@ -59,7 +27,7 @@ def get_metadata(stream):
pt.close() pt.close()
server = Server(pool_size=1) server = Server(pool_size=1)
job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
lambda x,y:x, args=[pt.name]) lambda x,y:x, args=[pt.name, cpath])
server.add_job(job) server.add_job(job)
while not job.is_finished: while not job.is_finished:
time.sleep(0.1) time.sleep(0.1)
@ -69,7 +37,10 @@ def get_metadata(stream):
server.close() server.close()
if job.result is None: if job.result is None:
raise ValueError('Failed to read metadata: ' + job.details) raise ValueError('Failed to read metadata: ' + job.details)
title, authors, creator = job.result title, authors, creator, ok = job.result
if not ok:
print 'Failed to extract cover:'
print job.details
if title == '_': if title == '_':
title = getattr(stream, 'name', _('Unknown')) title = getattr(stream, 'name', _('Unknown'))
title = os.path.splitext(title)[0] title = os.path.splitext(title)[0]
@ -78,6 +49,8 @@ def get_metadata(stream):
if creator: if creator:
mi.book_producer = creator mi.book_producer = creator
if os.path.exists(pt.name): os.remove(pt.name) if os.path.exists(pt.name): os.remove(pt.name)
if ok:
mi.cover = cpath
return mi return mi
def get_metadata_quick(raw): def get_metadata_quick(raw):
@ -95,7 +68,7 @@ def get_metadata_quick(raw):
return mi return mi
def get_metadata_(path): def get_metadata_(path, cpath=None):
p = podofo.PDFDoc() p = podofo.PDFDoc()
p.open(path) p.open(path)
title = p.title title = p.title
@ -104,7 +77,23 @@ def get_metadata_(path):
author = p.author author = p.author
authors = string_to_authors(author) if author else [_('Unknown')] authors = string_to_authors(author) if author else [_('Unknown')]
creator = p.creator creator = p.creator
return (title, authors, creator) ok = True
try:
if cpath is not None:
pages = p.pages
if pages < 1:
raise ValueError('PDF has no pages')
if True or pages == 1:
shutil.copyfile(path, cpath)
else:
p.extract_first_page()
p.save(cpath)
except:
import traceback
traceback.print_exc()
ok = False
return (title, authors, creator, ok)
def prep(val): def prep(val):
if not val: if not val:

View File

@ -143,18 +143,15 @@ podofo_PDFDoc_version_getter(podofo_PDFDoc *self, void *closure) {
static PyObject * static PyObject *
podofo_PDFDoc_delete_pages(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) { podofo_PDFDoc_extract_first_page(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
int first_page, num_pages; int i, num_pages;
if (PyArg_ParseTuple(args, "ii", &first_page, &num_pages)) { try {
try { while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
self->doc->DeletePages(first_page, num_pages); } catch(const PdfError & err) {
} catch(const PdfError & err) { podofo_set_exception(err);
podofo_set_exception(err); return NULL;
return NULL; }
} Py_RETURN_NONE;
} else return NULL;
Py_INCREF(Py_None);
return Py_None;
} }
static PyObject * static PyObject *
@ -313,8 +310,8 @@ static PyMethodDef podofo_PDFDoc_methods[] = {
{"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS, {"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
"Save the PDF document to a path on disk" "Save the PDF document to a path on disk"
}, },
{"delete_pages", (PyCFunction)podofo_PDFDoc_delete_pages, METH_VARARGS, {"extract_first_page", (PyCFunction)podofo_PDFDoc_extract_first_page, METH_VARARGS,
"delete_pages(start_page, num_pages) -> int, int\nDelete pages from the PDF document." "extract_first_page() -> Remove all but the first page."
}, },
{NULL} /* Sentinel */ {NULL} /* Sentinel */