mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up PDF cover extraction
This commit is contained in:
parent
d6e4ce275f
commit
6c0b18461c
@ -18,8 +18,7 @@ except:
|
|||||||
from calibre.ebooks.metadata import MetaInformation, authors_to_string
|
from calibre.ebooks.metadata import MetaInformation, authors_to_string
|
||||||
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
|
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
|
||||||
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
|
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
|
||||||
set_metadata as podofo_set_metadata, Unavailable, write_first_page, \
|
set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
|
||||||
get_metadata_quick
|
|
||||||
|
|
||||||
def get_quick_metadata(stream):
|
def get_quick_metadata(stream):
|
||||||
raw = stream.read()
|
raw = stream.read()
|
||||||
@ -32,19 +31,18 @@ def get_quick_metadata(stream):
|
|||||||
|
|
||||||
def get_metadata(stream, extract_cover=True):
|
def get_metadata(stream, extract_cover=True):
|
||||||
try:
|
try:
|
||||||
mi = podofo_get_metadata(stream)
|
with TemporaryDirectory('_pdfmeta') as tdir:
|
||||||
|
cpath = os.path.join(tdir, 'cover.pdf')
|
||||||
|
if not extract_cover:
|
||||||
|
cpath = None
|
||||||
|
mi = podofo_get_metadata(stream, cpath=cpath)
|
||||||
|
if mi.cover is not None:
|
||||||
|
cdata = get_cover(mi.cover)
|
||||||
|
mi.cover = None
|
||||||
|
if cdata is not None:
|
||||||
|
mi.cover_data = ('jpg', cdata)
|
||||||
except Unavailable:
|
except Unavailable:
|
||||||
mi = get_metadata_pypdf(stream)
|
mi = get_metadata_pypdf(stream)
|
||||||
stream.seek(0)
|
|
||||||
|
|
||||||
if extract_cover and _imagemagick_loaded:
|
|
||||||
try:
|
|
||||||
cdata = get_cover(stream)
|
|
||||||
if cdata is not None:
|
|
||||||
mi.cover_data = ('jpg', cdata)
|
|
||||||
except:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
|
|
||||||
@ -127,17 +125,13 @@ def set_metadata_pypdf(stream, mi):
|
|||||||
stream.write(out_str.read())
|
stream.write(out_str.read())
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
|
||||||
def get_cover(stream):
|
def get_cover(cover_path):
|
||||||
stream.seek(0)
|
with ImageMagick():
|
||||||
with TemporaryDirectory('_pdfmeta') as tdir:
|
wand = NewMagickWand()
|
||||||
cover_path = os.path.join(tdir, 'cover.pdf')
|
MagickReadImage(wand, cover_path)
|
||||||
write_first_page(stream, cover_path)
|
MagickSetImageFormat(wand, 'JPEG')
|
||||||
with ImageMagick():
|
MagickWriteImage(wand, '%s.jpg' % cover_path)
|
||||||
wand = NewMagickWand()
|
return open('%s.jpg' % cover_path, 'rb').read()
|
||||||
MagickReadImage(wand, cover_path)
|
|
||||||
MagickSetImageFormat(wand, 'JPEG')
|
|
||||||
MagickWriteImage(wand, '%s.jpg' % cover_path)
|
|
||||||
return open('%s.jpg' % cover_path, 'rb').read()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,9 +39,6 @@ PARALLEL_FUNCS = {
|
|||||||
'write_pdf_metadata' :
|
'write_pdf_metadata' :
|
||||||
('calibre.utils.podofo.__init__', 'set_metadata_', None),
|
('calibre.utils.podofo.__init__', 'set_metadata_', None),
|
||||||
|
|
||||||
'write_pdf_first_page' :
|
|
||||||
('calibre.utils.podofo.__init__', 'write_first_page_', None),
|
|
||||||
|
|
||||||
'save_book' :
|
'save_book' :
|
||||||
('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
|
('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
|
||||||
}
|
}
|
||||||
|
@ -19,39 +19,7 @@ podofo, podofo_err = plugins['podofo']
|
|||||||
|
|
||||||
class Unavailable(Exception): pass
|
class Unavailable(Exception): pass
|
||||||
|
|
||||||
def write_first_page(stream, opath):
|
def get_metadata(stream, cpath=None):
|
||||||
if not podofo:
|
|
||||||
raise Unavailable(podofo_err)
|
|
||||||
pt = PersistentTemporaryFile('_podofo.pdf')
|
|
||||||
pt.write(stream.read())
|
|
||||||
pt.close()
|
|
||||||
server = Server(pool_size=1)
|
|
||||||
job = ParallelJob('write_pdf_first_page', 'Extract first page of pdf',
|
|
||||||
lambda x,y:x, args=[pt.name, opath])
|
|
||||||
server.add_job(job)
|
|
||||||
while not job.is_finished:
|
|
||||||
time.sleep(0.1)
|
|
||||||
job.update()
|
|
||||||
|
|
||||||
job.update()
|
|
||||||
server.close()
|
|
||||||
if not job.result:
|
|
||||||
raise ValueError('Failed to extract first page: ' + job.details)
|
|
||||||
|
|
||||||
def write_first_page_(inpath, outpath):
|
|
||||||
p = podofo.PDFDoc()
|
|
||||||
p.open(inpath)
|
|
||||||
pages = p.pages
|
|
||||||
if pages < 1:
|
|
||||||
raise ValueError('PDF has no pages')
|
|
||||||
if pages == 1:
|
|
||||||
shutil.copyfile(inpath, outpath)
|
|
||||||
return True
|
|
||||||
p.delete_pages(1, pages-1)
|
|
||||||
p.save(outpath)
|
|
||||||
return True
|
|
||||||
|
|
||||||
def get_metadata(stream):
|
|
||||||
if not podofo:
|
if not podofo:
|
||||||
raise Unavailable(podofo_err)
|
raise Unavailable(podofo_err)
|
||||||
pt = PersistentTemporaryFile('_podofo.pdf')
|
pt = PersistentTemporaryFile('_podofo.pdf')
|
||||||
@ -59,7 +27,7 @@ def get_metadata(stream):
|
|||||||
pt.close()
|
pt.close()
|
||||||
server = Server(pool_size=1)
|
server = Server(pool_size=1)
|
||||||
job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
|
job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
|
||||||
lambda x,y:x, args=[pt.name])
|
lambda x,y:x, args=[pt.name, cpath])
|
||||||
server.add_job(job)
|
server.add_job(job)
|
||||||
while not job.is_finished:
|
while not job.is_finished:
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
@ -69,7 +37,10 @@ def get_metadata(stream):
|
|||||||
server.close()
|
server.close()
|
||||||
if job.result is None:
|
if job.result is None:
|
||||||
raise ValueError('Failed to read metadata: ' + job.details)
|
raise ValueError('Failed to read metadata: ' + job.details)
|
||||||
title, authors, creator = job.result
|
title, authors, creator, ok = job.result
|
||||||
|
if not ok:
|
||||||
|
print 'Failed to extract cover:'
|
||||||
|
print job.details
|
||||||
if title == '_':
|
if title == '_':
|
||||||
title = getattr(stream, 'name', _('Unknown'))
|
title = getattr(stream, 'name', _('Unknown'))
|
||||||
title = os.path.splitext(title)[0]
|
title = os.path.splitext(title)[0]
|
||||||
@ -78,6 +49,8 @@ def get_metadata(stream):
|
|||||||
if creator:
|
if creator:
|
||||||
mi.book_producer = creator
|
mi.book_producer = creator
|
||||||
if os.path.exists(pt.name): os.remove(pt.name)
|
if os.path.exists(pt.name): os.remove(pt.name)
|
||||||
|
if ok:
|
||||||
|
mi.cover = cpath
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def get_metadata_quick(raw):
|
def get_metadata_quick(raw):
|
||||||
@ -95,7 +68,7 @@ def get_metadata_quick(raw):
|
|||||||
return mi
|
return mi
|
||||||
|
|
||||||
|
|
||||||
def get_metadata_(path):
|
def get_metadata_(path, cpath=None):
|
||||||
p = podofo.PDFDoc()
|
p = podofo.PDFDoc()
|
||||||
p.open(path)
|
p.open(path)
|
||||||
title = p.title
|
title = p.title
|
||||||
@ -104,7 +77,23 @@ def get_metadata_(path):
|
|||||||
author = p.author
|
author = p.author
|
||||||
authors = string_to_authors(author) if author else [_('Unknown')]
|
authors = string_to_authors(author) if author else [_('Unknown')]
|
||||||
creator = p.creator
|
creator = p.creator
|
||||||
return (title, authors, creator)
|
ok = True
|
||||||
|
try:
|
||||||
|
if cpath is not None:
|
||||||
|
pages = p.pages
|
||||||
|
if pages < 1:
|
||||||
|
raise ValueError('PDF has no pages')
|
||||||
|
if True or pages == 1:
|
||||||
|
shutil.copyfile(path, cpath)
|
||||||
|
else:
|
||||||
|
p.extract_first_page()
|
||||||
|
p.save(cpath)
|
||||||
|
except:
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
ok = False
|
||||||
|
|
||||||
|
return (title, authors, creator, ok)
|
||||||
|
|
||||||
def prep(val):
|
def prep(val):
|
||||||
if not val:
|
if not val:
|
||||||
|
@ -143,18 +143,15 @@ podofo_PDFDoc_version_getter(podofo_PDFDoc *self, void *closure) {
|
|||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
podofo_PDFDoc_delete_pages(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
podofo_PDFDoc_extract_first_page(podofo_PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
int first_page, num_pages;
|
int i, num_pages;
|
||||||
if (PyArg_ParseTuple(args, "ii", &first_page, &num_pages)) {
|
try {
|
||||||
try {
|
while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
|
||||||
self->doc->DeletePages(first_page, num_pages);
|
} catch(const PdfError & err) {
|
||||||
} catch(const PdfError & err) {
|
podofo_set_exception(err);
|
||||||
podofo_set_exception(err);
|
return NULL;
|
||||||
return NULL;
|
}
|
||||||
}
|
Py_RETURN_NONE;
|
||||||
} else return NULL;
|
|
||||||
Py_INCREF(Py_None);
|
|
||||||
return Py_None;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
@ -313,8 +310,8 @@ static PyMethodDef podofo_PDFDoc_methods[] = {
|
|||||||
{"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
|
{"save", (PyCFunction)podofo_PDFDoc_save, METH_VARARGS,
|
||||||
"Save the PDF document to a path on disk"
|
"Save the PDF document to a path on disk"
|
||||||
},
|
},
|
||||||
{"delete_pages", (PyCFunction)podofo_PDFDoc_delete_pages, METH_VARARGS,
|
{"extract_first_page", (PyCFunction)podofo_PDFDoc_extract_first_page, METH_VARARGS,
|
||||||
"delete_pages(start_page, num_pages) -> int, int\nDelete pages from the PDF document."
|
"extract_first_page() -> Remove all but the first page."
|
||||||
},
|
},
|
||||||
|
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user