mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
More podofo refactoring
This commit is contained in:
parent
1136f26186
commit
238bc24cf2
@ -148,7 +148,6 @@ extensions = [
|
|||||||
libraries=['podofo'],
|
libraries=['podofo'],
|
||||||
lib_dirs=[podofo_lib],
|
lib_dirs=[podofo_lib],
|
||||||
inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)],
|
inc_dirs=[podofo_inc, os.path.dirname(podofo_inc)],
|
||||||
optional=True,
|
|
||||||
error=podofo_error),
|
error=podofo_error),
|
||||||
|
|
||||||
Extension('pictureflow',
|
Extension('pictureflow',
|
||||||
|
@ -43,12 +43,6 @@ PARALLEL_FUNCS = {
|
|||||||
'read_metadata' :
|
'read_metadata' :
|
||||||
('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
|
('calibre.ebooks.metadata.worker', 'read_metadata_', 'notification'),
|
||||||
|
|
||||||
'read_pdf_metadata' :
|
|
||||||
('calibre.utils.podofo.__init__', 'get_metadata_', None),
|
|
||||||
|
|
||||||
'write_pdf_metadata' :
|
|
||||||
('calibre.utils.podofo.__init__', 'set_metadata_', None),
|
|
||||||
|
|
||||||
'save_book' :
|
'save_book' :
|
||||||
('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
|
('calibre.ebooks.metadata.worker', 'save_book', 'notification'),
|
||||||
|
|
||||||
|
@ -6,109 +6,12 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, time, shutil
|
import os, shutil
|
||||||
|
|
||||||
from calibre.constants import plugins, preferred_encoding
|
from calibre.constants import plugins, preferred_encoding
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
|
from calibre.ebooks.metadata import authors_to_string
|
||||||
authors_to_string
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.utils.ipc.job import ParallelJob
|
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||||
from calibre.utils.ipc.server import Server
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile, TemporaryFile
|
|
||||||
from calibre import prints
|
|
||||||
|
|
||||||
podofo, podofo_err = plugins['podofo']
|
|
||||||
|
|
||||||
class Unavailable(Exception): pass
|
|
||||||
|
|
||||||
def get_metadata(stream, cpath=None):
|
|
||||||
if not podofo:
|
|
||||||
raise Unavailable(podofo_err)
|
|
||||||
pt = PersistentTemporaryFile('_podofo.pdf')
|
|
||||||
pt.write(stream.read())
|
|
||||||
pt.close()
|
|
||||||
server = Server(pool_size=1)
|
|
||||||
job = ParallelJob('read_pdf_metadata', 'Read pdf metadata',
|
|
||||||
lambda x,y:x, args=[pt.name, cpath])
|
|
||||||
server.add_job(job)
|
|
||||||
while not job.is_finished:
|
|
||||||
time.sleep(0.1)
|
|
||||||
job.update()
|
|
||||||
|
|
||||||
job.update()
|
|
||||||
server.close()
|
|
||||||
if job.result is None:
|
|
||||||
raise ValueError('Failed to read metadata: ' + job.details)
|
|
||||||
title, authors, creator, tags, ok = job.result
|
|
||||||
if not ok:
|
|
||||||
print 'Failed to extract cover:'
|
|
||||||
print job.details
|
|
||||||
if title == '_':
|
|
||||||
title = getattr(stream, 'name', _('Unknown'))
|
|
||||||
title = os.path.splitext(title)[0]
|
|
||||||
|
|
||||||
mi = MetaInformation(title, authors)
|
|
||||||
if creator:
|
|
||||||
mi.book_producer = creator
|
|
||||||
if tags:
|
|
||||||
mi.tags = tags
|
|
||||||
if os.path.exists(pt.name): os.remove(pt.name)
|
|
||||||
if ok:
|
|
||||||
mi.cover = cpath
|
|
||||||
return mi
|
|
||||||
|
|
||||||
def get_metadata_quick(raw):
|
|
||||||
p = podofo.PDFDoc()
|
|
||||||
p.load(raw)
|
|
||||||
title = p.title
|
|
||||||
if not title:
|
|
||||||
title = '_'
|
|
||||||
author = p.author
|
|
||||||
authors = string_to_authors(author) if author else [_('Unknown')]
|
|
||||||
creator = p.creator
|
|
||||||
try:
|
|
||||||
tags = [x.strip() for x in p.keywords.split(u',')]
|
|
||||||
tags = [x for x in tags if x]
|
|
||||||
except:
|
|
||||||
tags = []
|
|
||||||
|
|
||||||
mi = MetaInformation(title, authors)
|
|
||||||
if creator:
|
|
||||||
mi.book_producer = creator
|
|
||||||
if tags:
|
|
||||||
mi.tags = tags
|
|
||||||
return mi
|
|
||||||
|
|
||||||
def get_metadata_(path, cpath=None):
|
|
||||||
p = podofo.PDFDoc()
|
|
||||||
p.open(path)
|
|
||||||
title = p.title
|
|
||||||
if not title:
|
|
||||||
title = '_'
|
|
||||||
author = p.author
|
|
||||||
authors = string_to_authors(author) if author else [_('Unknown')]
|
|
||||||
creator = p.creator
|
|
||||||
try:
|
|
||||||
tags = [x.strip() for x in p.keywords.split(u',')]
|
|
||||||
tags = [x for x in tags if x]
|
|
||||||
except:
|
|
||||||
tags = []
|
|
||||||
ok = True
|
|
||||||
try:
|
|
||||||
if cpath is not None:
|
|
||||||
pages = p.pages
|
|
||||||
if pages < 1:
|
|
||||||
raise ValueError('PDF has no pages')
|
|
||||||
if True or pages == 1:
|
|
||||||
shutil.copyfile(path, cpath)
|
|
||||||
else:
|
|
||||||
p.extract_first_page()
|
|
||||||
p.save(cpath)
|
|
||||||
except:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
ok = False
|
|
||||||
|
|
||||||
return (title, authors, creator, tags, ok)
|
|
||||||
|
|
||||||
def prep(val):
|
def prep(val):
|
||||||
if not val:
|
if not val:
|
||||||
@ -118,27 +21,16 @@ def prep(val):
|
|||||||
return val.strip()
|
return val.strip()
|
||||||
|
|
||||||
def set_metadata(stream, mi):
|
def set_metadata(stream, mi):
|
||||||
if not podofo:
|
with TemporaryDirectory(u'_podofo_set_metadata') as tdir:
|
||||||
raise Unavailable(podofo_err)
|
with open(os.path.join(tdir, u'input.pdf'), 'wb') as f:
|
||||||
with TemporaryFile('_podofo_read.pdf') as inputf, \
|
|
||||||
TemporaryFile('_podofo_write.pdf') as outputf:
|
|
||||||
server = Server(pool_size=1)
|
|
||||||
with open(inputf, 'wb') as f:
|
|
||||||
shutil.copyfileobj(stream, f)
|
shutil.copyfileobj(stream, f)
|
||||||
job = ParallelJob('write_pdf_metadata', 'Write pdf metadata',
|
try:
|
||||||
lambda x,y:x, args=[inputf, outputf, mi.title, mi.authors,
|
touched = fork_job('calibre.utils.podofo', 'set_metadata_', (tdir,
|
||||||
mi.book_producer, mi.tags])
|
mi.title, mi.authors, mi.book_producer, mi.tags))
|
||||||
server.add_job(job)
|
except WorkerError as e:
|
||||||
while not job.is_finished:
|
raise Exception('Failed to set PDF metadata: %s'%e.orig_tb)
|
||||||
time.sleep(0.1)
|
if touched:
|
||||||
job.update()
|
with open(os.path.join(tdir, u'output.pdf'), 'rb') as f:
|
||||||
|
|
||||||
job.update()
|
|
||||||
server.close()
|
|
||||||
if job.failed:
|
|
||||||
prints(job.details)
|
|
||||||
elif job.result:
|
|
||||||
with open(outputf, 'rb') as f:
|
|
||||||
f.seek(0, 2)
|
f.seek(0, 2)
|
||||||
if f.tell() > 100:
|
if f.tell() > 100:
|
||||||
f.seek(0)
|
f.seek(0)
|
||||||
@ -148,10 +40,14 @@ def set_metadata(stream, mi):
|
|||||||
stream.flush()
|
stream.flush()
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
|
||||||
|
def set_metadata_(tdir, title, authors, bkp, tags):
|
||||||
|
podofo, podofo_err = plugins['podofo']
|
||||||
|
if podofo is None:
|
||||||
|
raise RuntimeError('Failed to load podofo: %s'%podofo_err)
|
||||||
|
|
||||||
def set_metadata_(path, opath, title, authors, bkp, tags):
|
os.chdir(tdir)
|
||||||
p = podofo.PDFDoc()
|
p = podofo.PDFDoc()
|
||||||
p.open(path)
|
p.open(u'input.pdf')
|
||||||
title = prep(title)
|
title = prep(title)
|
||||||
touched = False
|
touched = False
|
||||||
if title and title != p.title:
|
if title and title != p.title:
|
||||||
@ -177,27 +73,32 @@ def set_metadata_(path, opath, title, authors, bkp, tags):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
if touched:
|
if touched:
|
||||||
p.save(opath)
|
p.save(u'output.pdf')
|
||||||
return True
|
|
||||||
return False
|
return touched
|
||||||
|
|
||||||
def delete_all_but(path, pages):
|
def delete_all_but(path, pages):
|
||||||
''' Delete all the pages in the pdf except for the specified ones. Negative
|
''' Delete all the pages in the pdf except for the specified ones. Negative
|
||||||
numbers are counted from the end of the PDF. '''
|
numbers are counted from the end of the PDF. '''
|
||||||
with TemporaryFile('_podofo_in.pdf') as of:
|
podofo, podofo_err = plugins['podofo']
|
||||||
shutil.copyfile(path, of)
|
if podofo is None:
|
||||||
|
raise RuntimeError('Failed to load podofo: %s'%podofo_err)
|
||||||
|
|
||||||
p = podofo.PDFDoc()
|
p = podofo.PDFDoc()
|
||||||
p.open(of)
|
with open(path, 'rb') as f:
|
||||||
|
raw = f.read()
|
||||||
|
p.load(raw)
|
||||||
total = p.page_count()
|
total = p.page_count()
|
||||||
pages = { total + x if x < 0 else x for x in pages }
|
pages = { total + x if x < 0 else x for x in pages }
|
||||||
for page in xrange(total-1, -1, -1):
|
for page in xrange(total-1, -1, -1):
|
||||||
if page not in pages:
|
if page not in pages:
|
||||||
p.delete_page(page)
|
p.delete_page(page)
|
||||||
os.remove(path)
|
|
||||||
p.save(path)
|
raw = p.write()
|
||||||
|
with open(path, 'wb') as f:
|
||||||
|
f.write(raw)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
f = '/tmp/t.pdf'
|
f = u'/tmp/t.pdf'
|
||||||
delete_all_but(f, [0, 1, -2, -1])
|
delete_all_but(f, [0, 1, -2, -1])
|
||||||
|
|
||||||
|
@ -77,11 +77,72 @@ PDFDoc_save(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
|||||||
}
|
}
|
||||||
} else return NULL;
|
} else return NULL;
|
||||||
|
|
||||||
|
Py_RETURN_NONE;
|
||||||
Py_INCREF(Py_None);
|
|
||||||
return Py_None;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
PDFDoc_write(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
PyObject *ans;
|
||||||
|
PdfRefCountedBuffer buffer(1*1024*1024);
|
||||||
|
PdfOutputDevice out(&buffer);
|
||||||
|
|
||||||
|
try {
|
||||||
|
self->doc->Write(&out);
|
||||||
|
} catch(const PdfError &err) {
|
||||||
|
podofo_set_exception(err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ans = PyBytes_FromStringAndSize(buffer.GetBuffer(), out.Tell());
|
||||||
|
if (ans == NULL) PyErr_NoMemory();
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
try {
|
||||||
|
while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
|
||||||
|
} catch(const PdfError & err) {
|
||||||
|
podofo_set_exception(err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
Py_RETURN_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
int count;
|
||||||
|
try {
|
||||||
|
count = self->doc->GetPageCount();
|
||||||
|
} catch(const PdfError & err) {
|
||||||
|
podofo_set_exception(err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return Py_BuildValue("i", count);
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
int num = 0;
|
||||||
|
if (PyArg_ParseTuple(args, "i", &num)) {
|
||||||
|
try {
|
||||||
|
self->doc->DeletePages(num, 1);
|
||||||
|
} catch(const PdfError & err) {
|
||||||
|
podofo_set_exception(err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
} else return NULL;
|
||||||
|
|
||||||
|
Py_RETURN_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
PDFDoc_append(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
||||||
|
Py_RETURN_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Properties {{{
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
PDFDoc_pages_getter(PDFDoc *self, void *closure) {
|
PDFDoc_pages_getter(PDFDoc *self, void *closure) {
|
||||||
int pages = self->doc->GetPageCount();
|
int pages = self->doc->GetPageCount();
|
||||||
@ -123,46 +184,6 @@ PDFDoc_version_getter(PDFDoc *self, void *closure) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
|
||||||
PDFDoc_extract_first_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
|
||||||
try {
|
|
||||||
while (self->doc->GetPageCount() > 1) self->doc->GetPagesTree()->DeletePage(1);
|
|
||||||
} catch(const PdfError & err) {
|
|
||||||
podofo_set_exception(err);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
Py_RETURN_NONE;
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject *
|
|
||||||
PDFDoc_page_count(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
|
||||||
int count;
|
|
||||||
try {
|
|
||||||
count = self->doc->GetPageCount();
|
|
||||||
} catch(const PdfError & err) {
|
|
||||||
podofo_set_exception(err);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
return Py_BuildValue("i", count);
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject *
|
|
||||||
PDFDoc_delete_page(PDFDoc *self, PyObject *args, PyObject *kwargs) {
|
|
||||||
int num = 0;
|
|
||||||
if (PyArg_ParseTuple(args, "i", &num)) {
|
|
||||||
try {
|
|
||||||
self->doc->DeletePages(num, 1);
|
|
||||||
} catch(const PdfError & err) {
|
|
||||||
podofo_set_exception(err);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
} else return NULL;
|
|
||||||
|
|
||||||
Py_RETURN_NONE;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
PDFDoc_getter(PDFDoc *self, int field)
|
PDFDoc_getter(PDFDoc *self, int field)
|
||||||
{
|
{
|
||||||
@ -288,31 +309,6 @@ PDFDoc_producer_setter(PDFDoc *self, PyObject *val, void *closure) {
|
|||||||
return PDFDoc_setter(self, val, 5);
|
return PDFDoc_setter(self, val, 5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static PyMethodDef PDFDoc_methods[] = {
|
|
||||||
{"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
|
|
||||||
"Load a PDF document from a byte buffer (string)"
|
|
||||||
},
|
|
||||||
{"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
|
|
||||||
"Load a PDF document from a file path (string)"
|
|
||||||
},
|
|
||||||
{"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
|
|
||||||
"Save the PDF document to a path on disk"
|
|
||||||
},
|
|
||||||
{"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
|
|
||||||
"extract_first_page() -> Remove all but the first page."
|
|
||||||
},
|
|
||||||
{"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
|
|
||||||
"page_count() -> Number of pages in the PDF."
|
|
||||||
},
|
|
||||||
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
|
||||||
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
|
||||||
},
|
|
||||||
|
|
||||||
|
|
||||||
{NULL} /* Sentinel */
|
|
||||||
};
|
|
||||||
|
|
||||||
static PyGetSetDef PDFDoc_getsetters[] = {
|
static PyGetSetDef PDFDoc_getsetters[] = {
|
||||||
{(char *)"title",
|
{(char *)"title",
|
||||||
(getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter,
|
(getter)PDFDoc_title_getter, (setter)PDFDoc_title_setter,
|
||||||
@ -350,6 +346,39 @@ static PyGetSetDef PDFDoc_getsetters[] = {
|
|||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// }}}
|
||||||
|
|
||||||
|
static PyMethodDef PDFDoc_methods[] = {
|
||||||
|
{"load", (PyCFunction)PDFDoc_load, METH_VARARGS,
|
||||||
|
"Load a PDF document from a byte buffer (string)"
|
||||||
|
},
|
||||||
|
{"open", (PyCFunction)PDFDoc_open, METH_VARARGS,
|
||||||
|
"Load a PDF document from a file path (string)"
|
||||||
|
},
|
||||||
|
{"save", (PyCFunction)PDFDoc_save, METH_VARARGS,
|
||||||
|
"Save the PDF document to a path on disk"
|
||||||
|
},
|
||||||
|
{"write", (PyCFunction)PDFDoc_write, METH_VARARGS,
|
||||||
|
"Return the PDF document as a bytestring."
|
||||||
|
},
|
||||||
|
{"extract_first_page", (PyCFunction)PDFDoc_extract_first_page, METH_VARARGS,
|
||||||
|
"extract_first_page() -> Remove all but the first page."
|
||||||
|
},
|
||||||
|
{"page_count", (PyCFunction)PDFDoc_page_count, METH_VARARGS,
|
||||||
|
"page_count() -> Number of pages in the PDF."
|
||||||
|
},
|
||||||
|
{"delete_page", (PyCFunction)PDFDoc_delete_page, METH_VARARGS,
|
||||||
|
"delete_page(page_num) -> Delete the specified page from the pdf (0 is the first page)."
|
||||||
|
},
|
||||||
|
{"append", (PyCFunction)PDFDoc_append, METH_VARARGS,
|
||||||
|
"append(doc) -> Append doc (which must be a PDFDoc) to this document."
|
||||||
|
},
|
||||||
|
|
||||||
|
|
||||||
|
{NULL} /* Sentinel */
|
||||||
|
};
|
||||||
|
|
||||||
PyTypeObject pdf::PDFDocType = {
|
PyTypeObject pdf::PDFDocType = {
|
||||||
PyObject_HEAD_INIT(NULL)
|
PyObject_HEAD_INIT(NULL)
|
||||||
0, /*ob_size*/
|
0, /*ob_size*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user