Switch to using poppler to read PDF metadata. Hopefully it is faster/more stable. Fixes #3041 (version 0.6.3 doesn't delete temp files)

This commit is contained in:
Kovid Goyal 2009-08-01 15:07:33 -06:00
parent acd4055c7a
commit 929d74f718
8 changed files with 475 additions and 16 deletions

View File

@ -234,6 +234,12 @@ _check_symlinks_prescript()
print 'Adding PoDoFo'
pdf = glob.glob(os.path.expanduser('/Volumes/sw/podofo/libpodofo*.dylib'))[0]
shutil.copyfile(pdf, os.path.join(frameworks_dir, os.path.basename(pdf)))
print
print 'Adding poppler'
for x in ('pdftohtml', 'libpoppler.4.dylib', 'libpoppler-qt4.3.dylib'):
os.link(os.path.join(os.path.expanduser('~/poppler'), x),
os.path.join(frameworks_dir, x))
loader_path = os.path.join(resource_dir, 'loaders')
@ -248,14 +254,9 @@ _check_symlinks_prescript()
f.close()
os.chmod(path, stat.S_IXUSR|stat.S_IXGRP|stat.S_IXOTH|stat.S_IREAD\
|stat.S_IWUSR|stat.S_IROTH|stat.S_IRGRP)
self.add_plugins()
print
print 'Adding pdftohtml'
os.link(os.path.expanduser('~/pdftohtml/pdftohtml'), os.path.join(frameworks_dir, 'pdftohtml'))
os.link(os.path.expanduser('~/pdftohtml/libpoppler.4.dylib'),
os.path.join(frameworks_dir, 'libpoppler.4.dylib'))
print 'Adding fontconfig'
for f in glob.glob(os.path.expanduser('~/fontconfig-bundled/*')):
dest = os.path.join(frameworks_dir, os.path.basename(f))

View File

@ -65,6 +65,12 @@ No
Icon
Modern/Small/SetupModernSmall01.gif
IgnoreDirectories
{}
IgnoreFiles
{}
Image
Modern/SetupModern01.gif
@ -77,6 +83,9 @@ InstallPassword
InstallVersion
1.0.0.0
Language,ca
Yes
Language,de
No
@ -95,6 +104,9 @@ No
Language,it
No
Language,lt
Yes
Language,nl
No
@ -107,6 +119,12 @@ No
Language,ru
No
LastIgnoreDirectories
{}
LastIgnoreFiles
{}
LaunchApplication
Yes
@ -144,7 +162,7 @@ ProjectID
DA98A0C6-9102-73EC-2516-B147E972D3F7
ProjectVersion
1.2.13.1
1.2.13.8
SaveOnlyToplevelDirs
No
@ -547,8 +565,8 @@ File ::0450D3E0-07EB-F81F-DA39-038494E4C8FE -name win32com.shell.shell.pyd -pare
File ::8D7A36A6-4517-E995-E989-2E522E7A1438 -name calibre-smtp.exe.local -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
File ::9E4E5E8F-30C0-E631-9516-2AE01A5CA0E9 -name ebook-device.exe.local -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
File ::7BE6B538-70D5-A7EB-5F91-E14CE57B394B -name calibre-complete.exe.local -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
File ::F54BC3BB-5F21-3D81-043E-603D53754CFC -name pdftohtml.exe.manifest -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
File ::C4E40030-3EE0-8B05-E6B9-89E81433EE1F -name phonon4.dll -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
File ::9E84342F-36ED-7ED3-8F90-1EC55267BCFC -name poppler-qt4.dll -parent 8E5D85A4-7608-47A1-CF7C-309060D5FF40
Component ::F6829AB7-9F66-4CEE-CA0E-21F54C6D3609 -setup Install -active Yes -platforms {AIX-ppc FreeBSD-4-x86 FreeBSD-x86 HPUX-hppa Linux-x86 Solaris-sparc Windows FreeBSD-5-x86 FreeBSD-6-x86 FreeBSD-7-x86 Linux-x86_64 Solaris-x86} -name Main -parent Components
SetupType ::D9ADE41C-B744-690C-2CED-CF826BF03D2E -setup Install -active Yes -platforms {AIX-ppc FreeBSD-4-x86 FreeBSD-x86 HPUX-hppa Linux-x86 Solaris-sparc Windows FreeBSD-5-x86 FreeBSD-6-x86 FreeBSD-7-x86 Linux-x86_64 Solaris-x86} -name Typical -parent SetupTypes
@ -1023,6 +1041,9 @@ No
6661142D-D174-F52E-CD1D-6BFB3649BC64,Conditions
{2 conditions}
6661142D-D174-F52E-CD1D-6BFB3649BC64,Destination,subst
1
6661142D-D174-F52E-CD1D-6BFB3649BC64,DestinationLabel,subst
0
@ -2449,6 +2470,9 @@ Standard
Windows,InstallType
Typical
Windows,LastRequireAdministrator
Yes
Windows,ProgramExecutable
<%InstallDir%>/calibre.exe
@ -2467,6 +2491,12 @@ Windows,ProgramName
Windows,ProgramReadme
<%InstallDir%>/README.txt
Windows,RequireAdministrator
Yes
Windows,UseUncompressedBinaries
No
Windows,WindowsIcon
{Setup Blue Screen.ico}

View File

@ -10,6 +10,7 @@ QT_DIR = 'C:\\Qt\\4.5.2'
LIBUSB_DIR = 'C:\\libusb'
LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
PDFTOHTML = 'C:\\cygwin\\home\\kovid\\poppler-0.10.6\\rel\\pdftohtml.exe'
POPPLER = 'C:\\cygwin\\home\\kovid\\poppler'
IMAGEMAGICK_DIR = 'C:\\ImageMagick'
PDFTK = 'C:\\pdftk.exe'
PODOFO = 'C:\\podofo'
@ -98,13 +99,17 @@ class BuildEXE(py2exe.build_exe.py2exe):
shutil.copyfile(f, os.path.join(tdir, os.path.basename(f)))
print '\tAdding unrar'
shutil.copyfile(LIBUNRAR, os.path.join(PY2EXE_DIR, os.path.basename(LIBUNRAR)))
print '\tAdding pdftohtml'
shutil.copyfile(PDFTOHTML, os.path.join(PY2EXE_DIR, os.path.basename(PDFTOHTML)))
shutil.copyfile(PDFTOHTML+'.manifest', os.path.join(PY2EXE_DIR,
os.path.basename(PDFTOHTML)+'.manifest'))
print '\tAdding poppler'
for x in ('bin\\pdftohtml.exe', 'bin\\poppler-qt4.dll',
'bin\\freetype.dll', 'bin\\jpeg62.dll'):
shutil.copyfile(os.path.join(POPPLER, x),
os.path.join(PY2EXE_DIR, os.path.basename(x)))
#shutil.copyfile(PDFTOHTML, os.path.join(PY2EXE_DIR, os.path.basename(PDFTOHTML)))
#shutil.copyfile(PDFTOHTML+'.manifest', os.path.join(PY2EXE_DIR,
# os.path.basename(PDFTOHTML)+'.manifest'))
#print '\tAdding pdftk'
#shutil.copyfile(PDFTK, os.path.join(PY2EXE_DIR, os.path.basename(PDFTK)))
print 'Adding podofo'
print '\tAdding podofo'
for f in glob.glob(os.path.join(PODOFO, '*.dll')):
shutil.copyfile(f, os.path.join(PY2EXE_DIR, os.path.basename(f)))

View File

@ -58,6 +58,18 @@ if __name__ == '__main__':
entry_points['console_scripts'].append(
'calibre_postinstall = calibre.linux:post_install')
optional = []
qmake = '/Volumes/sw/qt/bin/qmake' if isosx else 'qmake'
qmake = os.environ.get('QMAKE', qmake)
raw = subprocess.Popen([qmake, '-query'],
stdout=subprocess.PIPE).stdout.read()
qt_inc = qt_lib = None
for line in raw.splitlines():
q, _, w = line.partition(':')
if q == 'QT_INSTALL_HEADERS':
qt_inc = w
elif q == 'QT_INSTALL_LIBS':
qt_lib = w
if iswindows:
optional.append(Extension('calibre.plugins.winutil',
@ -69,6 +81,28 @@ if __name__ == '__main__':
extra_compile_args=['/X']
))
poppler_inc = '/usr/include/poppler/qt4'
poppler_lib = '/usr/lib'
poppler_libs = []
if iswindows:
poppler_inc = r'C:\cygwin\home\kovid\poppler\include\poppler\qt4'
poppler_lib = r'C:\cygwin\home\kovid\poppler\lib'
poppler_libs = ['QtCore4', 'QtGui4']
if isosx:
poppler_inc = '/Volumes/sw/build/poppler-0.10.7/qt4/src'
poppler_lib = '/Users/kovid/poppler/lib'
poppler_inc = os.environ.get('POPPLER_INC_DIR', poppler_inc)
if os.path.exists(os.path.join(poppler_inc, 'poppler-qt4.h')):
optional.append(Extension('calibre.plugins.calibre_poppler',
sources=['src/calibre/utils/poppler/poppler.cpp'],
libraries=(['poppler', 'poppler-qt4']+poppler_libs),
library_dirs=[os.environ.get('POPPLER_LIB_DIR',
poppler_lib), qt_lib],
include_dirs=[poppler_inc, qt_inc]))
else:
print 'WARNING: Poppler not found on your system. Various PDF related',
print 'functionality will not work. Use the POPPLER_INC_DIR and',
print 'POPPLER_LIB_DIR environment variables.'
podofo_inc = '/usr/include/podofo' if islinux else \
'C:\\podofo\\include\\podofo' if iswindows else \
@ -84,7 +118,8 @@ if __name__ == '__main__':
include_dirs=[podofo_inc]))
else:
print 'WARNING: PoDoFo not found on your system. Various PDF related',
print 'functionality will not work.'
print 'functionality will not work. Use the PODOFO_INC_DIR and',
print 'PODOFO_LIB_DIR environment variables.'
fc_inc = '/usr/include/fontconfig' if islinux else \
r'C:\cygwin\home\kovid\fontconfig\include\fontconfig' if iswindows else \

View File

@ -60,7 +60,7 @@ if plugins is None:
sys.path.insert(0, plugin_path)
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc',
'fontconfig'] + \
'fontconfig', 'calibre_poppler'] + \
(['winutil'] if iswindows else []) + \
(['usbobserver'] if isosx else []):
try:

View File

@ -19,6 +19,7 @@ from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable
def get_quick_metadata(stream):
return get_metadata_pypdf(stream)
@ -31,6 +32,10 @@ def get_quick_metadata(stream):
def get_metadata(stream, extract_cover=True):
try:
return get_metadata_poppler(stream, extract_cover)
except NotAvailable:
pass
try:
with TemporaryDirectory('_pdfmeta') as tdir:
cpath = os.path.join(tdir, 'cover.pdf')

View File

@ -0,0 +1,55 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors
poppler, poppler_err = plugins['calibre_poppler']
class NotAvailable(Exception):
    '''Raised by get_metadata() when the calibre_poppler plugin is not loaded.'''
def get_metadata(stream, cover=True):
    '''
    Read metadata from a PDF using the calibre_poppler plugin.

    :param stream: File-like object; its entire remaining content is read
                   with ``stream.read()`` and handed to poppler.
    :param cover: If True, try to render page 0 as a JPEG cover. This is only
                  attempted when ``is_ok_to_use_qt()`` returns True.
    :return: A MetaInformation object with title and authors set, plus
             ``book_producer`` and ``cover_data`` when they are available.
    :raises NotAvailable: If the poppler plugin failed to load.
    '''
    if not poppler:
        raise NotAvailable('Failed to load poppler with error: '+poppler_err)
    raw = stream.read()
    doc = poppler.PDFDoc()
    doc.load(raw)

    title = doc.title
    if not title or not title.strip():
        title = _('Unknown')
        if hasattr(stream, 'name'):
            # Fall back to the file name alone; without basename() the whole
            # directory path would end up in the title.
            title = os.path.splitext(os.path.basename(stream.name))[0]

    author = doc.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = doc.creator
    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator

    if cover:
        from calibre.gui2 import is_ok_to_use_qt
        cdata = None
        if is_ok_to_use_qt():
            try:
                cdata = doc.render_page(0)
            except Exception:
                # A failed cover render is not fatal: report it and return
                # the metadata without a cover. KeyboardInterrupt and
                # SystemExit are deliberately not caught.
                import traceback
                traceback.print_exc()
        if cdata is not None:
            mi.cover_data = ('jpg', cdata)
    return mi

View File

@ -0,0 +1,328 @@
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <poppler-qt4.h>
#include <QtCore/QBuffer>
#include <QtGui/QImage>
/* Instance layout of calibre_poppler.PDFDoc objects. */
typedef struct {
PyObject_HEAD
/* The wrapped Poppler document. Set to NULL by tp_new and assigned by the
 * load() and open() methods. */
Poppler::Document *doc;
} poppler_PDFDoc;
extern "C" {
/* tp_dealloc slot: destroy the wrapped Poppler document (if any) and then
 * hand the object's memory back through the type's tp_free. */
static void
poppler_PDFDoc_dealloc(poppler_PDFDoc* self)
{
    /* delete on a NULL pointer is a no-op, so no explicit check is needed. */
    delete self->doc;
    self->ob_type->tp_free((PyObject*)self);
}
/* tp_new slot: allocate a PDFDoc instance with no document attached.
 * Returns NULL if the allocation fails. */
static PyObject *
poppler_PDFDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *obj = type->tp_alloc(type, 0);

    if (obj == NULL) {
        return NULL;
    }
    ((poppler_PDFDoc *)obj)->doc = NULL;
    return obj;
}
/* load(data): parse a PDF held in a Python byte string.
 *
 * On success the parsed document replaces any document this object held
 * before and None is returned. On failure a Python exception is set, NULL is
 * returned and the previously held document (if any) is left untouched.
 * Raises ValueError if Poppler cannot parse the data and OverflowError if
 * the buffer is too large to be described by an int length. */
static PyObject *
poppler_PDFDoc_load(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    char *buffer; Py_ssize_t size;
    Poppler::Document *loaded;

    if (!PyArg_ParseTuple(args, "s#", &buffer, &size)) return NULL;

    /* QByteArray lengths are ints; refuse rather than silently truncate. */
    if (static_cast<Py_ssize_t>(static_cast<int>(size)) != size) {
        PyErr_SetString(PyExc_OverflowError, "PDF data is too large to load.");
        return NULL;
    }

    /* Take a deep copy. The memory behind `buffer` belongs to the Python
     * argument and may be released once this call returns, so the byte array
     * given to Poppler must not alias it. */
    QByteArray data(buffer, static_cast<int>(size));

    loaded = Poppler::Document::loadFromData(data);
    if (loaded == NULL) {PyErr_SetString(PyExc_ValueError, "Could not load PDF file from data."); return NULL;}

    /* Free the previously loaded document instead of leaking it. */
    delete self->doc;
    self->doc = loaded;
    Py_RETURN_NONE;
}
}
/* Convert a Python unicode object to a QString by way of UTF-8.
 *
 * On failure a Python exception is set and a null QString is returned, so
 * callers should test PyErr_Occurred() after the call. Raises TypeError if
 * py is not a unicode object. */
static QString
poppler_convert_pystring(PyObject *py) {
    QString ans;
    PyObject *u8;

    /* PyUnicode_AS_UNICODE performs no type checking of its own. */
    if (py == NULL || !PyUnicode_Check(py)) {
        PyErr_SetString(PyExc_TypeError, "a unicode object is required");
        return ans;
    }
    u8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(py),
                              PyUnicode_GET_SIZE(py), "replace");
    /* The encoder has already set an exception describing the failure;
     * keep it rather than replacing it with MemoryError. */
    if (u8 == NULL) return ans;

    /* Pass the length explicitly so embedded NUL characters are preserved. */
    ans = QString::fromUtf8(PyString_AS_STRING(u8),
                            static_cast<int>(PyString_GET_SIZE(u8)));
    Py_DECREF(u8);
    return ans;
}
extern "C" {
/* Convert a QString to a new Python unicode object by way of UTF-8.
 * Returns a new reference, or NULL with a Python exception set. */
static PyObject *
poppler_convert_qstring(const QString &src) {
    QByteArray data = src.toUtf8();
    const char *cdata = data.constData();
    int sz = data.size();
    /* "strict" is a registered codec error handler; the name "error" used
     * previously is not, so a decoding problem would have surfaced as an
     * unrelated LookupError. */
    return PyUnicode_Decode(cdata, sz, "utf-8", "strict");
}
/* open(path): load a PDF from a file path given as a unicode object.
 *
 * On success the loaded document replaces any document this object held
 * before and None is returned. On failure a Python exception is set, NULL is
 * returned and the previously held document (if any) is left untouched.
 * Raises TypeError if path is not unicode and ValueError if Poppler cannot
 * load the file. */
static PyObject *
poppler_PDFDoc_open(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    PyObject *fname; QString _fname;
    Poppler::Document *opened;

    if (!PyArg_ParseTuple(args, "O", &fname)) return NULL;
    if (!PyUnicode_Check(fname)) {
        PyErr_SetString(PyExc_TypeError, "path must be a unicode object");
        return NULL;
    }
    _fname = poppler_convert_pystring(fname);
    /* The converter reports failure only through the Python error state. */
    if (PyErr_Occurred()) return NULL;

    opened = Poppler::Document::load(_fname);
    if (opened == NULL) {
        PyErr_SetString(PyExc_ValueError, "Could not load PDF file from path.");
        return NULL;
    }

    /* Free the previously loaded document instead of leaking it. */
    delete self->doc;
    self->doc = opened;
    Py_RETURN_NONE;
}
/* Shared implementation of the string-valued attribute getters.
 *
 * field selects the key passed to Poppler::Document::info():
 *   0 Title, 1 Author, 2 Subject, 3 Keywords, 4 Creator, 5 Producer.
 * Returns a new reference to a unicode object, or NULL with an exception set
 * (Exception for an unknown field, ValueError if no document is loaded). */
static PyObject *
poppler_PDFDoc_getter(poppler_PDFDoc *self, int field)
{
    const char *s;

    switch (field) {
        case 0:
            s = "Title"; break;
        case 1:
            s = "Author"; break;
        case 2:
            s = "Subject"; break;
        case 3:
            s = "Keywords"; break;
        case 4:
            s = "Creator"; break;
        case 5:
            s = "Producer"; break;
        default:
            PyErr_SetString(PyExc_Exception, "Bad field");
            return NULL;
    }

    /* doc stays NULL until load() or open() succeeds. */
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }

    /* poppler_convert_qstring() already returns a new reference; taking
     * another one here would leak the string on every attribute access. */
    return poppler_convert_qstring(self->doc->info(QString(s)));
}
/* Shared implementation of the attribute setters. Writing metadata is not
 * implemented, so every call fails. A setter that returns -1 must also set
 * an exception, otherwise the interpreter reports a SystemError. */
static int
poppler_PDFDoc_setter(poppler_PDFDoc *self, PyObject *val, int field) {
    PyErr_SetString(PyExc_AttributeError,
                    "Setting PDF metadata fields is not supported.");
    return -1;
}
/* Attribute getters for the PyGetSetDef table. Each one forwards to
 * poppler_PDFDoc_getter() with the numeric field code that function maps to
 * the named info key. The closure argument is unused. */
static PyObject *
poppler_PDFDoc_title_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 0;  /* "Title" */
    return poppler_PDFDoc_getter(self, field);
}
static PyObject *
poppler_PDFDoc_author_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 1;  /* "Author" */
    return poppler_PDFDoc_getter(self, field);
}
static PyObject *
poppler_PDFDoc_subject_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 2;  /* "Subject" */
    return poppler_PDFDoc_getter(self, field);
}
static PyObject *
poppler_PDFDoc_keywords_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 3;  /* "Keywords" */
    return poppler_PDFDoc_getter(self, field);
}
static PyObject *
poppler_PDFDoc_creator_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 4;  /* "Creator" */
    return poppler_PDFDoc_getter(self, field);
}
static PyObject *
poppler_PDFDoc_producer_getter(poppler_PDFDoc *self, void *closure) {
    const int field = 5;  /* "Producer" */
    return poppler_PDFDoc_getter(self, field);
}
/* Getter for the read-only `version` attribute.
 * Returns a new reference to a Python float holding the value of
 * Poppler::Document::pdfVersion(), or NULL with ValueError set if no
 * document is loaded. */
static PyObject *
poppler_PDFDoc_version_getter(poppler_PDFDoc *self, void *closure) {
    /* doc stays NULL until load() or open() succeeds. */
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    /* PyFloat_FromDouble() returns a new reference; an additional
     * Py_INCREF would leak the float. */
    return PyFloat_FromDouble(self->doc->pdfVersion());
}
/* Attribute setters for the PyGetSetDef table. Each one forwards to
 * poppler_PDFDoc_setter() with the same numeric field code its matching
 * getter uses. The closure argument is unused. */
static int
poppler_PDFDoc_title_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 0;
    return poppler_PDFDoc_setter(self, val, field);
}
static int
poppler_PDFDoc_author_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 1;
    return poppler_PDFDoc_setter(self, val, field);
}
static int
poppler_PDFDoc_subject_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 2;
    return poppler_PDFDoc_setter(self, val, field);
}
static int
poppler_PDFDoc_keywords_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 3;
    return poppler_PDFDoc_setter(self, val, field);
}
static int
poppler_PDFDoc_creator_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 4;
    return poppler_PDFDoc_setter(self, val, field);
}
static int
poppler_PDFDoc_producer_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
    const int field = 5;
    return poppler_PDFDoc_setter(self, val, field);
}
}
/* render_page(page_num, xdpi=166, ydpi=166): render one page to JPEG.
 *
 * Returns a new reference to a Python byte string holding the JPEG data, or
 * NULL with an exception set. Raises ValueError if no document is loaded,
 * the page number is out of range or the document is locked, and Exception
 * if the page cannot be obtained, rendered or encoded. */
static PyObject *
poppler_PDFDoc_render_page(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    QImage img;
    float xdpi = 166.0, ydpi = 166.0;
    Poppler::Page *page;
    QByteArray ba;
    QBuffer buffer(&ba);
    int num;

    if (!PyArg_ParseTuple(args, "i|ff", &num, &xdpi, &ydpi)) return NULL;

    /* doc stays NULL until load() or open() succeeds. */
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    if ( num < 0 || num >= self->doc->numPages()) {
        PyErr_SetString(PyExc_ValueError, "Invalid page number");
        return NULL;
    }
    if ( self->doc->isLocked()) {
        PyErr_SetString(PyExc_ValueError, "This document is copyrighted.");
        return NULL;
    }

    page = self->doc->page(num);
    if (page == NULL) {
        PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF");
        return NULL;
    }
    img = page->renderToImage(xdpi, ydpi);
    /* The caller owns the Page object returned by Document::page(); the
     * rendered image is an independent copy, so release the page now. */
    delete page;

    if (img.isNull()) {
        PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF");
        return NULL;
    }

    buffer.open(QIODevice::WriteOnly);
    if (!img.save(&buffer, "JPEG")) {
        PyErr_SetString(PyExc_Exception, "Failed to save rendered page");
        return NULL;
    }

    /* PyString_FromStringAndSize() returns a new reference; an additional
     * Py_INCREF would leak the whole image on every call. */
    return PyString_FromStringAndSize(ba.data(), ba.size());
}
/* Method table for PDFDoc instances: maps each Python-visible method name to
 * its C implementation and docstring. Every entry is registered with
 * METH_VARARGS; the {NULL} entry terminates the table. */
static PyMethodDef poppler_PDFDoc_methods[] = {
{"load", (PyCFunction)poppler_PDFDoc_load, METH_VARARGS,
"Load a PDF document from a byte buffer (string)"
},
{"open", (PyCFunction)poppler_PDFDoc_open, METH_VARARGS,
"Load a PDF document from a file path (string)"
},
{"render_page", (PyCFunction)poppler_PDFDoc_render_page, METH_VARARGS,
"render_page(page_num, xdpi=166, ydpi=166) -> Render a page to a JPEG image. Page numbers start from zero."
},
{NULL} /* Sentinel */
};
/* Getter for the read-only `pages` attribute.
 * Returns a new reference to a Python int holding the value of
 * Poppler::Document::numPages(), or NULL with ValueError set if no document
 * is loaded. */
static PyObject *
poppler_PDFDoc_pages_getter(poppler_PDFDoc *self, void *closure) {
    /* doc stays NULL until load() or open() succeeds. */
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    /* PyInt_FromLong() returns a new reference; an additional Py_INCREF
     * would leak it. */
    return PyInt_FromLong(static_cast<long>(self->doc->numPages()));
}
/* Attribute table for PDFDoc instances. Each entry is
 * {name, getter, setter, docstring, closure}. The six info fields are wired
 * to both a getter and a setter; `pages` and `version` have a NULL setter.
 * The closure slot is NULL throughout and the {NULL} entry terminates the
 * table. */
static PyGetSetDef poppler_PDFDoc_getsetters[] = {
{(char *)"title",
(getter)poppler_PDFDoc_title_getter, (setter)poppler_PDFDoc_title_setter,
(char *)"Document title",
NULL},
{(char *)"author",
(getter)poppler_PDFDoc_author_getter, (setter)poppler_PDFDoc_author_setter,
(char *)"Document author",
NULL},
{(char *)"subject",
(getter)poppler_PDFDoc_subject_getter, (setter)poppler_PDFDoc_subject_setter,
(char *)"Document subject",
NULL},
{(char *)"keywords",
(getter)poppler_PDFDoc_keywords_getter, (setter)poppler_PDFDoc_keywords_setter,
(char *)"Document keywords",
NULL},
{(char *)"creator",
(getter)poppler_PDFDoc_creator_getter, (setter)poppler_PDFDoc_creator_setter,
(char *)"Document creator",
NULL},
{(char *)"producer",
(getter)poppler_PDFDoc_producer_getter, (setter)poppler_PDFDoc_producer_setter,
(char *)"Document producer",
NULL},
{(char *)"pages",
(getter)poppler_PDFDoc_pages_getter, NULL,
(char *)"Number of pages in document (read only)",
NULL},
{(char *)"version",
(getter)poppler_PDFDoc_version_getter, NULL,
(char *)"The PDF version (read only)",
NULL},
{NULL} /* Sentinel */
};
/* Type object for calibre_poppler.PDFDoc. The initializer is positional, so
 * the order of the entries below must match the PyTypeObject slot order.
 * Only tp_dealloc, tp_flags, tp_doc, tp_methods, tp_getset and tp_new are
 * filled in; every other slot is left at 0. */
static PyTypeObject poppler_PDFDocType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"calibre_poppler.PDFDoc", /*tp_name*/
sizeof(poppler_PDFDoc), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)poppler_PDFDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
"PDF Documents", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
poppler_PDFDoc_methods, /* tp_methods */
0, /* tp_members */
poppler_PDFDoc_getsetters, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
poppler_PDFDoc_new, /* tp_new */
};
/* Module-level function table passed to Py_InitModule3(). It holds only the
 * terminating entry, so the module defines no free functions. */
static PyMethodDef poppler_methods[] = {
{NULL} /* Sentinel */
};
extern "C" {
/* Module initialisation entry point for `calibre_poppler`.
 * Finalises the PDFDoc type, creates the module and publishes the type on it
 * as `PDFDoc`. On any failure it returns early, leaving the Python exception
 * set by the failing call in place. */
PyMODINIT_FUNC
initcalibre_poppler(void)
{
    PyObject* m;

    if (PyType_Ready(&poppler_PDFDocType) < 0)
        return;

    m = Py_InitModule3("calibre_poppler", poppler_methods,
                       "Wrapper for the Poppler PDF library");
    /* Py_InitModule3() can fail; a NULL module must not be passed on. */
    if (m == NULL)
        return;

    /* PyModule_AddObject() steals a reference only when it succeeds, so the
     * reference taken here is given back if it fails. */
    Py_INCREF(&poppler_PDFDocType);
    if (PyModule_AddObject(m, "PDFDoc", (PyObject *)&poppler_PDFDocType) < 0) {
        Py_DECREF(&poppler_PDFDocType);
    }
}
}