PDF Metadata: Switch to using PoDoFo to read/write PDF metadata. On Linux, calibre will fall back to pdftk and pypdf. Linux distributors: calibre will only try to build the podofo extension if it detects the podofo header files in the directory pointed to by PODOFO_INC_DIR, which defaults to /usr/include/podofo

This commit is contained in:
Kovid Goyal 2009-05-10 12:56:44 -07:00
parent 16a11369a5
commit 3e9e6a63d7
12 changed files with 339 additions and 51 deletions

View File

@ -31,6 +31,7 @@ def freeze():
'/usr/lib/libsqlite3.so.0',
'/usr/lib/libsqlite3.so.0',
'/usr/lib/libmng.so.1',
'/usr/lib/libpodofo.so.0.6.99',
'/lib/libz.so.1',
'/lib/libbz2.so.1',
'/lib/libbz2.so.1',

View File

@ -229,6 +229,11 @@ _check_symlinks_prescript()
all_modules = main_modules['console'] + main_modules['gui']
all_functions = main_functions['console'] + main_functions['gui']
print
print 'Adding PoDoFo'
pdf = glob.glob(os.path.expanduser('~/podofo/*.dylib'))[0]
shutil.copyfile(pdf, os.path.join(frameworks_dir, os.path.basename(pdf)))
loader_path = os.path.join(resource_dir, 'loaders')
if not os.path.exists(loader_path):
os.mkdir(loader_path)

View File

@ -12,6 +12,7 @@ LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll'
PDFTOHTML = 'C:\\cygwin\\home\\kovid\\poppler-0.10.6\\rel\\pdftohtml.exe'
IMAGEMAGICK_DIR = 'C:\\ImageMagick'
PDFTK = 'C:\\pdftk.exe'
PODOFO = 'C:\\podofo'
FONTCONFIG_DIR = 'C:\\fontconfig'
VC90 = r'C:\VC90.CRT'
@ -101,8 +102,11 @@ class BuildEXE(py2exe.build_exe.py2exe):
shutil.copyfile(PDFTOHTML, os.path.join(PY2EXE_DIR, os.path.basename(PDFTOHTML)))
shutil.copyfile(PDFTOHTML+'.manifest', os.path.join(PY2EXE_DIR,
os.path.basename(PDFTOHTML)+'.manifest'))
print '\tAdding pdftk'
shutil.copyfile(PDFTK, os.path.join(PY2EXE_DIR, os.path.basename(PDFTK)))
#print '\tAdding pdftk'
#shutil.copyfile(PDFTK, os.path.join(PY2EXE_DIR, os.path.basename(PDFTK)))
print 'Adding podofo'
for f in glob.glob(os.path.join(PODOFO, '*.dll')):
shutil.copyfile(f, os.path.join(PY2EXE_DIR, os.path.basename(f)))
print '\tAdding ImageMagick'
for f in os.listdir(IMAGEMAGICK_DIR):

View File

@ -80,13 +80,16 @@ CONFIG += x86 ppc
os.chdir(cwd)
def build_sbf(self, sip, sbf, bdir):
print '\tBuilding spf...'
print '\tBuilding sbf...'
sip_bin = self.sipcfg.sip_bin
pyqt_sip_flags = []
if hasattr(self, 'pyqtcfg'):
pyqt_sip_flags += ['-I', self.pyqtcfg.pyqt_sip_dir]
pyqt_sip_flags += self.pyqtcfg.pyqt_sip_flags.split()
self.spawn([sip_bin,
"-c", bdir,
"-b", sbf,
'-I', self.pyqtcfg.pyqt_sip_dir,
] + self.pyqtcfg.pyqt_sip_flags.split()+
] + pyqt_sip_flags +
[sip])
def build_pyqt(self, bdir, sbf, ext, qtobjs, headers):
@ -94,9 +97,14 @@ CONFIG += x86 ppc
build_file=sbf, dir=bdir,
makefile='Makefile.pyqt',
universal=OSX_SDK, qt=1)
makefile.extra_libs = ext.libraries
makefile.extra_lib_dirs = ext.library_dirs
makefile.extra_cxxflags = ext.extra_compile_args
if 'win32' in sys.platform:
makefile.extra_lib_dirs += WINDOWS_PYTHON
makefile.extra_include_dirs = list(set(map(os.path.dirname, headers)))
makefile.extra_include_dirs += ext.include_dirs
makefile.extra_lflags += qtobjs
makefile.generate()
cwd = os.getcwd()
@ -110,7 +118,7 @@ CONFIG += x86 ppc
def build_extension(self, ext):
self.inplace = True # Causes extensions to be built in the source tree
fullname = self.get_ext_fullname(ext.name)
if self.inplace:
# ignore build-lib -- put the compiled extension into
@ -127,14 +135,14 @@ CONFIG += x86 ppc
else:
ext_filename = os.path.join(self.build_lib,
self.get_ext_filename(fullname))
bdir = os.path.abspath(os.path.join(self.build_temp, fullname))
bdir = os.path.abspath(os.path.join(self.build_temp, fullname))
if not os.path.exists(bdir):
os.makedirs(bdir)
if not isinstance(ext, PyQtExtension):
if not iswindows:
return _build_ext.build_extension(self, ext)
c_sources = [f for f in ext.sources if os.path.splitext(f)[1].lower() in ('.c', '.cpp', '.cxx')]
compile_args = '/c /nologo /Ox /MD /W3 /GX /DNDEBUG'.split()
compile_args += ext.extra_compile_args
@ -147,7 +155,7 @@ CONFIG += x86 ppc
objects.append(o)
compiler = cc + ['/Tc'+f, '/Fo'+o]
self.spawn(compiler)
out = os.path.join(bdir, base+'.pyd')
out = os.path.join(bdir, base+'.pyd')
linker = [msvc.linker] + '/DLL /nologo /INCREMENTAL:NO'.split()
linker += ['/LIBPATH:'+x for x in self.library_dirs]
linker += [x+'.lib' for x in ext.libraries]
@ -156,9 +164,9 @@ CONFIG += x86 ppc
for src in (out, out+'.manifest'):
shutil.copyfile(src, os.path.join('src', 'calibre', 'plugins', os.path.basename(src)))
return
if not os.path.exists(bdir):
os.makedirs(bdir)
ext.sources2 = map(os.path.abspath, ext.sources)
@ -200,6 +208,14 @@ CONFIG += x86 ppc
shutil.copyfile(mod, ext_filename)
shutil.copymode(mod, ext_filename)
if self.force or newer_group([mod], ext_filename, 'newer'):
if os.path.exists(ext_filename):
os.unlink(ext_filename)
shutil.copyfile(mod, ext_filename)
shutil.copymode(mod, ext_filename)
def get_sip_output_list(self, sbf, bdir):
"""
Parse the sbf file specified to extract the name of the generated source

View File

@ -2,7 +2,7 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, re, os, shutil, cStringIO, tempfile, subprocess, time
import sys, re, os, subprocess
sys.path.append('src')
iswindows = re.search('win(32|64)', sys.platform)
isosx = 'darwin' in sys.platform
@ -54,10 +54,28 @@ if __name__ == '__main__':
build_osx, upload_installers, upload_user_manual, \
upload_to_pypi, stage3, stage2, stage1, upload, \
upload_rss
entry_points['console_scripts'].append(
'calibre_postinstall = calibre.linux:post_install')
ext_modules = [
# Extensions that are only built when their third-party dependency is found.
optional = []
# Per-platform guesses for where PoDoFo lives. Packagers can override both
# through the PODOFO_INC_DIR / PODOFO_LIB_DIR environment variables.
podofo_inc = '/usr/include/podofo' if islinux else \
        'C:\\podofo\\include\\podofo' if iswindows else \
        '/Users/kovid/podofo/include/podofo'
podofo_lib = '/usr/lib' if islinux else r'C:\podofo' if iswindows else \
        '/Users/kovid/podofo/lib'
# Resolve the overrides *before* probing for the headers, so that the
# directory that gets checked is the same one that is handed to the compiler.
podofo_inc = os.environ.get('PODOFO_INC_DIR', podofo_inc)
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
# PdfString.h is used as the marker that the PoDoFo headers are installed;
# without it the extension is silently left out of the build.
if os.path.exists(os.path.join(podofo_inc, 'PdfString.h')):
    # Extra compiler switch passed only on Windows builds.
    eca = ['/EHsc'] if iswindows else []
    optional.append(PyQtExtension('calibre.plugins.podofo', [],
        ['src/calibre/utils/podofo/podofo.sip'],
        libraries=['podofo'], extra_compile_args=eca,
        library_dirs=[podofo_lib],
        include_dirs=[podofo_inc]))
ext_modules = optional + [
Extension('calibre.plugins.lzx',
sources=['src/calibre/utils/lzx/lzxmodule.c',
'src/calibre/utils/lzx/compressor.c',
@ -65,12 +83,12 @@ if __name__ == '__main__':
'src/calibre/utils/lzx/lzc.c',
'src/calibre/utils/lzx/lzxc.c'],
include_dirs=['src/calibre/utils/lzx']),
Extension('calibre.plugins.msdes',
sources=['src/calibre/utils/msdes/msdesmodule.c',
'src/calibre/utils/msdes/des.c'],
include_dirs=['src/calibre/utils/msdes']),
PyQtExtension('calibre.plugins.pictureflow',
['src/calibre/gui2/pictureflow/pictureflow.cpp',
'src/calibre/gui2/pictureflow/pictureflow.h'],
@ -81,7 +99,7 @@ if __name__ == '__main__':
ext_modules.append(Extension('calibre.plugins.winutil',
sources=['src/calibre/utils/windows/winutil.c'],
libraries=['shell32', 'setupapi'],
include_dirs=os.environ.get('INCLUDE',
include_dirs=os.environ.get('INCLUDE',
'C:/WinDDK/6001.18001/inc/api/;'
'C:/WinDDK/6001.18001/inc/crt/').split(';'),
extra_compile_args=['/X']
@ -91,7 +109,7 @@ if __name__ == '__main__':
sources=['src/calibre/devices/usbobserver/usbobserver.c'],
extra_link_args=['-framework', 'IOKit'])
)
if not iswindows:
plugins = ['plugins/%s.so'%(x.name.rpartition('.')[-1]) for x in ext_modules]
else:
@ -99,7 +117,7 @@ if __name__ == '__main__':
['plugins/%s.pyd.manifest'%(x.name.rpartition('.')[-1]) \
for x in ext_modules if 'pictureflow' not in x.name]
setup(
name = APPNAME,
packages = find_packages('src'),
@ -152,9 +170,9 @@ if __name__ == '__main__':
'Topic :: System :: Hardware :: Hardware Drivers'
],
cmdclass = {
'build_ext' : build_ext,
'build_ext' : build_ext,
'build' : build,
'build_py' : build_py,
'build_py' : build_py,
'pot' : pot,
'manual' : manual,
'resources' : resources,

View File

@ -53,7 +53,7 @@ if plugins is None:
plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins')
sys.path.insert(0, plugin_path)
for plugin in ['pictureflow', 'lzx', 'msdes'] + \
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo'] + \
(['winutil'] if iswindows else []) + \
(['usbobserver'] if isosx else []):
try:

View File

@ -80,7 +80,7 @@ def fb22opf(path, tdir, opts):
from calibre.ebooks.lrf.fb2.convert_from import to_html
print 'Converting FB2 to HTML...'
return to_html(path, tdir)
def rtf2opf(path, tdir, opts):
from calibre.ebooks.lrf.rtf.convert_from import generate_html
generate_html(path, tdir)
@ -89,6 +89,7 @@ def rtf2opf(path, tdir, opts):
# Convert the plain text file at `path` by calling generate_html with `tdir`,
# and return the path of the 'metadata.opf' file inside `tdir`.
# Side effect: opts.encoding is overwritten (see below).
def txt2opf(path, tdir, opts):
from calibre.ebooks.lrf.txt.convert_from import generate_html
# The caller-supplied encoding is used to read the source text.
generate_html(path, opts.encoding, tdir)
# Later stages see 'utf-8' instead of the source encoding; presumably the
# generated HTML is written as UTF-8 -- TODO confirm against generate_html.
opts.encoding = 'utf-8'
return os.path.join(tdir, 'metadata.opf')
def pdf2opf(path, tdir, opts):
@ -110,11 +111,11 @@ def epub2opf(path, tdir, opts):
if opf and os.path.exists(encfile):
if not process_encryption(encfile, opf):
raise DRMError(os.path.basename(path))
if opf is None:
raise ValueError('%s is not a valid EPUB file'%path)
return opf
def odt2epub(path, tdir, opts):
from calibre.ebooks.odt.to_oeb import Extract
opts.encoding = 'utf-8'
@ -132,13 +133,13 @@ MAP = {
'epub' : epub2opf,
'odt' : odt2epub,
}
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf',
'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub']
def unarchive(path, tdir):
extract(path, tdir)
files = list(walk(tdir))
for ext in ['opf'] + list(MAP.keys()):
for f in files:
if f.lower().endswith('.'+ext):
@ -147,32 +148,32 @@ def unarchive(path, tdir):
return f, ext
return find_html_index(files)
def any2epub(opts, path, notification=None, create_epub=True,
def any2epub(opts, path, notification=None, create_epub=True,
oeb_cover=False, extract_to=None):
path = run_plugins_on_preprocess(path)
ext = os.path.splitext(path)[1]
if not ext:
raise ValueError('Unknown file type: '+path)
ext = ext.lower()[1:]
if opts.output is None:
opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub'
with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2):
if ext in ['rar', 'zip', 'oebzip']:
path, ext = unarchive(path, tdir1)
print 'Found %s file in archive'%(ext.upper())
if ext in MAP.keys():
path = MAP[ext](path, tdir2, opts)
ext = 'opf'
if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None:
raise ValueError('Conversion from %s is not supported'%ext.upper())
print 'Creating EPUB file...'
html2epub(path, opts, notification=notification,
html2epub(path, opts, notification=notification,
create_epub=create_epub, oeb_cover=oeb_cover,
extract_to=extract_to)

View File

@ -6,13 +6,35 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, cStringIO
from threading import Thread
from calibre import FileWrapper
from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser
from pyPdf import PdfFileReader, PdfFileWriter
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
set_metadata as podofo_set_metadata
def get_metadata(stream):
    """
    Read metadata from the PDF in `stream`.

    The podofo based reader is tried first. If it raises for any reason, the
    pyPdf based reader is used instead. Returns whatever the reader that
    succeeded returns.
    """
    try:
        return podofo_get_metadata(stream)
    except Exception:
        # Deliberately broad: any podofo failure should trigger the fallback.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being mistaken for a podofo error.
        return get_metadata_pypdf(stream)
def set_metadata(stream, mi):
    """
    Write the metadata in `mi` into the PDF held in `stream`.

    Backends are tried in order: podofo, then pdftk, then pyPdf. The first
    one that does not raise wins and its return value is returned. The pyPdf
    backend is the last resort, so its exceptions propagate to the caller.
    """
    for backend in (podofo_set_metadata, pdftk_set_metadata):
        # A backend that failed may already have consumed the stream, so
        # rewind before every attempt, not just the first one.
        stream.seek(0)
        try:
            return backend(stream, mi)
        except Exception:
            # Best effort: fall through to the next backend. KeyboardInterrupt
            # and SystemExit are intentionally not swallowed.
            pass
    stream.seek(0)
    set_metadata_pypdf(stream, mi)
def get_metadata_pypdf(stream):
""" Return metadata as a L{MetaInfo} object """
from pyPdf import PdfFileReader
from calibre import FileWrapper
mi = MetaInformation(_('Unknown'), [_('Unknown')])
stream.seek(0)
try:
@ -48,18 +70,12 @@ class MetadataWriter(Thread):
except RuntimeError:
pass
def set_metadata(stream, mi):
stream.seek(0)
try:
pdftk_set_metadata(stream, mi)
except:
pass
else:
return
def set_metadata_pypdf(stream, mi):
# Use a StringIO object for the pdf because we will want to over
# write it later and if we are working on the stream directly it
# could cause some issues.
from pyPdf import PdfFileReader, PdfFileWriter
raw = cStringIO.StringIO(stream.read())
orig_pdf = PdfFileReader(raw)
@ -73,7 +89,7 @@ def set_metadata(stream, mi):
out_pdf.addPage(page)
writer.start()
writer.join(15) # Wait 15 secs for writing to complete
writer.join(10) # Wait 10 secs for writing to complete
out_pdf.killed = True
writer.join()
if out_pdf.killed:

View File

@ -402,7 +402,8 @@ class LibraryDatabase2(LibraryDatabase):
def get_property(idx, index_is_id=False, loc=-1):
row = self.data._data[idx] if index_is_id else self.data[idx]
return row[loc]
if row is not None:
return row[loc]
for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn',
'publisher', 'rating', 'series', 'series_index', 'tags',

View File

@ -220,7 +220,7 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
My antivirus program claims |app| is a virus/trojan?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from.
Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.
I want some feature added to |app|. What can I do?

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.constants import plugins, preferred_encoding
from calibre.ebooks.metadata import MetaInformation, string_to_authors, \
authors_to_string
# The plugin entry unpacks to a pair: `podofo` is tested for truthiness before
# use, and `podofo_err` is what gets reported when that test fails.
podofo, podofo_err = plugins['podofo']
# Raised by get_metadata/set_metadata in this module when `podofo` is falsy.
class Unavailable(Exception): pass
def get_metadata(stream):
    """
    Read title, authors and creator from the PDF in `stream` using podofo.

    :param stream: A seekable file-like object containing the PDF. It is
                   left positioned at the start of the data.
    :return: A `MetaInformation` object. The title falls back to the base
             name of `stream.name` (or 'Unknown' when the stream has no
             name), and the authors fall back to ['Unknown'].
    :raises Unavailable: If the podofo plugin could not be loaded.
    """
    if not podofo:
        raise Unavailable(podofo_err)
    # Always parse the whole document, wherever the caller left the stream.
    stream.seek(0)
    raw = stream.read()
    stream.seek(0)
    p = podofo.PdfMemDocument()
    p.Load(raw, len(raw))
    info = p.GetInfo()
    # The binding hands strings back as UTF-8 encoded byte strings.
    title = info.GetTitle().decode('utf-8').strip()
    if not title:
        title = getattr(stream, 'name', _('Unknown'))
        title = os.path.splitext(os.path.basename(title))[0]
    author = info.GetAuthor().decode('utf-8').strip()
    authors = string_to_authors(author) if author else [_('Unknown')]
    mi = MetaInformation(title, authors)
    creator = info.GetCreator().decode('utf-8').strip()
    if creator:
        mi.book_producer = creator
    return mi
def prep(val):
    """
    Normalise a metadata value to stripped unicode text.

    Falsy values become the empty unicode string. Byte strings are decoded
    with the preferred encoding, replacing undecodable bytes.
    """
    if val:
        text = val if isinstance(val, unicode) else \
                val.decode(preferred_encoding, 'replace')
        return text.strip()
    return u''
def set_metadata(stream, mi):
    """
    Write title, authors and book producer from `mi` into the PDF in `stream`.

    Only non-empty fields are written. If none of the three fields has a
    value, the stream contents are left unmodified.

    :param stream: A seekable, writable file-like object containing the PDF.
                   When the document is rewritten, the stream is truncated,
                   refilled and rewound to the start.
    :param mi: Object providing `title`, `authors` and `book_producer`.
    :raises Unavailable: If the podofo plugin could not be loaded.
    """
    if not podofo:
        raise Unavailable(podofo_err)
    # Always operate on the whole document, wherever the stream was left.
    stream.seek(0)
    raw = stream.read()
    p = podofo.PdfMemDocument()
    p.Load(raw, len(raw))
    info = p.GetInfo()
    touched = False

    title = prep(mi.title)
    if title:
        info.SetTitle(title)
        touched = True

    author = prep(authors_to_string(mi.authors))
    if author:
        info.SetAuthor(author)
        touched = True

    bkp = prep(mi.book_producer)
    if bkp:
        info.SetCreator(bkp)
        touched = True

    if not touched:
        return

    p.SetInfo(info)
    from calibre.ptempfile import TemporaryFile
    # The binding exposes Write() with a file name argument, so the document
    # is round-tripped through a temporary file.
    with TemporaryFile('_pdf_set_metadata.pdf') as f:
        p.Write(f)
        with open(f, 'rb') as src:
            raw = src.read()
    stream.seek(0)
    stream.truncate()
    stream.write(raw)
    stream.flush()
    stream.seek(0)
if __name__ == '__main__':
f = '/tmp/t.pdf'
import StringIO
stream = StringIO.StringIO(open(f).read())
mi = get_metadata(open(f))
print
print 'Original metadata:'
print mi
mi.title = 'Test title'
mi.authors = ['Test author', 'author2']
mi.book_producer = 'calibre'
set_metadata(stream, mi)
open('/tmp/x.pdf', 'wb').write(stream.getvalue())
print
print 'New pdf written to /tmp/x.pdf'

View File

@ -0,0 +1,128 @@
%Module podofo 0
// Maps PoDoFo::PdfString to and from Python strings, so that no PdfString
// wrapper class has to be exposed to Python.
%MappedType PdfString
{
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfString.h>
using namespace PoDoFo;
%End

%ConvertFromTypeCode
    // C++ -> Python: return the text as a UTF-8 encoded byte string. An
    // invalid PdfString is returned as the empty string.
    if (sipCpp->IsValid()) {
        std::string raw = sipCpp->GetStringUtf8();
        return PyString_FromStringAndSize(raw.c_str(), raw.length());
    }
    return PyString_FromString("");
%End

%ConvertToTypeCode
    // When sipIsErr is NULL only a type check is requested: report whether
    // the Python object could be converted, without converting it.
    if (sipIsErr == NULL)
        return (PyUnicode_Check(sipPy) || PyString_Check(sipPy));

    if (sipPy == Py_None) {
        *sipCppPtr = NULL;
        return 0;
    }

    // Byte strings are passed through as-is and treated as UTF-8.
    if (PyString_Check(sipPy)) {
        *sipCppPtr = new PdfString((pdf_utf8 *)PyString_AS_STRING(sipPy));
        return sipGetState(sipTransferObj);
    }

    if (PyUnicode_Check(sipPy)) {
        PyObject *u8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(sipPy),
                PyUnicode_GET_SIZE(sipPy), "replace");
        if (u8 == NULL) {
            // Encoding failed; a Python exception is already set.
            *sipIsErr = 1;
            return 0;
        }
        *sipCppPtr = new PdfString((pdf_utf8 *)PyString_AS_STRING(u8));
        // Drop the temporary encoded string so it is not leaked on every
        // conversion. NOTE(review): relies on PdfString copying the buffer
        // it is constructed from -- confirm against the PoDoFo docs.
        Py_DECREF(u8);
        return sipGetState(sipTransferObj);
    }

    // Fallback kept from the original code for objects that are neither
    // str nor unicode; not expected to be reached after the type check.
    *sipCppPtr = (PdfString *)sipForceConvertTo_PdfString(sipPy,sipIsErr);
    return 1;
%End
};
// Minimal binding for PdfObject: only the default constructor is exposed.
// The type is needed because the PdfInfo constructor takes a PdfObject *.
class PdfObject {
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfObject.h>
using namespace PoDoFo;
%End
public:
PdfObject();
};
// Binding for PdfInfo, the holder of the document information fields
// (author, subject, title, keywords, creator, producer).
// All PdfString values cross the Python boundary through the PdfString
// mapped type, i.e. as Python strings.
class PdfInfo {
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfInfo.h>
using namespace PoDoFo;
%End
public:
PdfInfo(PdfObject *);
// Getters: one per information field.
PdfString GetAuthor() const;
PdfString GetSubject() const;
PdfString GetTitle() const;
PdfString GetKeywords() const;
PdfString GetCreator() const;
PdfString GetProducer() const;
// Setters: one per information field.
void SetAuthor(PdfString &);
void SetSubject(PdfString &);
void SetTitle(PdfString &);
void SetKeywords(PdfString &);
void SetCreator(PdfString &);
void SetProducer(PdfString &);
};
// Binding for PdfOutputDevice.
class PdfOutputDevice {
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfOutputDevice.h>
using namespace PoDoFo;
%End
public:
// NOTE(review): presumably a caller supplied buffer and its size -- confirm
// against the PoDoFo PdfOutputDevice documentation.
PdfOutputDevice(char *, long);
unsigned long GetLength();
unsigned long Tell();
void Flush();
};
// Binding for PdfMemDocument: load a PDF, access its PdfInfo and write it
// back out to a file.
class PdfMemDocument {
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfMemDocument.h>
using namespace PoDoFo;
%End
public:
PdfMemDocument();
// Two ways to load: from a file name, or from a buffer plus its size.
void Load(const char *filename);
void Load(const char *buffer, long size);
void Write(const char *filename);
PdfInfo *GetInfo() const;
protected:
// /TransferThis/ is SIP's ownership-transfer annotation for the argument.
// NOTE(review): presumably ownership of the PdfInfo passes to this
// document -- confirm against the SIP annotation reference.
void SetInfo(PdfInfo * /TransferThis/);
private:
// Copy constructor declared private.
// NOTE(review): presumably so SIP never tries to copy a document -- confirm.
PdfMemDocument(PdfMemDocument &);
};
// Exposes the C++ exception PoDoFo::PdfError to Python under the name PdfError.
%Exception PoDoFo::PdfError /PyName=PdfError/
{
%TypeHeaderCode
#define USING_SHARED_PODOFO
#include <PdfError.h>
%End
%RaiseCode
// The message of the Python exception is the text returned by what().
const char *detail = sipExceptionRef.what();
// The Python error is set between SIP's block/unblock thread macros.
SIP_BLOCK_THREADS
PyErr_SetString(sipException_PoDoFo_PdfError, detail);
SIP_UNBLOCK_THREADS
%End
};