From 3e9e6a63d784baa4f46203cd27dcf0ecdcfa0aa7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 10 May 2009 12:56:44 -0700 Subject: [PATCH] PDF Metadata: Switch to using PoDoFo to read/write PDF metadata. On Linux, calibre will fall back to pdftk and pypdf. Linux distributors: calibre will only try to build the podofo extension if it detects the podofo header files in the directory pointed to by PODOFO_INC_DIR, which defaults to /usr/include/podofo --- installer/linux/freeze.py | 1 + installer/osx/freeze.py | 5 ++ installer/windows/freeze.py | 8 +- pyqtdistutils.py | 38 +++++--- setup.py | 38 +++++--- src/calibre/constants.py | 2 +- src/calibre/ebooks/epub/from_any.py | 27 +++--- src/calibre/ebooks/metadata/pdf.py | 40 ++++++--- src/calibre/library/database2.py | 3 +- src/calibre/manual/faq.rst | 2 +- src/calibre/utils/podofo/__init__.py | 98 ++++++++++++++++++++ src/calibre/utils/podofo/podofo.sip | 128 +++++++++++++++++++++++++++ 12 files changed, 339 insertions(+), 51 deletions(-) create mode 100644 src/calibre/utils/podofo/__init__.py create mode 100644 src/calibre/utils/podofo/podofo.sip diff --git a/installer/linux/freeze.py b/installer/linux/freeze.py index 8524172a72..31f645dff1 100644 --- a/installer/linux/freeze.py +++ b/installer/linux/freeze.py @@ -31,6 +31,7 @@ def freeze(): '/usr/lib/libsqlite3.so.0', '/usr/lib/libsqlite3.so.0', '/usr/lib/libmng.so.1', + '/usr/lib/libpodofo.so.0.6.99', '/lib/libz.so.1', '/lib/libbz2.so.1', '/lib/libbz2.so.1', diff --git a/installer/osx/freeze.py b/installer/osx/freeze.py index 1861596f61..75621da017 100644 --- a/installer/osx/freeze.py +++ b/installer/osx/freeze.py @@ -229,6 +229,11 @@ _check_symlinks_prescript() all_modules = main_modules['console'] + main_modules['gui'] all_functions = main_functions['console'] + main_functions['gui'] print + print 'Adding PoDoFo' + pdf = glob.glob(os.path.expanduser('~/podofo/*.dylib'))[0] + shutil.copyfile(pdf, os.path.join(frameworks_dir, os.path.basename(pdf))) + + loader_path = 
os.path.join(resource_dir, 'loaders') if not os.path.exists(loader_path): os.mkdir(loader_path) diff --git a/installer/windows/freeze.py b/installer/windows/freeze.py index a4988c6703..f545f7e534 100644 --- a/installer/windows/freeze.py +++ b/installer/windows/freeze.py @@ -12,6 +12,7 @@ LIBUNRAR = 'C:\\Program Files\\UnrarDLL\\unrar.dll' PDFTOHTML = 'C:\\cygwin\\home\\kovid\\poppler-0.10.6\\rel\\pdftohtml.exe' IMAGEMAGICK_DIR = 'C:\\ImageMagick' PDFTK = 'C:\\pdftk.exe' +PODOFO = 'C:\\podofo' FONTCONFIG_DIR = 'C:\\fontconfig' VC90 = r'C:\VC90.CRT' @@ -101,8 +102,11 @@ class BuildEXE(py2exe.build_exe.py2exe): shutil.copyfile(PDFTOHTML, os.path.join(PY2EXE_DIR, os.path.basename(PDFTOHTML))) shutil.copyfile(PDFTOHTML+'.manifest', os.path.join(PY2EXE_DIR, os.path.basename(PDFTOHTML)+'.manifest')) - print '\tAdding pdftk' - shutil.copyfile(PDFTK, os.path.join(PY2EXE_DIR, os.path.basename(PDFTK))) + #print '\tAdding pdftk' + #shutil.copyfile(PDFTK, os.path.join(PY2EXE_DIR, os.path.basename(PDFTK))) + print 'Adding podofo' + for f in glob.glob(os.path.join(PODOFO, '*.dll')): + shutil.copyfile(f, os.path.join(PY2EXE_DIR, os.path.basename(f))) print '\tAdding ImageMagick' for f in os.listdir(IMAGEMAGICK_DIR): diff --git a/pyqtdistutils.py b/pyqtdistutils.py index 0e53aaabfe..b91b011bc1 100644 --- a/pyqtdistutils.py +++ b/pyqtdistutils.py @@ -80,13 +80,16 @@ CONFIG += x86 ppc os.chdir(cwd) def build_sbf(self, sip, sbf, bdir): - print '\tBuilding spf...' + print '\tBuilding sbf...' 
sip_bin = self.sipcfg.sip_bin + pyqt_sip_flags = [] + if hasattr(self, 'pyqtcfg'): + pyqt_sip_flags += ['-I', self.pyqtcfg.pyqt_sip_dir] + pyqt_sip_flags += self.pyqtcfg.pyqt_sip_flags.split() self.spawn([sip_bin, "-c", bdir, "-b", sbf, - '-I', self.pyqtcfg.pyqt_sip_dir, - ] + self.pyqtcfg.pyqt_sip_flags.split()+ + ] + pyqt_sip_flags + [sip]) def build_pyqt(self, bdir, sbf, ext, qtobjs, headers): @@ -94,9 +97,14 @@ CONFIG += x86 ppc build_file=sbf, dir=bdir, makefile='Makefile.pyqt', universal=OSX_SDK, qt=1) + makefile.extra_libs = ext.libraries + makefile.extra_lib_dirs = ext.library_dirs + makefile.extra_cxxflags = ext.extra_compile_args + if 'win32' in sys.platform: makefile.extra_lib_dirs += WINDOWS_PYTHON makefile.extra_include_dirs = list(set(map(os.path.dirname, headers))) + makefile.extra_include_dirs += ext.include_dirs makefile.extra_lflags += qtobjs makefile.generate() cwd = os.getcwd() @@ -110,7 +118,7 @@ CONFIG += x86 ppc def build_extension(self, ext): self.inplace = True # Causes extensions to be built in the source tree - + fullname = self.get_ext_fullname(ext.name) if self.inplace: # ignore build-lib -- put the compiled extension into @@ -127,14 +135,14 @@ CONFIG += x86 ppc else: ext_filename = os.path.join(self.build_lib, self.get_ext_filename(fullname)) - bdir = os.path.abspath(os.path.join(self.build_temp, fullname)) + bdir = os.path.abspath(os.path.join(self.build_temp, fullname)) if not os.path.exists(bdir): os.makedirs(bdir) - + if not isinstance(ext, PyQtExtension): if not iswindows: return _build_ext.build_extension(self, ext) - + c_sources = [f for f in ext.sources if os.path.splitext(f)[1].lower() in ('.c', '.cpp', '.cxx')] compile_args = '/c /nologo /Ox /MD /W3 /GX /DNDEBUG'.split() compile_args += ext.extra_compile_args @@ -147,7 +155,7 @@ CONFIG += x86 ppc objects.append(o) compiler = cc + ['/Tc'+f, '/Fo'+o] self.spawn(compiler) - out = os.path.join(bdir, base+'.pyd') + out = os.path.join(bdir, base+'.pyd') linker = [msvc.linker] + 
'/DLL /nologo /INCREMENTAL:NO'.split() linker += ['/LIBPATH:'+x for x in self.library_dirs] linker += [x+'.lib' for x in ext.libraries] @@ -156,9 +164,9 @@ CONFIG += x86 ppc for src in (out, out+'.manifest'): shutil.copyfile(src, os.path.join('src', 'calibre', 'plugins', os.path.basename(src))) return - - - + + + if not os.path.exists(bdir): os.makedirs(bdir) ext.sources2 = map(os.path.abspath, ext.sources) @@ -200,6 +208,14 @@ CONFIG += x86 ppc shutil.copyfile(mod, ext_filename) shutil.copymode(mod, ext_filename) + + if self.force or newer_group([mod], ext_filename, 'newer'): + if os.path.exists(ext_filename): + os.unlink(ext_filename) + shutil.copyfile(mod, ext_filename) + shutil.copymode(mod, ext_filename) + + def get_sip_output_list(self, sbf, bdir): """ Parse the sbf file specified to extract the name of the generated source diff --git a/setup.py b/setup.py index b0ff04a983..d7d0d6cb1c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import sys, re, os, shutil, cStringIO, tempfile, subprocess, time +import sys, re, os, subprocess sys.path.append('src') iswindows = re.search('win(32|64)', sys.platform) isosx = 'darwin' in sys.platform @@ -54,10 +54,28 @@ if __name__ == '__main__': build_osx, upload_installers, upload_user_manual, \ upload_to_pypi, stage3, stage2, stage1, upload, \ upload_rss - + entry_points['console_scripts'].append( 'calibre_postinstall = calibre.linux:post_install') - ext_modules = [ + optional = [] + + + podofo_inc = '/usr/include/podofo' if islinux else \ + 'C:\\podofo\\include\\podofo' if iswindows else \ + '/Users/kovid/podofo/include/podofo' + podofo_lib = '/usr/lib' if islinux else r'C:\podofo' if iswindows else \ + '/Users/kovid/podofo/lib' + if os.path.exists(os.path.join(podofo_inc, 'PdfString.h')): + eca = ['/EHsc'] if iswindows else [] + optional.append(PyQtExtension('calibre.plugins.podofo', [], + 
['src/calibre/utils/podofo/podofo.sip'], + libraries=['podofo'], extra_compile_args=eca, + library_dirs=[os.environ.get('PODOFO_LIB_DIR', podofo_lib)], + include_dirs=\ + [os.environ.get('PODOFO_INC_DIR', podofo_inc)])) + + ext_modules = optional + [ + Extension('calibre.plugins.lzx', sources=['src/calibre/utils/lzx/lzxmodule.c', 'src/calibre/utils/lzx/compressor.c', @@ -65,12 +83,12 @@ if __name__ == '__main__': 'src/calibre/utils/lzx/lzc.c', 'src/calibre/utils/lzx/lzxc.c'], include_dirs=['src/calibre/utils/lzx']), - + Extension('calibre.plugins.msdes', sources=['src/calibre/utils/msdes/msdesmodule.c', 'src/calibre/utils/msdes/des.c'], include_dirs=['src/calibre/utils/msdes']), - + PyQtExtension('calibre.plugins.pictureflow', ['src/calibre/gui2/pictureflow/pictureflow.cpp', 'src/calibre/gui2/pictureflow/pictureflow.h'], @@ -81,7 +99,7 @@ if __name__ == '__main__': ext_modules.append(Extension('calibre.plugins.winutil', sources=['src/calibre/utils/windows/winutil.c'], libraries=['shell32', 'setupapi'], - include_dirs=os.environ.get('INCLUDE', + include_dirs=os.environ.get('INCLUDE', 'C:/WinDDK/6001.18001/inc/api/;' 'C:/WinDDK/6001.18001/inc/crt/').split(';'), extra_compile_args=['/X'] @@ -91,7 +109,7 @@ if __name__ == '__main__': sources=['src/calibre/devices/usbobserver/usbobserver.c'], extra_link_args=['-framework', 'IOKit']) ) - + if not iswindows: plugins = ['plugins/%s.so'%(x.name.rpartition('.')[-1]) for x in ext_modules] else: @@ -99,7 +117,7 @@ if __name__ == '__main__': ['plugins/%s.pyd.manifest'%(x.name.rpartition('.')[-1]) \ for x in ext_modules if 'pictureflow' not in x.name] - + setup( name = APPNAME, packages = find_packages('src'), @@ -152,9 +170,9 @@ if __name__ == '__main__': 'Topic :: System :: Hardware :: Hardware Drivers' ], cmdclass = { - 'build_ext' : build_ext, + 'build_ext' : build_ext, 'build' : build, - 'build_py' : build_py, + 'build_py' : build_py, 'pot' : pot, 'manual' : manual, 'resources' : resources, diff --git 
a/src/calibre/constants.py b/src/calibre/constants.py index cf4c6a28f5..d5958712f1 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -53,7 +53,7 @@ if plugins is None: plugin_path = getattr(pkg_resources, 'resource_filename')('calibre', 'plugins') sys.path.insert(0, plugin_path) - for plugin in ['pictureflow', 'lzx', 'msdes'] + \ + for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo'] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): try: diff --git a/src/calibre/ebooks/epub/from_any.py b/src/calibre/ebooks/epub/from_any.py index 9a8e251108..a3e266991f 100644 --- a/src/calibre/ebooks/epub/from_any.py +++ b/src/calibre/ebooks/epub/from_any.py @@ -80,7 +80,7 @@ def fb22opf(path, tdir, opts): from calibre.ebooks.lrf.fb2.convert_from import to_html print 'Converting FB2 to HTML...' return to_html(path, tdir) - + def rtf2opf(path, tdir, opts): from calibre.ebooks.lrf.rtf.convert_from import generate_html generate_html(path, tdir) @@ -89,6 +89,7 @@ def rtf2opf(path, tdir, opts): def txt2opf(path, tdir, opts): from calibre.ebooks.lrf.txt.convert_from import generate_html generate_html(path, opts.encoding, tdir) + opts.encoding = 'utf-8' return os.path.join(tdir, 'metadata.opf') def pdf2opf(path, tdir, opts): @@ -110,11 +111,11 @@ def epub2opf(path, tdir, opts): if opf and os.path.exists(encfile): if not process_encryption(encfile, opf): raise DRMError(os.path.basename(path)) - + if opf is None: raise ValueError('%s is not a valid EPUB file'%path) return opf - + def odt2epub(path, tdir, opts): from calibre.ebooks.odt.to_oeb import Extract opts.encoding = 'utf-8' @@ -132,13 +133,13 @@ MAP = { 'epub' : epub2opf, 'odt' : odt2epub, } -SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', +SOURCE_FORMATS = ['lit', 'mobi', 'prc', 'azw', 'fb2', 'odt', 'rtf', 'txt', 'pdf', 'rar', 'zip', 'oebzip', 'htm', 'html', 'epub'] def unarchive(path, tdir): extract(path, tdir) files = list(walk(tdir)) - + for ext in ['opf'] + 
list(MAP.keys()): for f in files: if f.lower().endswith('.'+ext): @@ -147,32 +148,32 @@ def unarchive(path, tdir): return f, ext return find_html_index(files) -def any2epub(opts, path, notification=None, create_epub=True, +def any2epub(opts, path, notification=None, create_epub=True, oeb_cover=False, extract_to=None): path = run_plugins_on_preprocess(path) ext = os.path.splitext(path)[1] if not ext: raise ValueError('Unknown file type: '+path) ext = ext.lower()[1:] - + if opts.output is None: opts.output = os.path.splitext(os.path.basename(path))[0]+'.epub' - + with nested(TemporaryDirectory('_any2epub1'), TemporaryDirectory('_any2epub2')) as (tdir1, tdir2): if ext in ['rar', 'zip', 'oebzip']: path, ext = unarchive(path, tdir1) print 'Found %s file in archive'%(ext.upper()) - + if ext in MAP.keys(): path = MAP[ext](path, tdir2, opts) ext = 'opf' - - + + if re.match(r'((x){0,1}htm(l){0,1})|opf', ext) is None: raise ValueError('Conversion from %s is not supported'%ext.upper()) - + print 'Creating EPUB file...' 
- html2epub(path, opts, notification=notification, + html2epub(path, opts, notification=notification, create_epub=create_epub, oeb_cover=oeb_cover, extract_to=extract_to) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index b6bf425d9f..20ba98ff54 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -6,13 +6,35 @@ __copyright__ = '2008, Kovid Goyal ' import sys, os, cStringIO from threading import Thread -from calibre import FileWrapper from calibre.ebooks.metadata import MetaInformation, authors_to_string, get_parser -from pyPdf import PdfFileReader, PdfFileWriter from calibre.utils.pdftk import set_metadata as pdftk_set_metadata +from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ + set_metadata as podofo_set_metadata + def get_metadata(stream): + try: + return podofo_get_metadata(stream) + except: + return get_metadata_pypdf(stream) + +def set_metadata(stream, mi): + stream.seek(0) + try: + return podofo_set_metadata(stream, mi) + except: + pass + try: + return pdftk_set_metadata(stream, mi) + except: + pass + set_metadata_pypdf(stream, mi) + + +def get_metadata_pypdf(stream): """ Return metadata as a L{MetaInfo} object """ + from pyPdf import PdfFileReader + from calibre import FileWrapper mi = MetaInformation(_('Unknown'), [_('Unknown')]) stream.seek(0) try: @@ -48,18 +70,12 @@ class MetadataWriter(Thread): except RuntimeError: pass -def set_metadata(stream, mi): - stream.seek(0) - try: - pdftk_set_metadata(stream, mi) - except: - pass - else: - return - +def set_metadata_pypdf(stream, mi): # Use a StringIO object for the pdf because we will want to over # write it later and if we are working on the stream directly it # could cause some issues. 
+ + from pyPdf import PdfFileReader, PdfFileWriter raw = cStringIO.StringIO(stream.read()) orig_pdf = PdfFileReader(raw) @@ -73,7 +89,7 @@ def set_metadata(stream, mi): out_pdf.addPage(page) writer.start() - writer.join(15) # Wait 15 secs for writing to complete + writer.join(10) # Wait 10 secs for writing to complete out_pdf.killed = True writer.join() if out_pdf.killed: diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index bc94d4faa3..4247e0cad3 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -402,7 +402,8 @@ class LibraryDatabase2(LibraryDatabase): def get_property(idx, index_is_id=False, loc=-1): row = self.data._data[idx] if index_is_id else self.data[idx] - return row[loc] + if row is not None: + return row[loc] for prop in ('author_sort', 'authors', 'comment', 'comments', 'isbn', 'publisher', 'rating', 'series', 'series_index', 'tags', diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index 0c02ef2925..717effd455 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -220,7 +220,7 @@ Post any output you see in a help message on the `Forum +using namespace PoDoFo; +%End +%ConvertFromTypeCode + if (sipCpp -> IsValid()) { + std::string raw = sipCpp->GetStringUtf8(); + return PyString_FromStringAndSize(raw.c_str(), raw.length()); + } else return PyString_FromString(""); +%End +%ConvertToTypeCode + if (sipIsErr == NULL) { + if (sipIsErr == NULL) + return (PyUnicode_Check(sipPy) || PyString_Check(sipPy)); + } + if (sipPy == Py_None) { + *sipCppPtr = NULL; + return 0; + } + if (PyString_Check(sipPy)) { + *sipCppPtr = new PdfString((pdf_utf8 *)PyString_AS_STRING(sipPy)); + return sipGetState(sipTransferObj); + } + if (PyUnicode_Check(sipPy)) { + Py_UNICODE* u = PyUnicode_AS_UNICODE(sipPy); + PyObject *u8 = PyUnicode_EncodeUTF8(u, PyUnicode_GET_SIZE(sipPy), "replace"); + pdf_utf8 *s8 = (pdf_utf8 *)PyString_AS_STRING(u8); + *sipCppPtr = new PdfString(s8); + 
return sipGetState(sipTransferObj); + } + *sipCppPtr = (PdfString *)sipForceConvertTo_PdfString(sipPy,sipIsErr); + return 1; +%End +}; + +class PdfObject { +%TypeHeaderCode +#define USING_SHARED_PODOFO +#include +using namespace PoDoFo; +%End + public: + PdfObject(); + +}; + +class PdfInfo { +%TypeHeaderCode +#define USING_SHARED_PODOFO +#include +using namespace PoDoFo; +%End + public: + PdfInfo(PdfObject *); + + PdfString GetAuthor() const; + PdfString GetSubject() const; + PdfString GetTitle() const; + PdfString GetKeywords() const; + PdfString GetCreator() const; + PdfString GetProducer() const; + + void SetAuthor(PdfString &); + void SetSubject(PdfString &); + void SetTitle(PdfString &); + void SetKeywords(PdfString &); + void SetCreator(PdfString &); + void SetProducer(PdfString &); + +}; + +class PdfOutputDevice { +%TypeHeaderCode +#define USING_SHARED_PODOFO +#include +using namespace PoDoFo; +%End + public: + PdfOutputDevice(char *, long); + unsigned long GetLength(); + unsigned long Tell(); + void Flush(); +}; + + +class PdfMemDocument { +%TypeHeaderCode +#define USING_SHARED_PODOFO +#include +using namespace PoDoFo; +%End + public: + PdfMemDocument(); + + void Load(const char *filename); + void Load(const char *buffer, long size); + void Write(const char *filename); + PdfInfo *GetInfo() const; + + protected: + void SetInfo(PdfInfo * /TransferThis/); + + private: + PdfMemDocument(PdfMemDocument &); + +}; + + +%Exception PoDoFo::PdfError /PyName=PdfError/ +{ +%TypeHeaderCode +#define USING_SHARED_PODOFO +#include +%End +%RaiseCode + const char *detail = sipExceptionRef.what(); + + SIP_BLOCK_THREADS + PyErr_SetString(sipException_PoDoFo_PdfError, detail); + SIP_UNBLOCK_THREADS +%End +}; +