From 5a94e3d965a368537ec03da345e9782a34f88105 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Sep 2009 21:17:38 -0600 Subject: [PATCH 1/8] IGN:Initial implementation of PDF->XML engine --- setup/build_environment.py | 112 +++++++-- setup/commands.py | 5 +- setup/extensions.py | 67 ++++-- setup/install.py | 5 +- setup/installer/linux/freeze.py | 1 + src/calibre/constants.py | 2 +- src/calibre/customize/ui.py | 3 + src/calibre/debug.py | 7 + src/calibre/ebooks/metadata/pdf.py | 48 +++- src/calibre/gui2/convert/debug.ui | 18 +- src/calibre/utils/poppler/__init__.py | 63 ----- src/calibre/utils/poppler/poppler.cpp | 329 -------------------------- 12 files changed, 227 insertions(+), 433 deletions(-) delete mode 100644 src/calibre/utils/poppler/__init__.py delete mode 100644 src/calibre/utils/poppler/poppler.cpp diff --git a/setup/build_environment.py b/setup/build_environment.py index 1523ec0c62..b39df4e58d 100644 --- a/setup/build_environment.py +++ b/setup/build_environment.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import os, socket, struct +import os, socket, struct, subprocess from distutils.spawn import find_executable from PyQt4 import pyqtconfig @@ -42,6 +42,39 @@ elif find_executable('qmake'): QMAKE = find_executable('qmake') QMAKE = os.environ.get('QMAKE', QMAKE) +PKGCONFIG = find_executable('pkg-config') +PKGCONFIG = os.environ.get('PKG_CONFIG', PKGCONFIG) + +def run_pkgconfig(name, envvar, default, flag, prefix): + ans = [] + if envvar: + ans = os.environ.get(envvar, default) + ans = [x.strip() for x in ans.split(os.pathsep)] + ans = [x for x in ans if x and (prefix=='-l' or os.path.exists(x))] + if not ans: + try: + raw = subprocess.Popen([PKGCONFIG, flag, name], + stdout=subprocess.PIPE).stdout.read() + ans = [x.strip() for x in raw.split(prefix)] + ans = [x for x in ans if x and (prefix=='-l' or os.path.exists(x))] + except: + print 'Failed to run pkg-config:', PKGCONFIG, 'for:', name + + return ans + +def pkgconfig_include_dirs(name, envvar, default): + return run_pkgconfig(name, envvar, default, '--cflags-only-I', '-I') + +def pkgconfig_lib_dirs(name, envvar, default): + return run_pkgconfig(name, envvar, default,'--libs-only-L', '-L') + +def pkgconfig_libs(name, envvar, default): + return run_pkgconfig(name, envvar, default,'--libs-only-l', '-l') + +def consolidate(envvar, default): + val = os.environ.get(envvar, default) + ans = [x.strip() for x in val.split(os.pathsep())] + return [x for x in ans if x and os.path.exists(x)] pyqt = pyqtconfig.Configuration() @@ -50,28 +83,62 @@ qt_lib = pyqt.qt_lib_dir fc_inc = '/usr/include/fontconfig' fc_lib = '/usr/lib' -poppler_inc = '/usr/include/poppler/qt4' -poppler_lib = '/usr/lib' -poppler_libs = [] podofo_inc = '/usr/include/podofo' podofo_lib = '/usr/lib' if iswindows: fc_inc = r'C:\cygwin\home\kovid\fontconfig\include\fontconfig' fc_lib = r'C:\cygwin\home\kovid\fontconfig\lib' - poppler_inc = r'C:\cygwin\home\kovid\poppler\include\poppler\qt4' - poppler_lib = r'C:\cygwin\home\kovid\poppler\lib' - poppler_libs = ['QtCore4', 'QtGui4'] + poppler_inc_dirs = consolidate('POPPLER_INC_DIR', + r'C:\cygwin\home\kovid\poppler\include\poppler') + popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+r'\qt4'] + poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', + r'C:\cygwin\home\kovid\poppler\lib') + popplerqt4_lib_dirs = poppler_lib_dirs + poppler_libs = ['poppler'] + popplerqt4_libs = poppler_libs + ['QtCore4', 'QtGui4'] podofo_inc = 'C:\\podofo\\include\\podofo' podofo_lib = r'C:\podofo' - -if isosx: +elif isosx: fc_inc = '/Users/kovid/fontconfig/include/fontconfig' fc_lib = '/Users/kovid/fontconfig/lib' - poppler_inc = '/Volumes/sw/build/poppler-0.10.7/qt4/src' - poppler_lib = '/Users/kovid/poppler/lib' + poppler_inc_dirs = consolidate('POPPLER_INC_DIR', + '/Volumes/sw/build/poppler-0.10.7/poppler') + popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4'] + poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', + '/Users/kovid/poppler/lib') + popplerqt4_lib_dirs = poppler_lib_dirs + poppler_libs = popplerqt4_libs = ['poppler'] podofo_inc = '/usr/local/include/podofo' podofo_lib = '/usr/local/lib' +else: + # Include directories + poppler_inc_dirs = pkgconfig_include_dirs('poppler', + 'POPPLER_INC_DIR', '/usr/include/poppler') + popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '') + if not popplerqt4_inc_dirs: + popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4'] + png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR', + '/usr/include') + magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick') + + # Library directories + poppler_lib_dirs = popplerqt4_lib_dirs = pkgconfig_lib_dirs('poppler', 'POPPLER_LIB_DIR', + '/usr/lib') + png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib') + magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib') + + # Libraries + poppler_libs = pkgconfig_libs('poppler', '', '') + if not poppler_libs: + poppler_libs = ['poppler'] + popplerqt4_libs = pkgconfig_libs('poppler-qt4', '', '') + if not popplerqt4_libs: + popplerqt4_libs = ['poppler-qt4', 'poppler'] + magick_libs = pkgconfig_libs('MagickWand', '', '') + if not magick_libs: + magick_libs = ['MagickWand', 'MagickCore'] + png_libs = ['png'] fc_inc = os.environ.get('FC_INC_DIR', fc_inc) @@ -82,14 +149,27 @@ fc_error = None if os.path.exists(os.path.join(fc_inc, 'fontconfig.h')) else \ 'variables.') -poppler_inc = os.environ.get('POPPLER_INC_DIR', poppler_inc) -poppler_lib = os.environ.get('POPPLER_LIB_DIR', poppler_lib) -poppler_error = None if os.path.exists(os.path.join(poppler_inc, - 'poppler-qt4.h')) else \ +poppler_error = None +if not poppler_inc_dirs or not os.path.exists( + os.path.join(poppler_inc_dirs[0], 'OutputDev.h')): + poppler_error = \ ('Poppler not found on your system. Various PDF related', ' functionality will not work. Use the POPPLER_INC_DIR and', ' POPPLER_LIB_DIR environment variables.') +popplerqt4_error = None +if not popplerqt4_inc_dirs or not os.path.exists( + os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')): + popplerqt4_error = \ + ('Poppler Qt4 bindings not found on your system.') + +magick_error = None +if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0], + 'wand')): + magick_error = ('ImageMagick not found on your system. ' + 'Try setting the environment variables MAGICK_INC ' + 'and MAGICK_LIB to help calibre locate the inclue and libbrary ' + 'files.') podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) podofo_inc = os.environ.get('PODOFO_INC_DIR', podofo_inc) @@ -116,3 +196,5 @@ except: HOST='unknown' PROJECT=os.path.basename(os.path.abspath('.')) + + diff --git a/setup/commands.py b/setup/commands.py index 6fada9ab51..6fb593bd21 100644 --- a/setup/commands.py +++ b/setup/commands.py @@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en' __all__ = [ 'pot', 'translations', 'get_translations', 'iso639', - 'build', + 'build', 'build_pdf2xml', 'gui', 'develop', 'install', 'resources', @@ -30,8 +30,9 @@ translations = Translations() get_translations = GetTranslations() iso639 = ISO639() -from setup.extensions import Build +from setup.extensions import Build, BuildPDF2XML build = Build() +build_pdf2xml = BuildPDF2XML() from setup.install import Develop, Install, Sdist develop = Develop() diff --git a/setup/extensions.py b/setup/extensions.py index 67918255fa..1f50593bb8 100644 --- a/setup/extensions.py +++ b/setup/extensions.py @@ -12,10 +12,12 @@ from distutils import sysconfig from PyQt4.pyqtconfig import QtGuiModuleMakefile from setup import Command, islinux, isosx, SRC, iswindows -from setup.build_environment import fc_inc, fc_lib, qt_inc, qt_lib, \ - fc_error, poppler_libs, poppler_lib, poppler_inc, podofo_inc, \ +from setup.build_environment import fc_inc, fc_lib, \ + fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \ podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \ - leopard_build, QMAKE, msvc, MT, win_inc, win_lib + leopard_build, QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, \ + magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \ + magick_error, magick_libs MT isunix = islinux or isosx @@ -43,6 +45,10 @@ class Extension(object): self.ldflags = kwargs.get('ldflags', []) self.optional = kwargs.get('optional', False) +reflow_sources = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.cpp')) +reflow_headers = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.h')) +reflow_error = poppler_error if poppler_error else magick_error + extensions = [ Extension('lzx', ['calibre/utils/lzx/lzxmodule.c', @@ -76,15 +82,6 @@ extensions = [ Extension('cPalmdoc', ['calibre/ebooks/compression/palmdoc.c']), - Extension('calibre_poppler', - ['calibre/utils/poppler/poppler.cpp'], - libraries=(['poppler', 'poppler-qt4']+poppler_libs), - lib_dirs=[os.environ.get('POPPLER_LIB_DIR', - poppler_lib), qt_lib], - inc_dirs=[poppler_inc, qt_inc], - error=poppler_error, - optional=True), - Extension('podofo', ['calibre/utils/podofo/podofo.cpp'], libraries=['podofo'], @@ -97,10 +94,20 @@ extensions = [ inc_dirs = ['calibre/gui2/pictureflow'], headers = ['calibre/gui2/pictureflow/pictureflow.h'], sip_files = ['calibre/gui2/pictureflow/pictureflow.sip'] - ) + ), + Extension('pdfreflow', + reflow_sources, + headers=reflow_headers, + libraries=poppler_libs+magick_libs+png_libs, + lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs, + inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs, + error=reflow_error, + cflags=['-DPNG_SKIP_SETJMP_CHECK'] if islinux else [] + ) ] + if iswindows: extensions.append(Extension('winutil', ['calibre/utils/windows/winutil.c'], @@ -346,10 +353,36 @@ class Build(Command): - - - - +class BuildPDF2XML(Command): + + description = 'Build command line pdf2xml utility' + + def run(self, opts): + dest = os.path.expanduser('~/bin/pdf2xml') + odest = self.j(self.d(self.SRC), 'build', 'objects', 'pdf2xml') + if not os.path.exists(odest): + os.makedirs(odest) + + objects = [] + for src in reflow_sources: + if src.endswith('python.cpp'): + continue + obj = self.j(odest, self.b(src+'.o')) + if self.newer(obj, [src]+reflow_headers): + cmd = ['g++', '-pthread', '-pedantic', '-g', '-c', '-Wall', '-I/usr/include/poppler', + '-I/usr/include/ImageMagick', + '-DPDF2XML', '-o', obj, src] + self.info(*cmd) + subprocess.check_call(cmd) + objects.append(obj) + + if self.newer(dest, objects): + cmd = ['g++', '-g', '-o', dest]+objects+['-lpoppler', '-lMagickWand', + '-lpng', '-lpthread'] + self.info(*cmd) + subprocess.check_call(cmd) + + self.info('Binary installed as', dest) diff --git a/setup/install.py b/setup/install.py index 03b6b85b6c..d9f5f3e4fe 100644 --- a/setup/install.py +++ b/setup/install.py @@ -192,6 +192,10 @@ class Install(Develop): x = self.j(dest, x) if os.path.exists(dest): shutil.rmtree(x) + for x in os.walk(dest): + for f in x[-1]: + if os.path.splitext(f)[1] in ('.c', '.cpp', '.h'): + os.remove(self.j(x[0], f)) dest = self.root + self.resources if os.path.exists(dest): shutil.rmtree(dest) @@ -241,4 +245,3 @@ class Sdist(Command): os.remove(self.DEST) - diff --git a/setup/installer/linux/freeze.py b/setup/installer/linux/freeze.py index c29505f3c5..abf51a9750 100644 --- a/setup/installer/linux/freeze.py +++ b/setup/installer/linux/freeze.py @@ -38,6 +38,7 @@ class LinuxFreeze(Command): binary_includes = [ '/usr/bin/pdftohtml', '/usr/lib/libwmflite-0.2.so.7', + '/usr/lib/liblcms.so.1', '/tmp/calibre-mount-helper', '/usr/lib/libunrar.so', '/usr/lib/libsqlite3.so.0', diff --git a/src/calibre/constants.py b/src/calibre/constants.py index aa9d3a7236..a68cfdb9d3 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -55,7 +55,7 @@ if plugins is None: sys.path.insert(0, plugin_path) for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc', - 'fontconfig', 'calibre_poppler'] + \ + 'fontconfig', 'pdfreflow'] + \ (['winutil'] if iswindows else []) + \ (['usbobserver'] if isosx else []): try: diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 0c40e7c8da..4b2aa18a2b 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -161,6 +161,7 @@ quick_metadata = QuickMetadata() def get_file_type_metadata(stream, ftype): mi = MetaInformation(None, None) + ftype = ftype.lower().strip() if _metadata_readers.has_key(ftype): for plugin in _metadata_readers[ftype]: @@ -168,6 +169,8 @@ def get_file_type_metadata(stream, ftype): with plugin: try: plugin.quick = quick_metadata.quick + if hasattr(stream, 'seek'): + stream.seek(0) mi = plugin.get_metadata(stream, ftype.lower().strip()) break except: diff --git a/src/calibre/debug.py b/src/calibre/debug.py index d1e97efe2a..d2fb72a393 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -10,6 +10,7 @@ import sys, os, re, shutil from calibre.utils.config import OptionParser from calibre.constants import iswindows, isosx from calibre.libunzip import update +from calibre import prints def option_parser(): parser = OptionParser(usage='''\ @@ -28,6 +29,8 @@ Run an embedded python interpreter. help='Debug the specified device driver.') parser.add_option('-g', '--gui', default=False, action='store_true', help='Run the GUI',) + parser.add_option('--paths', default=False, action='store_true', + help='Output the paths necessary to setup the calibre environment') parser.add_option('--migrate', action='store_true', default=False, help='Migrate old database. Needs two arguments. Path ' 'to library1.db and path to new library folder.') @@ -203,6 +206,10 @@ def main(args=sys.argv): migrate(args[1], args[2]) elif opts.add_simple_plugin is not None: add_simple_plugin(opts.add_simple_plugin) + elif opts.paths: + prints('CALIBRE_RESOURCES_LOCATION='+sys.resources_location) + prints('CALIBRE_EXTENSIONS_LOCATION='+sys.extensions_location) + prints('CALIBRE_PYTHON_PATH='+os.pathsep.join(sys.path)) else: from IPython.Shell import IPShellEmbed ipshell = IPShellEmbed() diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index 3881f65c63..e11197e4fe 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -3,6 +3,52 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' '''Read meta information from PDF files''' +from functools import partial + +from calibre import plugins, prints +from calibre.ebooks.metadata import MetaInformation, string_to_authors#, authors_to_string + +pdfreflow, pdfreflow_error = plugins['pdfreflow'] + +def get_metadata(stream, cover=True): + if pdfreflow is None: + raise RuntimeError(pdfreflow_error) + info = pdfreflow.get_metadata(stream.read(), cover) + title = info.get('Title', None) + au = info.get('Author', None) + if au is None: + au = [_('Unknown')] + else: + au = string_to_authors(au) + mi = MetaInformation(title, au) + + creator = info.get('Creator', None) + if creator: + mi.book_producer = creator + + keywords = info.get('Keywords', None) + mi.tags = [] + if keywords: + mi.tags = [x.strip() for x in keywords.split(',')] + + subject = info.get('Subject', None) + if subject: + mi.tags.insert(0, subject) + + if cover and 'cover' in info: + data = info['cover'] + if data is None: + prints(title, 'is an encrypted document, cover extraction not allowed.') + else: + mi.cover_data = ('png', data) + + return mi + + + +get_quick_metadata = partial(get_metadata, cover=False) + +''' import sys, os, cStringIO from threading import Thread @@ -139,6 +185,6 @@ def get_cover(cover_path): MagickSetImageFormat(wand, 'JPEG') MagickWriteImage(wand, '%s.jpg' % cover_path) return open('%s.jpg' % cover_path, 'rb').read() - +''' diff --git a/src/calibre/gui2/convert/debug.ui b/src/calibre/gui2/convert/debug.ui index 1f651cf057..27d2c6fef0 100644 --- a/src/calibre/gui2/convert/debug.ui +++ b/src/calibre/gui2/convert/debug.ui @@ -40,12 +40,12 @@ ... - + :/images/document_open.svg:/images/document_open.svg - + Qt::Vertical @@ -64,15 +64,25 @@ ... - + :/images/clear_left.svg:/images/clear_left.svg + + + + The debug process outputs the intermediate HTML generated at various stages of the conversion process. This HTML can sometimes serve as a good starting point for hand editing a conversion. + + + true + + + - + diff --git a/src/calibre/utils/poppler/__init__.py b/src/calibre/utils/poppler/__init__.py deleted file mode 100644 index 47d357b933..0000000000 --- a/src/calibre/utils/poppler/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - -import os - -from calibre.constants import plugins -from calibre.ebooks.metadata import MetaInformation, string_to_authors - -poppler, poppler_err = plugins['calibre_poppler'] - -class NotAvailable(Exception): - pass - -def get_metadata(stream, cover=True): - if not poppler: - raise NotAvailable('Failed to load poppler with error: '+poppler_err) - raw = stream.read() - doc = poppler.PDFDoc() - doc.load(raw) - del raw - title = doc.title - if not title or not title.strip(): - title = _('Unknown') - if hasattr(stream, 'name'): - title = os.path.splitext(os.path.basename(stream.name))[0] - author = doc.author - authors = string_to_authors(author) if author else [_('Unknown')] - creator = doc.creator - mi = MetaInformation(title, authors) - - if creator: - mi.book_producer = creator - - if doc.subject: - mi.category = doc.subject - - if doc.keywords: - mi.tags = [x.strip() for x in doc.keywords.split(',')] - - if cover: - from calibre.gui2 import is_ok_to_use_qt - cdata = None - if is_ok_to_use_qt(): - - try: - cdata = doc.render_page(0) - except: - import traceback - traceback.print_exc() - - if cdata is not None: - mi.cover_data = ('jpg', cdata) - del doc - return mi - - - - diff --git a/src/calibre/utils/poppler/poppler.cpp b/src/calibre/utils/poppler/poppler.cpp deleted file mode 100644 index b64536c85d..0000000000 --- a/src/calibre/utils/poppler/poppler.cpp +++ /dev/null @@ -1,329 +0,0 @@ -#define UNICODE -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include - -typedef struct { - PyObject_HEAD - /* Type-specific fields go here. */ - Poppler::Document *doc; - -} poppler_PDFDoc; - -extern "C" { -static void -poppler_PDFDoc_dealloc(poppler_PDFDoc* self) -{ - if (self->doc != NULL) delete self->doc; - self->ob_type->tp_free((PyObject*)self); -} - -static PyObject * -poppler_PDFDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds) -{ - poppler_PDFDoc *self; - - self = (poppler_PDFDoc *)type->tp_alloc(type, 0); - if (self != NULL) { - self->doc = NULL; - } - - return (PyObject *)self; -} - -static PyObject * -poppler_PDFDoc_load(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) { - char *buffer; Py_ssize_t size; QByteArray data; - - if (!PyArg_ParseTuple(args, "s#", &buffer, &size)) return NULL; - - data = QByteArray::fromRawData(buffer, size); - self->doc = Poppler::Document::loadFromData(data); - if (self->doc == NULL) {PyErr_SetString(PyExc_ValueError, "Could not load PDF file from data."); return NULL;} - Py_RETURN_NONE; -} -} -static QString -poppler_convert_pystring(PyObject *py) { - QString ans; - Py_UNICODE* u = PyUnicode_AS_UNICODE(py); - PyObject *u8 = PyUnicode_EncodeUTF8(u, PyUnicode_GET_SIZE(py), "replace"); - if (u8 == NULL) { PyErr_NoMemory(); return NULL; } - ans = QString::fromUtf8(PyString_AS_STRING(u8)); - Py_DECREF(u8); - return ans; -} -extern "C" { -static PyObject * -poppler_convert_qstring(const QString &src) { - QByteArray data = src.toUtf8(); - const char *cdata = data.constData(); - int sz = data.size(); - return PyUnicode_Decode(cdata, sz, "utf-8", "error"); -} - - -static PyObject * -poppler_PDFDoc_open(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) { - PyObject *fname; QString _fname; - if (!PyArg_ParseTuple(args, "O", &fname)) return NULL; - _fname = poppler_convert_pystring(fname); - self->doc = Poppler::Document::load(_fname); - Py_RETURN_NONE; -} - -static PyObject * -poppler_PDFDoc_getter(poppler_PDFDoc *self, int field) -{ - PyObject *ans; - const char *s; - switch (field) { - case 0: - s = "Title"; break; - case 1: - s = "Author"; break; - case 2: - s = "Subject"; break; - case 3: - s = "Keywords"; break; - case 4: - s = "Creator"; break; - case 5: - s = "Producer"; break; - default: - PyErr_SetString(PyExc_Exception, "Bad field"); - return NULL; - } - ans = poppler_convert_qstring(self->doc->info(QString(s))); - if (ans != NULL) Py_INCREF(ans); - return ans; - -} - -static int -poppler_PDFDoc_setter(poppler_PDFDoc *self, PyObject *val, int field) { - return -1; -} - -static PyObject * -poppler_PDFDoc_title_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 0); -} -static PyObject * -poppler_PDFDoc_author_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 1); -} -static PyObject * -poppler_PDFDoc_subject_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 2); -} -static PyObject * -poppler_PDFDoc_keywords_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 3); -} -static PyObject * -poppler_PDFDoc_creator_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 4); -} -static PyObject * -poppler_PDFDoc_producer_getter(poppler_PDFDoc *self, void *closure) { - return poppler_PDFDoc_getter(self, 5); -} -static PyObject * -poppler_PDFDoc_version_getter(poppler_PDFDoc *self, void *closure) { - PyObject *ans = PyFloat_FromDouble(self->doc->pdfVersion()); - if (ans != NULL) Py_INCREF(ans); - return ans; -} - - -static int -poppler_PDFDoc_title_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 0); -} -static int -poppler_PDFDoc_author_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 1); -} -static int -poppler_PDFDoc_subject_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 2); -} -static int -poppler_PDFDoc_keywords_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 3); -} -static int -poppler_PDFDoc_creator_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 4); -} -static int -poppler_PDFDoc_producer_setter(poppler_PDFDoc *self, PyObject *val, void *closure) { - return poppler_PDFDoc_setter(self, val, 5); -} -} - -static PyObject * -poppler_PDFDoc_render_page(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) { - QImage img; - float xdpi = 166.0, ydpi = 166.0; - Poppler::Page *page; - QByteArray ba; - PyObject *ans = NULL; - QBuffer buffer(&ba); - int num; - - if (!PyArg_ParseTuple(args, "i|ff", &num, &xdpi, &ydpi)) return ans; - if ( self->doc->isLocked()) { - PyErr_SetString(PyExc_ValueError, "This document is copyrighted."); - return ans; - } - - if ( num < 0 || num >= self->doc->numPages()) { - PyErr_SetString(PyExc_ValueError, "Invalid page number"); - return ans; - } - - page = self->doc->page(num); - img = page->renderToImage(xdpi, ydpi); - if (img.isNull()) { - PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF"); - return ans; - } - buffer.open(QIODevice::WriteOnly); - if (!img.save(&buffer, "JPEG")) { - PyErr_SetString(PyExc_Exception, "Failed to save rendered page"); - return ans; - } - ans = PyString_FromStringAndSize(ba.data(), ba.size()); - if (ans != NULL) { Py_INCREF(ans); } - return ans; -} - -static PyMethodDef poppler_PDFDoc_methods[] = { - {"load", (PyCFunction)poppler_PDFDoc_load, METH_VARARGS, - "Load a PDF document from a byte buffer (string)" - }, - {"open", (PyCFunction)poppler_PDFDoc_open, METH_VARARGS, - "Load a PDF document from a file path (string)" - }, - {"render_page", (PyCFunction)poppler_PDFDoc_render_page, METH_VARARGS, - "render_page(page_num, xdpi=166, ydpi=166) -> Render a page to a JPEG image. Page numbers start from zero." - }, - {NULL} /* Sentinel */ -}; - -static PyObject * -poppler_PDFDoc_pages_getter(poppler_PDFDoc *self, void *closure) { - int pages = self->doc->numPages(); - PyObject *ans = PyInt_FromLong(static_cast(pages)); - if (ans != NULL) Py_INCREF(ans); - return ans; -} - -static PyGetSetDef poppler_PDFDoc_getsetters[] = { - {(char *)"title", - (getter)poppler_PDFDoc_title_getter, (setter)poppler_PDFDoc_title_setter, - (char *)"Document title", - NULL}, - {(char *)"author", - (getter)poppler_PDFDoc_author_getter, (setter)poppler_PDFDoc_author_setter, - (char *)"Document author", - NULL}, - {(char *)"subject", - (getter)poppler_PDFDoc_subject_getter, (setter)poppler_PDFDoc_subject_setter, - (char *)"Document subject", - NULL}, - {(char *)"keywords", - (getter)poppler_PDFDoc_keywords_getter, (setter)poppler_PDFDoc_keywords_setter, - (char *)"Document keywords", - NULL}, - {(char *)"creator", - (getter)poppler_PDFDoc_creator_getter, (setter)poppler_PDFDoc_creator_setter, - (char *)"Document creator", - NULL}, - {(char *)"producer", - (getter)poppler_PDFDoc_producer_getter, (setter)poppler_PDFDoc_producer_setter, - (char *)"Document producer", - NULL}, - {(char *)"pages", - (getter)poppler_PDFDoc_pages_getter, NULL, - (char *)"Number of pages in document (read only)", - NULL}, - {(char *)"version", - (getter)poppler_PDFDoc_version_getter, NULL, - (char *)"The PDF version (read only)", - NULL}, - - {NULL} /* Sentinel */ -}; - - - -static PyTypeObject poppler_PDFDocType = { - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "calibre_poppler.PDFDoc", /*tp_name*/ - sizeof(poppler_PDFDoc), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)poppler_PDFDoc_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "PDF Documents", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - poppler_PDFDoc_methods, /* tp_methods */ - 0, /* tp_members */ - poppler_PDFDoc_getsetters, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - poppler_PDFDoc_new, /* tp_new */ -}; - - - -static PyMethodDef poppler_methods[] = { - {NULL} /* Sentinel */ -}; - -extern "C" { - -PyMODINIT_FUNC -initcalibre_poppler(void) -{ - PyObject* m; - - if (PyType_Ready(&poppler_PDFDocType) < 0) - return; - - m = Py_InitModule3("calibre_poppler", poppler_methods, - "Wrapper for the Poppler PDF library"); - - Py_INCREF(&poppler_PDFDocType); - PyModule_AddObject(m, "PDFDoc", (PyObject *)&poppler_PDFDocType); -} -} From d0865b25fbba025c8de96dac571abe05ea7b3e4c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 21 Sep 2009 21:28:37 -0600 Subject: [PATCH 2/8] IGN:New PDF engine can be accessed via calibre-debug --pdfreflow --- src/calibre/debug.py | 8 + src/calibre/ebooks/pdf/fonts.cpp | 143 +++++ src/calibre/ebooks/pdf/fonts.h | 105 ++++ src/calibre/ebooks/pdf/images.cpp | 289 ++++++++++ src/calibre/ebooks/pdf/images.h | 94 +++ src/calibre/ebooks/pdf/links.cpp | 56 ++ src/calibre/ebooks/pdf/links.h | 69 +++ src/calibre/ebooks/pdf/main.cpp | 150 +++++ src/calibre/ebooks/pdf/reflow.cpp | 911 ++++++++++++++++++++++++++++++ src/calibre/ebooks/pdf/reflow.h | 238 ++++++++ src/calibre/ebooks/pdf/reflow.py | 127 +++++ src/calibre/ebooks/pdf/utils.h | 48 ++ 12 files changed, 2238 insertions(+) create mode 100644 src/calibre/ebooks/pdf/fonts.cpp create mode 100644 src/calibre/ebooks/pdf/fonts.h create mode 100644 src/calibre/ebooks/pdf/images.cpp create mode 100644 src/calibre/ebooks/pdf/images.h create mode 100644 src/calibre/ebooks/pdf/links.cpp create mode 100644 src/calibre/ebooks/pdf/links.h create mode 100644 src/calibre/ebooks/pdf/main.cpp create mode 100644 src/calibre/ebooks/pdf/reflow.cpp create mode 100644 src/calibre/ebooks/pdf/reflow.h create mode 100644 src/calibre/ebooks/pdf/reflow.py create mode 100644 src/calibre/ebooks/pdf/utils.h diff --git a/src/calibre/debug.py b/src/calibre/debug.py index d2fb72a393..d9912e61d8 100644 --- a/src/calibre/debug.py +++ b/src/calibre/debug.py @@ -38,6 +38,9 @@ Run an embedded python interpreter. help='Add a simple plugin (i.e. a plugin that consists of only a ' '.py file), by specifying the path to the py file containing the ' 'plugin code.') + parser.add_option('--pdfreflow', default=None, + help='Path to PDF file to try and reflow. Output will be placed in ' + 'current directory. ') return parser @@ -210,6 +213,11 @@ def main(args=sys.argv): prints('CALIBRE_RESOURCES_LOCATION='+sys.resources_location) prints('CALIBRE_EXTENSIONS_LOCATION='+sys.extensions_location) prints('CALIBRE_PYTHON_PATH='+os.pathsep.join(sys.path)) + elif opts.pdfreflow: + from calibre.ebooks.pdf.reflow import option_parser as px, run + from calibre.utils.logging import default_log + opts2, args = px().parse_args(['xxxx', '-vvvv', opts.pdfreflow]) + run(opts2, opts.pdfreflow, default_log) else: from IPython.Shell import IPShellEmbed ipshell = IPShellEmbed() diff --git a/src/calibre/ebooks/pdf/fonts.cpp b/src/calibre/ebooks/pdf/fonts.cpp new file mode 100644 index 0000000000..43de8646a9 --- /dev/null +++ b/src/calibre/ebooks/pdf/fonts.cpp @@ -0,0 +1,143 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + + + +#include "fonts.h" +#include "utils.h" + +using namespace calibre_reflow; +using namespace std; + +XMLColor::XMLColor(GfxRGB rgb) { + this->r = static_cast(rgb.r/65535.0*255.0); + this->g = static_cast(rgb.g/65535.0*255.0); + this->b = static_cast(rgb.b/65535.0*255.0); + if (!(this->ok(this->r) && this->ok(this->b) && this->ok(this->g))) { + this->r = 0; this->g = 0; this->b = 0; + } +} + +string XMLColor::str() const { + ostringstream oss; + oss << "rgb(" << this->r << "," << this->g << "," << this->b << ")"; + return oss.str(); +} + +static const char *FONT_MODS[7] = { + "-bolditalic", "-boldoblique", "-bold", "-italic", "-oblique", "-roman", + NULL +}; + +#define ap_toupper(c) (toupper(((unsigned char)(c)))) +static inline +char *strcasestr( char *h, char *n ) +{ /* h="haystack", n="needle" */ + char *a=h, *e=n; + + if( !h || !*h || !n || !*n ) { return 0; } + + while( *a && *e ) { + if( ap_toupper(*a)!=ap_toupper(*e) ) { + ++h; a=h; e=n; + } + else { + ++a; ++e; + } + } + return *e ? 0 : h; +} + +static string* family_name(const string *font_name) { + if (!font_name) return NULL; + string *fn = new string(*font_name); + size_t pos; + const char *p; + for (size_t i = 0; FONT_MODS[i] != NULL; i++) { + p = strcasestr(fn->c_str(), FONT_MODS[i]); + if (p != NULL) { + pos = p - fn->c_str(); + fn->replace(pos, strlen(FONT_MODS[i]), ""); + break; + } + } + return fn; +} + +XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) : + size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name), + font_family(NULL), color(rgb) { + + if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY); + this->font_family = family_name(this->font_name); + if (strcasestr(font_name->c_str(), "bold")) this->bold = true; + + if (strcasestr(font_name->c_str(),"italic")|| + strcasestr(font_name->c_str(),"oblique")) this->italic = true; + + +} + +XMLFont& XMLFont::operator=(const XMLFont& x){ + if (this==&x) return *this; + this->size = x.size; + this->line_size = x.line_size; + this->italic = x.italic; + this->bold = x.bold; + this->color = x.color; + if (this->font_name) delete this->font_name; + this->font_name = new string(*x.font_name); + if (this->font_family) delete this->font_family; + this->font_family = new string(*x.font_family); + return *this; +} + +bool XMLFont::operator==(const XMLFont &f) const { + return (fabs(this->size - f.size) < 0.1) && + (fabs(this->line_size - f.line_size) < 0.1) && + (this->italic == f.italic) && + (this->bold == f.bold) && + (this->color == f.color) && + ((*this->font_family) == (*f.font_family)); +} + +bool XMLFont::eq_upto_inline(const XMLFont &f) const { + return (fabs(this->size - f.size) < 0.1) && + (fabs(this->line_size - f.line_size) < 0.1) && + (this->color == f.color) && + ((*this->font_family) == (*f.font_family)); +} + +string XMLFont::str(Fonts::size_type id) const { + ostringstream oss; + oss << "font_family) << "\" "; + oss << "color=\"" << this->color.str() << "\" "; + oss << setiosflags(ios::fixed) << setprecision(2) + << "size=\"" << this->size << "\""; + oss << "/>"; + return oss.str(); +} + +Fonts::size_type Fonts::add_font(XMLFont *f) { + Fonts::iterator it; + size_type i; + for ( i=0, it=this->begin(); it < this->end(); it++, i++ ) { + if (**it == *f) return i; + } + this->push_back(f); + return this->size()-1; +} + +Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) { + XMLFont *f = new XMLFont(font_name, size, rgb); + return this->add_font(f); +} + +Fonts::~Fonts() { + Fonts::iterator it; + for ( it=this->begin(); it < this->end(); it++ ) delete *it; + this->resize(0); +} diff --git a/src/calibre/ebooks/pdf/fonts.h b/src/calibre/ebooks/pdf/fonts.h new file mode 100644 index 0000000000..c285b1dacc --- /dev/null +++ b/src/calibre/ebooks/pdf/fonts.h @@ -0,0 +1,105 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + + + +#ifndef CALIBRE_REFLOW_FONTS +#define CALIBRE_REFLOW_FONTS + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define DEFAULT_FONT_FAMILY "Times New Roman" + +namespace calibre_reflow { + +class XMLColor { + + private: + unsigned int r; + unsigned int g; + unsigned int b; + inline bool ok(unsigned int xcol) const { + return ( (xcol <= 255) && (xcol >= 0) ); + } + + public: + XMLColor():r(0),g(0),b(0){} + + XMLColor(GfxRGB rgb); + + XMLColor(const XMLColor& x) { + this->r=x.r; this->g=x.g; this->b=x.b; + } + + XMLColor& operator=(const XMLColor &x){ + this->r=x.r; this->g=x.g; this->b=x.b; + return *this; + } + + ~XMLColor(){} + + string str() const; + + bool operator==(const XMLColor &col) const { + return ((r==col.r)&&(g==col.g)&&(b==col.b)); + } + +}; + + +class XMLFont { + +private: + double size; + double line_size; + bool italic; + bool bold; + string *font_name; + string *font_family; + XMLColor color; + +public: + XMLFont(const char *font_family=DEFAULT_FONT_FAMILY, double size=12.0) : + size(size), line_size(-1.0), italic(false), bold(false), + font_name(new string(font_family)), font_family(new string(font_family)), + color() {} + + XMLFont(string* font_name, double size, GfxRGB rgb); + XMLFont(const XMLFont& other) : + size(other.size), line_size(other.line_size), italic(other.italic), + bold(other.bold), font_name(new string(*other.font_name)), + font_family(other.font_family), color(other.color) {} + + XMLColor get_color() { return this->color; } + string* get_font_name() { return this->font_name; } + double get_size() const { return this->size; } + double get_line_size() { return this->line_size; } + void set_line_size(double ls) { this->line_size = ls; } + bool is_italic() const { return this->italic; } + bool is_bold() const { return this->bold; } + ~XMLFont() { delete this->font_name; delete this->font_family; } + XMLFont& operator=(const XMLFont& other); + bool operator==(const XMLFont &other) const; + bool eq_upto_inline(const XMLFont &f) const; + string str(vector::size_type id) const; +}; + +class Fonts : public vector { + public: + Fonts::size_type add_font(XMLFont *f); + Fonts::size_type add_font(string* font_name, double size, GfxRGB rgb); + ~Fonts(); +}; + + +} +#endif diff --git a/src/calibre/ebooks/pdf/images.cpp b/src/calibre/ebooks/pdf/images.cpp new file mode 100644 index 0000000000..2221c06900 --- /dev/null +++ b/src/calibre/ebooks/pdf/images.cpp @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "images.h" +#include "utils.h" + +#define xoutRound(x) ( static_cast(round(x)) ) +using namespace std; +using namespace calibre_reflow; + +calibre_reflow::ImageInfo::ImageInfo(GfxState *state) { + // get image position and size + state->transform(0, 0, &xt, &yt); + state->transformDelta(1, 1, &wt, &ht); + if (wt > 0) { + x0 = xoutRound(xt); + w0 = xoutRound(wt); + } else { + x0 = xoutRound(xt + wt); + w0 = xoutRound(-wt); + } + if (ht > 0) { + y0 = xoutRound(yt); + h0 = xoutRound(ht); + } else { + y0 = xoutRound(yt + ht); + h0 = xoutRound(-ht); + } + state->transformDelta(1, 0, &xt, &yt); + rotate = fabs(xt) < fabs(yt); + if (rotate) { + w1 = h0; + h1 = w0; + x_flip = ht < 0; + y_flip = wt > 0; + } else { + w1 = w0; + h1 = h0; + x_flip = wt < 0; + y_flip = ht > 0; + } + //cout << x_flip << "|" << y_flip << endl; +} + +void XMLImages::clear() { + vector::iterator it; + for (it = this->masks.begin(); it < this->masks.end(); it++) + delete *it; + for (it = this->images.begin(); it < this->images.end(); it++) + delete *it; + this->masks.clear(); + this->images.clear(); +} + +void XMLImages::add_mask(GfxState *state, Object *ref, Stream *str, + unsigned int width, unsigned int height, bool invert, + bool interpolate, bool inline_img) { +} + +static void throw_magick_exception(MagickWand *wand) { + ExceptionType severity; + char *description = MagickGetException(wand, &severity); + ostringstream oss; + oss << description << endl; + description=(char *) MagickRelinquishMemory(description); + wand = DestroyMagickWand(wand); + MagickWandTerminus(); + throw ReflowException(oss.str().c_str()); +} + + +static void flip_image(string file_name, bool x_flip, bool y_flip) { + MagickWand *magick_wand; + MagickBooleanType status; + + MagickWandGenesis(); + magick_wand = NewMagickWand(); + status = MagickReadImage(magick_wand, file_name.c_str()); + if (status == MagickFalse) throw_magick_exception(magick_wand); + + if (y_flip) { + status = MagickFlipImage(magick_wand); + if (status == MagickFalse) throw_magick_exception(magick_wand); + } + if (x_flip) { + status = MagickFlopImage(magick_wand); + if (status == MagickFalse) throw_magick_exception(magick_wand); + } + + status = MagickWriteImage(magick_wand, NULL); + if (status == MagickFalse) throw_magick_exception(magick_wand); + + magick_wand = DestroyMagickWand(magick_wand); + MagickWandTerminus(); +} + +void XMLImages::add(GfxState *state, Object *ref, Stream *str, + unsigned int width, unsigned int height, GfxImageColorMap *colorMap, + bool interpolate, int *maskColors, bool inline_img) { + XMLImage *img = new XMLImage(state); + this->images.push_back(img); + img->width = width; img->height = height; + img->type = (str->getKind() == strDCT) ? jpeg : png; + string file_name = this->file_name(img); + + FILE *of = fopen(file_name.c_str(), "wb"); + if (!of) throw ReflowException(strerror(errno)); + + if (img->type == jpeg) { + int c; + str = ((DCTStream *)str)->getRawStream(); + str->reset(); + + // copy the stream + while ((c = str->getChar()) != EOF) fputc(c, of); + } else { //Render as PNG + Guchar *p; + GfxRGB rgb; + png_byte *row = (png_byte *) malloc(3 * width); // 3 bytes/pixel: RGB + png_bytep *row_pointer= &row; + + PNGWriter *writer = new PNGWriter(); + writer->init(of, width, height); + + // Initialize the image stream + ImageStream *imgStr = new ImageStream(str, width, + colorMap->getNumPixelComps(), colorMap->getBits()); + imgStr->reset(); + + // For each line... + for (unsigned int y = 0; y < height; y++) { + // Convert into a PNG row + p = imgStr->getLine(); + for (unsigned int x = 0; x < width; x++) { + colorMap->getRGB(p, &rgb); + // Write the RGB pixels into the row + row[3*x]= colToByte(rgb.r); + row[3*x+1]= colToByte(rgb.g); + row[3*x+2]= colToByte(rgb.b); + p += colorMap->getNumPixelComps(); + } + + writer->writeRow(row_pointer); + } + + writer->close(); + delete writer; + + free(row); + imgStr->close(); + delete imgStr; + + } + fclose(of); + img->written = true; + if (img->info.x_flip || img->info.y_flip) + flip_image(file_name, img->info.x_flip, img->info.y_flip); +} + + +string XMLImages::file_name(const XMLImage *img) const { + vector::const_iterator ir, mr; + size_t idx = 0; + bool mask = false; + + ir = find( this->images.begin(), this->images.end(), img); + if (ir == this->images.end()) { + mr = find( this->masks.begin(), this->masks.end(), img); + idx = mr - this->masks.begin(); + mask = true; + } else idx = ir - this->images.begin(); + + ostringstream oss; + oss << ((mask) ? "mask" : "image") << "-" << idx+1 << '.'; + oss << ((img->type == jpeg) ? "jpg" : "png"); + return oss.str(); +} + +vector XMLImages::str() const { + vector ans; + vector ::const_iterator it; + for (it = this->masks.begin(); it < this->masks.end(); it++) { + if ((*it)->written) + ans.push_back(new string((*it)->str(it - this->masks.begin(), true, + this->file_name(*it)))); + } + for (it = this->images.begin(); it < this->images.end(); it++) { + if ((*it)->written) + ans.push_back(new string((*it)->str(it - this->images.begin(), false, + this->file_name(*it)))); + } + return ans; +} + +string XMLImage::str(size_t num, bool mask, string file_name) const { + ostringstream oss; + oss << "width << "\" iheight=\"" << this->height << "\" " + << "rwidth=\"" << this->info.w1 << "\" rheight=\"" << this->info.h1 << "\" " + << setiosflags(ios::fixed) << setprecision(2) + << "top=\"" << this->info.y0 << "\" left=\"" << this->info.x0 << "\"/>"; + return oss.str(); + + +} +PNGWriter::~PNGWriter() +{ + /* cleanup heap allocation */ + png_destroy_write_struct(&png_ptr, &info_ptr); +} + +void PNGWriter::init(FILE *f, int width, int height) +{ + /* initialize stuff */ + png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (!png_ptr) + throw ReflowException("png_create_write_struct failed"); + + info_ptr = png_create_info_struct(png_ptr); + if (!info_ptr) + throw ReflowException("png_create_info_struct failed"); + + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("png_jmpbuf failed"); + + /* write header */ + png_init_io(png_ptr, f); + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("Error during writing header"); + + // Set up the type of PNG image and the compression level + png_set_compression_level(png_ptr, Z_BEST_COMPRESSION); + + png_byte bit_depth = 8; + png_byte color_type = PNG_COLOR_TYPE_RGB; + png_byte interlace_type = PNG_INTERLACE_NONE; + + png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, interlace_type, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT); + + png_write_info(png_ptr, info_ptr); + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("error during writing png info bytes"); + +} + +void PNGWriter::writePointers(png_bytep *rowPointers) +{ + png_write_image(png_ptr, rowPointers); + /* write bytes */ + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("Error during writing bytes"); +} + +void PNGWriter::writeRow(png_bytep *row) +{ + // Write the row to the file + png_write_rows(png_ptr, row, 1); + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("error during png row write"); +} + +void PNGWriter::close() +{ + /* end write */ + png_write_end(png_ptr, info_ptr); + if (setjmp(png_jmpbuf(png_ptr))) + throw ReflowException("Error during end of write"); +} + +void PNGWriter::write_splash_bitmap(SplashBitmap *bitmap) { + SplashColorPtr row = bitmap->getDataPtr(); + int height = bitmap->getHeight(); + int row_size = bitmap->getRowSize(); + png_bytep *row_pointers = new png_bytep[height]; + + for (int y = 0; y < height; ++y) { + row_pointers[y] = row; + row += row_size; + } + this->writePointers(row_pointers); + delete[] row_pointers; +} diff --git a/src/calibre/ebooks/pdf/images.h b/src/calibre/ebooks/pdf/images.h new file mode 100644 index 0000000000..b403abe4fb --- /dev/null +++ b/src/calibre/ebooks/pdf/images.h @@ -0,0 +1,94 @@ +#ifndef _CALIBRE_REFLOW_IMAGES +#define _CALIBRE_REFLOW_IMAGES + +#include +#include +#include +#include + +using namespace std; + +namespace calibre_reflow { + + enum ImageType { + jpeg, png + }; + + class PNGWriter + { + public: + PNGWriter() {} + ~PNGWriter(); + + void init(FILE *f, int width, int height); + + void writePointers(png_bytep *rowPointers); + void writeRow(png_bytep *row); + void write_splash_bitmap(SplashBitmap *bitmap); + void close(); + + private: + png_structp png_ptr; + png_infop info_ptr; + }; + + class ImageInfo { + public: + + ImageInfo(GfxState *state); + + private: + int x0, y0; // top left corner of image + int w0, h0, w1, h1; // size of image + double xt, yt, wt, ht; + bool rotate, x_flip, y_flip; + + friend class XMLImage; + friend class XMLImages; + + }; + + class XMLImage { + private: + double x, y; + unsigned int width, height; + ImageType type; + bool written; + ImageInfo info; + + friend class XMLImages; + + public: + XMLImage(GfxState *state) : + x(0.), y(0.), width(0), height(0), type(jpeg), written(false), info(state) + {} + + ~XMLImage() {} + + string str(size_t num, bool mask, string file_name) const; + }; + + class XMLImages { + private: + vector images; + vector masks; + + public: + + ~XMLImages() { this->clear(); } + + void add_mask(GfxState *state, Object *ref, Stream *str, + unsigned int width, unsigned int height, bool invert, + bool interpolate, bool inline_img); + + void add(GfxState *state, Object *ref, Stream *str, + unsigned int width, unsigned int height, GfxImageColorMap *colorMap, + bool interpolate, int *maskColors, bool inline_img); + + string file_name(const XMLImage *img) const; + vector str() const; + void clear(); + }; +} + +#endif diff --git a/src/calibre/ebooks/pdf/links.cpp b/src/calibre/ebooks/pdf/links.cpp new file mode 100644 index 0000000000..414ff5ce24 --- /dev/null +++ b/src/calibre/ebooks/pdf/links.cpp @@ -0,0 +1,56 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + + + +#include "links.h" +#include "utils.h" + +using namespace std; +using namespace calibre_reflow; + +XMLLink& XMLLink::operator=(const XMLLink &x) { + if (this==&x) return *this; + if (this->dest) {delete this->dest; this->dest=NULL;} + this->x_min = x.x_min; + this->y_min = x.y_min; + this->x_max = x.x_max; + this->y_max = x.y_max; + this->dest = new string(*x.dest); + return *this; +} + +bool XMLLink::in_link(double xmin,double ymin,double xmax,double ymax) const { + double y = (ymin + ymax)/2; + if (y > this->y_max) return false; + return (y > this->y_min) && (xmin < this->x_max) && (xmax > this->x_min); +} + +string XMLLink::get_link_start() { + ostringstream oss; + oss << "dest) oss << encode_for_xml(*this->dest); + oss << "\">"; + return oss.str(); +} + +XMLLinks::~XMLLinks() { + for(XMLLinks::iterator i = this->begin(); i != this->end(); i++) + delete *i; + this->clear(); +} + +bool XMLLinks::in_link(double xmin, double ymin, double xmax, + double ymax, XMLLinks::size_type &p) const { + for(XMLLinks::const_iterator i = this->begin(); i != this->end(); i++) { + if ( (*i)->in_link(xmin, ymin, xmax, ymax) ) { + p = (i - this->begin()); + return true; + } + } + return false; +} + + diff --git a/src/calibre/ebooks/pdf/links.h b/src/calibre/ebooks/pdf/links.h new file mode 100644 index 0000000000..a84693ed0e --- /dev/null +++ b/src/calibre/ebooks/pdf/links.h @@ -0,0 +1,69 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + + + +#ifndef _CALIBRE_XML_LINKS +#define _CALIBRE_XML_LINKS + +#include +#include + +using namespace std; + +namespace calibre_reflow { + +class XMLLink { + +private: + double x_min; + double y_min; + double x_max; + double y_max; + string* dest; + +public: + XMLLink() : dest(NULL) {} + XMLLink(const XMLLink& x) : + x_min(x.x_min), y_min(x.y_min), x_max(x.x_max), + y_max(x.y_max), dest(new string(*x.dest)) {} + XMLLink(double x_min, double y_min, double x_max, + double y_max, const char *dest) : + x_min((x_min < x_max) ? x_min : x_max), + y_min((y_min < y_max) ? y_min : y_max), + x_max((x_max > x_min) ? x_max : x_min), + y_max((y_max > y_min) ? y_max : y_min), + dest(new string(dest)) {} + + ~XMLLink() { delete this->dest; } + + string* get_dest() { return this->dest; } + double get_x1() const {return x_min;} + double get_x2() const {return x_max;} + double get_y1() const {return y_min;} + double get_y2() const {return y_max;} + + XMLLink& operator=(const XMLLink &x); + bool operator==(const XMLLink &x) const { + return (this->dest != NULL) && (x.dest != NULL) && + this->dest->compare(*x.dest) == 0; + } + bool in_link(double xmin, double ymin, double xmax, double ymax) const; + string get_link_start(); + +}; + +class XMLLinks : public vector { + public: + ~XMLLinks(); + + bool in_link(double xmin, double ymin, double xmax, + double ymax, XMLLinks::size_type &p) const; +}; + + +} +#endif + diff --git a/src/calibre/ebooks/pdf/main.cpp b/src/calibre/ebooks/pdf/main.cpp new file mode 100644 index 0000000000..358f344c09 --- /dev/null +++ b/src/calibre/ebooks/pdf/main.cpp @@ -0,0 +1,150 @@ +#ifndef PDF2XML +#define UNICODE +#define PY_SSIZE_T_CLEAN +#include +#endif + +#include "reflow.h" + +using namespace std; +using namespace calibre_reflow; + +#ifndef PDF2XML + +extern "C" { + + static PyObject * + pdfreflow_reflow(PyObject *self, PyObject *args) { + char *pdfdata; + Py_ssize_t size; + + if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size)) + return NULL; + + try { + Reflow reflow(pdfdata, static_cast(size)); + reflow.render(); + } catch (std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, + "Unknown exception raised while rendering PDF"); return NULL; + } + + Py_RETURN_NONE; + } + + static PyObject * + pdfreflow_get_metadata(PyObject *self, PyObject *args) { + char *pdfdata; + Py_ssize_t size; + map info; + PyObject *cover; + PyObject *ans = PyDict_New(); + + if (!ans) return PyErr_NoMemory(); + + if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &cover)) + return NULL; + + try { + Reflow reflow(pdfdata, static_cast(size)); + info = reflow.get_info(); + if (PyObject_IsTrue(cover)) { + if (!reflow.is_locked()) { + size_t size; + char *data = reflow.render_first_page(&size); + PyObject *d = PyString_FromStringAndSize(data, size); + delete[] data; + if (d == NULL) return PyErr_NoMemory(); + if (PyDict_SetItemString(ans, "cover", d) == -1) return NULL; + } else { + if (PyDict_SetItemString(ans, "cover", Py_None) == -1) return NULL; + } + } + } catch (std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, + "Unknown exception raised while getting metadata from PDF"); return NULL; + } + + for (map::const_iterator it = info.begin() ; it != info.end(); it++ ) { + PyObject *key = PyUnicode_Decode((*it).first.c_str(), (*it).first.size(), "UTF-8", "replace"); + if (!key) return NULL; + PyObject *val = PyUnicode_Decode((*it).second.c_str(), (*it).second.size(), "UTF-8", "replace"); + if (!val) return NULL; + if (PyDict_SetItem(ans, key, val) == -1) return NULL; + } + return ans; + } + + + static + PyMethodDef pdfreflow_methods[] = { + {"reflow", pdfreflow_reflow, METH_VARARGS, + "reflow(pdf_data)\n\n" + "Reflow the specified PDF." + }, + {"get_metadata", pdfreflow_get_metadata, METH_VARARGS, + "get_metadata(pdf_data, cover)\n\n" + "Get metadata and (optionally) cover from the specified PDF." + }, + + {NULL, NULL, 0, NULL} + }; + + + PyMODINIT_FUNC + initpdfreflow(void) + { + PyObject* m; + + m = Py_InitModule3("pdfreflow", pdfreflow_methods, + "Reflow a PDF file"); + + if (m == NULL) return; + + } +} + + +#else + +int main(int argc, char **argv) { + char *memblock; + ifstream::pos_type size; + + if (argc != 2) { + cerr << "Usage: " << argv[0] << " file.pdf" << endl; + return 1; + } + + ifstream file (argv[1], ios::in|ios::binary|ios::ate); + if (file.is_open()) { + size = file.tellg(); + memblock = new char[size]; + file.seekg (0, ios::beg); + file.read (memblock, size); + file.close(); + } else { + cerr << "Unable to read from: " << argv[1] << endl; + return 1; + } + + try { + Reflow reflow(memblock, size); + reflow.render(); + size_t sz = 0; + char *data = reflow.render_first_page(&sz); + ofstream file("cover.png", ios::binary); + file.write(data, sz); + file.close(); + } catch(exception &e) { + cerr << e.what() << endl; + return 1; + } + + return 0; +} +#endif diff --git a/src/calibre/ebooks/pdf/reflow.cpp b/src/calibre/ebooks/pdf/reflow.cpp new file mode 100644 index 0000000000..0181194ea2 --- /dev/null +++ b/src/calibre/ebooks/pdf/reflow.cpp @@ -0,0 +1,911 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + +#include +#include +#include +#include +#include +#include +#include +#include "reflow.h" +#include "utils.h" + +using namespace std; +using namespace calibre_reflow; + +static const size_t num_info_keys = 8; +static const char* info_keys[num_info_keys] = { + "Title", "Subject", "Keywords", "Author", "Creator", "Producer", + "CreationDate", "ModDate" +}; + + +//------------------------------------------------------------------------ +// XMLString +//------------------------------------------------------------------------ + +XMLString::XMLString(GfxState *state, GooString *s, double current_font_size, + Fonts *fonts) : + text(new vector(0)), x_right(new vector(0)), + yx_next(NULL), xy_next(NULL), fonts(fonts), font_idx(0), xml_text(NULL), + link(NULL), x_min(0), x_max(0), y_min(0), y_max(0), col(0), dir(text_dir_unknown) +{ + double x = 0, y = 0; + GfxFont *font; + + state->transform(state->getCurX(), state->getCurY(), &x, &y); + + if ((font = state->getFont())) { + double ascent = font->getAscent(); + double descent = font->getDescent(); + if( ascent > 1.05 ){ + //printf( "ascent=%.15g is too high, descent=%.15g\n", ascent, descent ); + ascent = 1.05; + } + if( descent < -0.4 ){ + //printf( "descent %.15g is too low, ascent=%.15g\n", descent, ascent ); + descent = -0.4; + } + this->y_min = y - ascent * current_font_size; + this->y_max = y - descent * current_font_size; + GfxRGB rgb; + state->getFillRGB(&rgb); + GooString *name = state->getFont()->getName(); + if (!name) + this->font_idx = this->fonts->add_font(NULL, current_font_size-1, rgb); + else + this->font_idx = this->fonts->add_font( + new string(name->getCString()), current_font_size-1, rgb); + + } else { + // this means that the PDF file draws text without a current font, + // which should never happen + this->y_min = y - 0.95 * current_font_size; + this->y_max = y + 0.35 * current_font_size; + } + if (this->y_min == this->y_max) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + this->y_min = y; + this->y_max = y + 1; + } +} + +void XMLString::add_char(GfxState *state, double x, double y, + double dx, double dy, Unicode u) { + if (dir == text_dir_unknown) { + //dir = UnicodeMap::getDirection(u); + dir = text_dir_left_right; + } + + if (this->text->capacity() == this->text->size()) { + this->text->reserve(text->size()+16); + this->x_right->reserve(x_right->size()+16); + } + this->text->push_back(u); + if (this->length() == 1) { + this->x_min = x; + } + this->x_max = x + dx; + this->x_right->push_back(x_max); + //printf("added char: %f %f xright = %f\n", x, dx, x+dx); +} + +void XMLString::end_string() +{ + if( this->dir == text_dir_right_left && this->length() > 1 ) + { + //printf("will reverse!\n"); + reverse(this->text->begin(), this->text->end()); + } +} + +static string encode_unicode_chars(const Unicode *u, size_t num) { + ostringstream oss; + UnicodeMap *uMap; + char buf[10]; + int n; + if (!(uMap = globalParams->getTextEncoding())) { + throw ReflowException("Failed to allocate unicode map."); + } + + for (size_t i = 0; i < num; i++) { + switch (u[i]) { + case '&': oss << "&"; break; + case '<': oss << "<"; break; + case '>': oss << ">"; break; + default: + { + // convert unicode to string + if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf))) > 0) { + buf[n] = 0; + oss << buf; + } + } + } + } + uMap->decRefCnt(); + return oss.str(); +} + +void XMLString::encode() { + delete this->xml_text; + this->xml_text = new string(encode_unicode_chars(&((*this->text)[0]), this->text->size())); +} + +string XMLString::str() const { + ostringstream oss; + oss << "font_idx << "\" "; + oss << setiosflags(ios::fixed) << setprecision(2) + << "top=\"" << this->y_min << "\" left=\"" << this->x_min + << "\" width=\"" << this->x_max - this->x_min << "\" " + << "height=\"" << this->y_max - this->y_min << "\">"; + oss << *this->xml_text << ""; + return oss.str(); +} + +XMLString::~XMLString() { + delete this->text; delete this->x_right; +} + + +//------------------------------------------------------------------------ +// XMLPage +//------------------------------------------------------------------------ + +XMLPage::XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts) : + current_string(NULL), num(num), output(output), current_font_size(0.0), + yx_strings(NULL), xy_strings(NULL), yx_cur1(NULL), yx_cur2(NULL), + fonts(fonts), links(new XMLLinks()) +{ + (*this->output) << setiosflags(ios::fixed) << setprecision(2) << + "\t\tnum << "\" width=\"" << + state->getPageWidth() << "\" height=\"" << state->getPageHeight() << + "\">" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); +} + +XMLPage::~XMLPage() { + (*this->output) << "\t\t" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + for (XMLString *tmp = this->yx_strings; tmp; tmp = tmp->yx_next) + delete tmp; + + delete this->links; +} + +void XMLPage::update_font(GfxState *state) { + GfxFont *font; + double *fm; + char *name; + int code; + double w; + + current_font_size = state->getTransformedFontSize(); + + if ((font = state->getFont()) && font->getType() == fontType3) { + // This is a hack which makes it possible to deal with some Type 3 + // fonts. The problem is that it's impossible to know what the + // base coordinate system used in the font is without actually + // rendering the font. This code tries to guess by looking at the + // width of the character 'm' (which breaks if the font is a + // subset that doesn't contain 'm'). + for (code = 0; code < 256; ++code) { + if ((name = ((Gfx8BitFont *)font)->getCharName(code)) && + name[0] == 'm' && name[1] == '\0') break; + + } + if (code < 256) { + w = ((Gfx8BitFont *)font)->getWidth(code); + if (w != 0) { + // 600 is a generic average 'm' width -- yes, this is a hack + current_font_size *= w / 0.6; + } + } + fm = font->getFontMatrix(); + if (fm[0] != 0) { + current_font_size *= fabs(fm[3] / fm[0]); + } + } + +} + +void XMLPage::draw_char(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) { + if ( (state->getRender() & 3) == 3) return; //Hidden text + double x1, y1, w1, h1, dx2, dy2; + int i; + state->transform(x, y, &x1, &y1); + + // check that new character is in the same direction as current string + // and is not too far away from it before adding + if (this->current_string->character_does_not_belong_to_string(state, x1)) { + this->end_string(); + this->begin_string(state, NULL); + } + state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(), + 0, &dx2, &dy2); + dx -= dx2; + dy -= dy2; + state->transformDelta(dx, dy, &w1, &h1); + if (uLen != 0) { + w1 /= uLen; + h1 /= uLen; + } + for (i = 0; i < uLen; ++i) { + this->current_string->add_char(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); + } + +} + +void XMLPage::end_string() { + XMLString *p1 = NULL, *p2 = NULL; + double h, y1, y2; + + // throw away zero-length strings -- they don't have valid xMin/xMax + // values, and they're useless anyway + if (this->current_string->length() == 0) { + delete this->current_string; + this->current_string = NULL; + return; + } + + this->current_string->end_string(); + + // insert string in y-major list + h = this->current_string->height(); + y1 = this->current_string->y_min + 0.5 * h; + y2 = this->current_string->y_min + 0.8 * h; + if (gFalse) { //rawOrder + p1 = this->yx_cur1; + p2 = NULL; + } else if ( + (!this->yx_cur1 || + (y1 >= this->yx_cur1->y_min && + (y2 >= this->yx_cur1->y_max || + this->current_string->x_max >= this->yx_cur1->x_min))) && + (!this->yx_cur2 || + (y1 < this->yx_cur2->y_min || + (y2 < this->yx_cur2->y_max && + this->current_string->x_max < this->yx_cur2->x_min))) + ) { + p1 = this->yx_cur1; + p2 = this->yx_cur2; + } else { + for (p1 = NULL, p2 = this->yx_strings; p2; p1 = p2, p2 = p2->yx_next) { + if (y1 < p2->y_min || (y2 < p2->y_max && this->current_string->x_max < p2->x_min)) + break; + } + this->yx_cur2 = p2; + } + this->yx_cur1 = this->current_string; + if (p1) + p1->yx_next = this->current_string; + else + this->yx_strings = this->current_string; + this->current_string->yx_next = p2; + this->current_string = NULL; +} + +void XMLPage::end() { + XMLLinks::size_type link_index = 0; + Fonts::size_type pos = 0; + XMLFont* h; + + for (XMLString *tmp = this->yx_strings; tmp; tmp = tmp->yx_next) { + pos = tmp->font_idx; + h = this->fonts->at(pos); + + tmp->encode(); + + if (this->links->in_link( + tmp->x_min, tmp->y_min, tmp->x_max, tmp->y_max, link_index)) { + tmp->link = links->at(link_index); + } + } + + this->coalesce(); + + for (XMLString *tmp = yx_strings; tmp; tmp=tmp->yx_next) { + if (tmp->xml_text && tmp->xml_text->size() > 0) { + (*this->output) << "\t\t\t" << tmp->str() << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + } + } +} + +static const char *strrstr( const char *s, const char *ss ) +{ + const char *p = strstr( s, ss ); + for( const char *pp = p; pp != NULL; pp = strstr( p+1, ss ) ){ + p = pp; + } + return p; +} + + +static void close_tags( string *xml_text, bool &finish_a, bool &finish_italic, bool &finish_bold ) +{ + const char *last_italic = finish_italic && ( finish_bold || finish_a ) ? strrstr( xml_text->c_str(), "" ) : NULL; + const char *last_bold = finish_bold && ( finish_italic || finish_a ) ? strrstr( xml_text->c_str(), "" ) : NULL; + const char *last_a = finish_a && ( finish_italic || finish_bold ) ? strrstr( xml_text->c_str(), " ( last_italic > last_bold ? last_italic : last_bold ) ) { + xml_text->append(""); + finish_a = false; + } + if( finish_italic && finish_bold && last_italic > last_bold ){ + xml_text->append(""); + finish_italic = false; + } + if( finish_bold ) + xml_text->append(""); + if( finish_italic ) + xml_text->append(""); + if( finish_a ) + xml_text->append(""); +} + +void XMLPage::coalesce() { + XMLString *str1, *str2, *str3; + XMLFont *hfont1, *hfont2; + double space, hor_space, vert_space, vert_overlap, size, x_limit; + bool add_space, found; + int n, i; + double cur_x, cur_y; + + str1 = this->yx_strings; + + if( !str1 ) return; + + //----- discard duplicated text (fake boldface, drop shadows) + + while (str1) + { + size = str1->y_max - str1->y_min; + x_limit = str1->x_min + size * 0.2; + found = false; + for (str2 = str1, str3 = str1->yx_next; + str3 && str3->x_min < x_limit; + str2 = str3, str3 = str2->yx_next) + { + if (str3->length() == str1->length() && + !memcmp(str3->text, str1->text, str1->length() * sizeof(Unicode)) && + fabs(str3->y_min - str1->y_min) < size * 0.2 && + fabs(str3->y_max - str1->y_max) < size * 0.2 && + fabs(str3->x_max - str1->x_max) < size * 0.2) + { + found = true; + //printf("found duplicate!\n"); + break; + } + } + if (found) + { + str2->xy_next = str3->xy_next; + str2->yx_next = str3->yx_next; + delete str3; + } + else + { + str1 = str1->yx_next; + } + } + + str1 = yx_strings; + + hfont1 = this->fonts->at(str1->font_idx); + if( hfont1->is_bold() ) + str1->xml_text->insert(0, ""); + if( hfont1->is_italic() ) + str1->xml_text->insert(0, ""); + if (str1->get_link()) + str1->xml_text->insert(0, str1->get_link()->get_link_start()); + cur_x = str1->x_min; cur_y = str1->y_min; + + while (str1 && (str2 = str1->yx_next)) { + hfont2 = this->fonts->at(str2->font_idx); + space = str1->y_max - str1->y_min; + hor_space = str2->x_min - str1->x_max; + vert_space = str2->y_min - str1->y_max; + + vert_overlap = 0; + if (str2->y_min >= str1->y_min && str2->y_min <= str1->y_max) + { + vert_overlap = str1->y_max - str2->y_min; + } else if (str2->y_max >= str1->y_min && str2->y_max <= str1->y_max) + { + vert_overlap = str2->y_max - str1->y_min; + } + if ( + ( + ( + (str2->y_min < str1->y_max) + && + (hor_space > -0.5 * space && hor_space < space) + ) + ) && + (hfont1->eq_upto_inline(*hfont2)) && + str1->dir == str2->dir // text direction the same + ) + { + n = str1->length() + str2->length(); + if ((add_space = hor_space > 0.1 * space)) { + ++n; + } + + str1->text->reserve((n + 15) & ~15); + str1->x_right->reserve((n + 15) & ~15); + if (add_space) { + str1->text->push_back(0x20); + str1->xml_text->push_back(' '); + str1->x_right->push_back(str2->x_min); + } + + for (i = 0; i < str2->length(); i++) { + str1->text->push_back(str2->text->at(i)); + str1->x_right->push_back(str2->x_right->at(i)); + } + + /* fix , if str1 and str2 differ and handle switch of links */ + XMLLink *hlink1 = str1->get_link(); + XMLLink *hlink2 = str2->get_link(); + bool switch_links = !hlink1 || !hlink2 || !((*hlink1) == (*hlink2)); + bool finish_a = switch_links && hlink1 != NULL; + bool finish_italic = hfont1->is_italic() && ( !hfont2->is_italic() || finish_a ); + bool finish_bold = hfont1->is_bold() && + ( !hfont2->is_bold() || finish_a || finish_italic ); + close_tags( str1->xml_text, finish_a, finish_italic, finish_bold ); + if( switch_links && hlink2 != NULL ) { + string ls = hlink2->get_link_start(); + str1->xml_text->append(ls); + } + if( ( !hfont1->is_italic() || finish_italic ) && hfont2->is_italic() ) + str1->xml_text->append(""); + if( ( !hfont1->is_bold() || finish_bold ) && hfont2->is_bold() ) + str1->xml_text->append(""); + + + str1->xml_text->append(*str2->xml_text); + // str1 now contains href for link of str2 (if it is defined) + str1->link = str2->link; + hfont1 = hfont2; + if (str2->x_max > str1->x_max) { + str1->x_max = str2->x_max; + } + if (str2->y_max > str1->y_max) { + str1->y_max = str2->y_max; + } + str1->yx_next = str2->yx_next; + delete str2; + } else { // keep strings separate + bool finish_a = str1->get_link() != NULL; + bool finish_bold = hfont1->is_bold(); + bool finish_italic = hfont1->is_italic(); + close_tags( str1->xml_text, finish_a, finish_italic, finish_bold ); + + str1->x_min = cur_x; str1->y_min = cur_y; + str1 = str2; + cur_x = str1->x_min; cur_y = str1->y_min; + hfont1 = hfont2; + if ( hfont1->is_bold() ) + str1->xml_text->insert(0, ""); + if( hfont1->is_italic() ) + str1->xml_text->insert(0, ""); + if( str1->get_link() != NULL ) { + str1->xml_text->insert(0, str1->get_link()->get_link_start()); + } + } + } + str1->x_min = cur_x; str1->y_min = cur_y; + + bool finish_bold = hfont1->is_bold(); + bool finish_italic = hfont1->is_italic(); + bool finish_a = str1->get_link() != NULL; + close_tags( str1->xml_text, finish_a, finish_italic, finish_bold ); + +} + + +//------------------------------------------------------------------------ +// XMLOutputDev +//------------------------------------------------------------------------ + +XMLOutputDev::XMLOutputDev(PDFDoc *doc) : + current_page(NULL), output(new ofstream("index.xml", ios::trunc)), + fonts(new Fonts()), catalog(NULL), images(new XMLImages()), doc(doc) +{ + if (!(*this->output)) { + throw ReflowException(strerror(errno)); + } + (*this->output) << "" << endl; + (*this->output) << "\t" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); +} + +XMLOutputDev::~XMLOutputDev() { + (*this->output) << "\t" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + (*this->output) << "\t" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + for (Fonts::const_iterator it = this->fonts->begin(); it < this->fonts->end(); it++) { + (*this->output) << "\t\t" << (*it)->str(it - this->fonts->begin()) << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + } + (*this->output) << "\t" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + (*this->output) << "" << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + this->output->close(); + delete this->output; + delete this->fonts; + delete this->images; +} + +static string get_link_dest(LinkAction *link, PDFDoc *doc) { + unsigned int page = 1; + ostringstream oss; + + switch(link->getKind()) + { + case actionGoTo: + { + LinkGoTo *ha = (LinkGoTo *)link; + LinkDest *dest = NULL; + if (ha->getDest() != NULL) + dest = ha->getDest()->copy(); + else if (ha->getNamedDest() != NULL) { + dest = doc->findDest(ha->getNamedDest()); + } + + if (dest) { + if (dest->isPageRef()) { + Ref pageref = dest->getPageRef(); + page = doc->findPage(pageref.num, pageref.gen); + } + else { + page = dest->getPageNum(); + } + + oss << "#" << page + << setiosflags(ios::fixed) << setprecision(2) + << ":l=" << dest->getLeft() + << "t=" << dest->getTop(); + //<< "r=" << dest->getRight() + //<< "b=" << dest->getBottom(); + delete dest; + } + break; + } + + case actionGoToR: + { + LinkGoToR *ha = (LinkGoToR *) link; + LinkDest *dest = NULL; + bool has_file = false; + if (ha->getFileName()) { + oss << ha->getFileName()->getCString(); + has_file = true; + } + if (ha->getDest() != NULL) dest=ha->getDest()->copy(); + + if (dest && has_file) { + if (!(dest->isPageRef())) page = dest->getPageNum(); + delete dest; + oss << '#' << page; + } + break; + } + case actionURI: + { + LinkURI *ha=(LinkURI *) link; + oss << ha->getURI()->getCString(); + break; + } + case actionLaunch: + { + LinkLaunch *ha = (LinkLaunch *) link; + oss << ha->getFileName()->getCString(); + break; + } + case actionNamed: break; + case actionMovie: break; + case actionRendition: break; + case actionSound: break; + case actionJavaScript: break; + case actionUnknown: break; + } + return oss.str(); +} + +void XMLOutputDev::process_link(Link* link){ + double _x1, _y1, _x2, _y2; + int x1, y1, x2, y2; + + link->getRect(&_x1, &_y1, &_x2, &_y2); + cvtUserToDev(_x1, _y1, &x1, &y1); + + cvtUserToDev(_x2, _y2, &x2, &y2); + + LinkAction *a = link->getAction(); + if (!a) return; + string dest = get_link_dest(a, this->doc); + if (dest.length() > 0) { + XMLLink *t = new XMLLink((double)x1, (double)y2, (double)x2, (double)y1, + dest.c_str()); + this->current_page->add_link(t); + } +} + + +void XMLOutputDev::endPage() { + Links *slinks = catalog->getPage(current_page->number())->getLinks(catalog); + for (int i = 0; i < slinks->getNumLinks(); i++) + { + this->process_link(slinks->getLink(i)); + } + delete slinks; + + this->current_page->end(); + vector images = this->images->str(); + for (vector::iterator it = images.begin(); it < images.end(); it++) { + (*this->output) << "\t\t\t" << *(*it) << endl; + if (!(*this->output)) throw ReflowException(strerror(errno)); + delete *it; + } + this->images->clear(); + delete this->current_page; + this->current_page = NULL; +} + + +void XMLOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str, + int width, int height, GBool invert, + GBool interpolate, GBool inlineImg) { + OutputDev::drawImageMask(state, ref, str, width, height, + invert, interpolate, inlineImg); + //this->images->add_mask(); + cerr << "mask requested" << endl; +} + +void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, + int width, int height, GfxImageColorMap *colorMap, + GBool interpolate, int *maskColors, GBool inlineImg) { + this->images->add(state, ref, str, + static_cast(width), static_cast(height), + colorMap, interpolate, maskColors, inlineImg); +} + +Reflow::Reflow(char *pdfdata, size_t sz) : + pdfdata(pdfdata), current_font_size(-1), doc(NULL) +{ + Object obj; + obj.initNull(); + if (globalParams == NULL) { + globalParams = new GlobalParams(); + if (!globalParams) + throw ReflowException("Failed to allocate Globalparams"); + } + MemStream *str = new MemStream(pdfdata, 0, sz, &obj); + this->doc = new PDFDoc(str, NULL, NULL); + + if (!this->doc->isOk()) { + ostringstream stm; + stm << "Failed to open PDF file"; + stm << " with error code: " << doc->getErrorCode(); + delete this->doc; + this->doc = NULL; + throw ReflowException(stm.str().c_str()); + } + +} + +void +Reflow::render() { + if (this->doc->isEncrypted()) { + throw ReflowException("Document is encrypted."); + } + + if (!this->doc->okToCopy()) + cout << "Warning, this document has the copy protection flag set, ignoring." << endl; + + char encoding[10] = "UTF-8"; + globalParams->setTextEncoding(encoding); + + int first_page = 1; + int last_page = doc->getNumPages(); + + XMLOutputDev *xml_out = new XMLOutputDev(this->doc); + doc->displayPages(xml_out, first_page, last_page, + 96, //hDPI + 96, //vDPI + 0, //rotate + true, //UseMediaBox + true, //Crop + false //Printing + ); + + this->dump_outline(); + + delete xml_out; +} + +void Reflow::dump_outline() { + Outline *outline = this->doc->getOutline(); + if (!outline) return; + GooList *items = outline->getItems(); + if ( !items || items->getLength() < 1 ) + return; + + ostringstream *output = new ostringstream(); + (*output) << "" << endl; + this->outline_level(output, items); + (*output) << "" << endl; + ofstream of("outline.xml", ios::trunc); + of << output->str(); + if (!of) throw ReflowException("Error writing outline file"); + of.close(); + delete output; +} + +static inline void outline_tabs(ostringstream *o, int level) { + for (int i = 0; i < level; i++) + (*o) << "\t"; +} + +void Reflow::outline_level(ostringstream *oss, GooList *items, int level) +{ + int num_of_items = items->getLength(); + if (num_of_items > 0) { + outline_tabs(oss, level); + (*oss) << "" << endl; + + for (int i = 0; i < num_of_items; i++) { + OutlineItem* item = (OutlineItem *)items->get(i); + Unicode *u = item->getTitle(); + string title = encode_unicode_chars(u, item->getTitleLength()); + if (title.size() < 1) continue; + outline_tabs(oss, level+1); + (*oss) << "isOpen()?"yes":"no") << "\""; + LinkAction *a = item->getAction(); + if (a != NULL) + (*oss) << " dest=\"" << get_link_dest(a, this->doc) << "\""; + (*oss) << ">" << title << "" << endl; + item->open(); + GooList *children = item->getKids(); + if (children) + outline_level(oss, children, level+1); + } + } +} + +Reflow::~Reflow() { + delete this->doc; +} + +map Reflow::get_info() { + Object info; + map ans; + string val; + char encoding[10] = "UTF-8"; + globalParams->setTextEncoding(encoding); + + this->doc->getDocInfo(&info); + if (info.isDict()) { + for(size_t i = 0; i < num_info_keys; i++) { + val = this->decode_info_string(info.getDict(), info_keys[i]); + if (val.size() > 0) { + ans[string(info_keys[i])] = string(val); + } + } + } + return ans; +} + +string Reflow::decode_info_string(Dict *info, const char *key) const { + Object obj; + GooString *s1; + bool is_unicode; + Unicode u; + char buf[8]; + int i, n; + ostringstream oss; + char *tmp = new char[strlen(key)+1]; + strcpy(tmp, key); + UnicodeMap *umap; + if (!(umap = globalParams->getTextEncoding())) { + throw ReflowException("Failed to allocate unicode map."); + } + + + if (info->lookup(tmp, &obj)->isString()) { + s1 = obj.getString(); + if ((s1->getChar(0) & 0xff) == 0xfe && + (s1->getChar(1) & 0xff) == 0xff) { + is_unicode = true; + i = 2; + } else { + is_unicode = false; + i = 0; + } + while (i < obj.getString()->getLength()) { + if (is_unicode) { + u = ((s1->getChar(i) & 0xff) << 8) | + (s1->getChar(i+1) & 0xff); + i += 2; + } else { + u = pdfDocEncoding[s1->getChar(i) & 0xff]; + ++i; + } + n = umap->mapUnicode(u, buf, sizeof(buf)); + buf[n] = 0; + oss << buf; + } + } + obj.free(); + delete[] tmp; + return oss.str(); +} + +char* Reflow::render_first_page(size_t *data_size, + bool use_crop_box, double x_res, + double y_res) { + if (this->is_locked()) throw ReflowException("Document is locked."); + char encoding[10] = "UTF-8"; + char yes[10] = "yes"; + globalParams->setTextEncoding(encoding); + globalParams->setEnableFreeType(yes); + globalParams->setAntialias(yes); + globalParams->setVectorAntialias(yes); + + SplashColor paper_color; + paper_color[0] = 255; + paper_color[1] = 255; + paper_color[2] = 255; + SplashOutputDev *out = new SplashOutputDev(splashModeRGB8, 4, false, paper_color); + if (!out) { + throw ReflowException("Failed to allocate SplashOutputDev"); + } + out->startDoc(doc->getXRef()); + + double pg_w, pg_h; + int pg = 1; + + if (use_crop_box) { + pg_w = this->doc->getPageCropWidth(pg); + pg_h = this->doc->getPageCropHeight(pg); + } else { + pg_w = this->doc->getPageMediaWidth(pg); + pg_h = this->doc->getPageMediaHeight(pg); + } + + pg_w *= x_res/72.; + pg_h *= x_res/72.; + + int x=0, y=0; + this->doc->displayPageSlice(out, pg, x_res, y_res, 0, + !use_crop_box, false, false, x, y, pg_w, pg_h); + + FILE * f = tmpfile(); + if (!f) throw ReflowException(strerror(errno)); + SplashBitmap *bmp = out->getBitmap(); + PNGWriter *writer = new PNGWriter(); + writer->init(f, bmp->getWidth(), bmp->getHeight()); + writer->write_splash_bitmap(bmp); + writer->close(); + delete writer; + + + long size = ftell(f); + rewind(f); + char *buffer = new char[size]; + *data_size = fread(buffer, 1, size, f); + if (*data_size != (size_t)size) { + throw ReflowException("I/O error reading from tmpfile"); + } + return buffer; +} diff --git a/src/calibre/ebooks/pdf/reflow.h b/src/calibre/ebooks/pdf/reflow.h new file mode 100644 index 0000000000..2a672c6661 --- /dev/null +++ b/src/calibre/ebooks/pdf/reflow.h @@ -0,0 +1,238 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + * Based on pdftohtml from the poppler project. + */ + +#ifndef CALIBRE_REFLOW +#define CALIBRE_REFLOW +#define UNICODE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fonts.h" +#include "links.h" +#include "images.h" + +using namespace std; + +namespace calibre_reflow { + + +enum UnicodeTextDirection { + text_dir_unknown, + text_dir_left_right, + text_dir_right_left, + text_dir_top_bottom +}; + +class Reflow { + + private: + char *pdfdata; + double current_font_size; + PDFDoc *doc; + + string decode_info_string(Dict *info, const char *key) const; + void outline_level(ostringstream *oss, GooList *items, + int level=1); + + public: + Reflow (char *xpdfdata, size_t sz); + ~Reflow(); + + /* Convert the PDF to XML. All files are output to the current directory */ + void render(); + + /* Get the PDF Info Dictionary */ + map get_info(); + + /* True if the PDF is encrypted */ + bool is_locked() const { return !this->doc || this->doc->isEncrypted(); } + + /* Return the first page of the PDF, rendered as a PNG image */ + char* render_first_page(size_t *data_size, + bool use_crop_box=true, double x_res=150.0, + double y_res = 150.0); + + /* Dump the PDF outline as the file outline.xml in the current directory */ + void dump_outline(); +}; + +class XMLString { + private: + vector *text; // the text + vector *x_right; // right-hand x coord of each char + XMLString *yx_next; // next string in y-major order + XMLString *xy_next; // next string in x-major order + Fonts *fonts; + Fonts::size_type font_idx; + string *xml_text; + XMLLink *link; + + double x_min, x_max; // bounding box x coordinates + double y_min, y_max; // bounding box y coordinates + int col; // starting column + UnicodeTextDirection dir; // direction (left to right/right to left) + + friend class XMLPage; + + public: + XMLString(GfxState *state, GooString *s, double current_font_size, Fonts *fonts); + ~XMLString(); + + bool character_does_not_belong_to_string(GfxState *state, double x1) { + return this->length() > 0 && + fabs(x1 - x_right->at(this->length()-1)) > 0.1 * (y_max - y_min); + } + + void add_char(GfxState *state, double x, double y, + double dx, double dy, Unicode u); + + void end_string(); + inline int length() const { return this->text->size(); } + inline double height() const { return y_max - y_min; } + void encode(); + XMLLink* get_link() { return this->link; } + string str() const; +}; + +class XMLPage { + private: + XMLString *current_string; + unsigned int num; + ofstream *output; + double current_font_size; + XMLString *yx_strings; // strings in y-major order + XMLString *xy_strings; // strings in x-major order + XMLString *yx_cur1, *yx_cur2; // cursors for yxStrings list + Fonts *fonts; + XMLLinks *links; + void coalesce(); + + public: + XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts); + ~XMLPage(); + + void update_font(GfxState *state); + + void begin_string(GfxState *state, GooString *s) { + this->current_string = new XMLString(state, s, + this->current_font_size, this->fonts); + } + + void draw_char(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen); + + void end_string(); + + void end(); + + void add_link(XMLLink *t) { this->links->push_back(t); } + + unsigned int number() const { return this->num; } +}; + +class XMLOutputDev : public OutputDev { + public: + XMLOutputDev(PDFDoc *doc); + virtual ~XMLOutputDev(); + //---- get info about output device + + // Does this device use upside-down coordinates? + // (Upside-down means (0,0) is the top left corner of the page.) + virtual GBool upsideDown() { return gTrue; } + + // Does this device use drawChar() or drawString()? + virtual GBool useDrawChar() { return gTrue; } + + // Does this device use beginType3Char/endType3Char? Otherwise, + // text in Type 3 fonts will be drawn with drawChar/drawString. + virtual GBool interpretType3Chars() { return gFalse; } + + // Does this device need non-text content? + virtual GBool needNonText() { return gTrue; } + + //----- initialization and control + + virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI, + int rotate, GBool useMediaBox, GBool crop, + int sliceX, int sliceY, int sliceW, int sliceH, + GBool printing, Catalog * catalogA, + GBool (* abortCheckCbk)(void *data) = NULL, + void * abortCheckCbkData = NULL) + { + this->catalog = catalogA; + return gTrue; + } + + + // Start a page. + virtual void startPage(int page_num, GfxState *state) { + this->current_page = new XMLPage(page_num, state, this->output, this->fonts); + } + + + // End a page. + virtual void endPage(); + + //----- update text state + virtual void updateFont(GfxState *state) {current_page->update_font(state);} + + //----- text drawing + virtual void beginString(GfxState *state, GooString *s) { + this->current_page->begin_string(state, s); + } + virtual void endString(GfxState *state) { + this->current_page->end_string(); + } + virtual void drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) { + this->current_page->draw_char(state, x, y, dx, dy, originX, + originY, code, nBytes, u, uLen); + } + + virtual void drawImageMask(GfxState *state, Object *ref, + Stream *str, + int width, int height, GBool invert, + GBool interpolate, GBool inlineImg); + virtual void drawImage(GfxState *state, Object *ref, Stream *str, + int width, int height, GfxImageColorMap *colorMap, + GBool interpolate, int *maskColors, GBool inlineImg); + + //new feature + virtual int DevType() {return 1234;} + + private: + XMLPage *current_page; + ofstream *output; // xml file + Fonts *fonts; + Catalog *catalog; + XMLImages *images; + PDFDoc *doc; + + void process_link(Link* link); +}; +} +#endif diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py new file mode 100644 index 0000000000..1acc700a37 --- /dev/null +++ b/src/calibre/ebooks/pdf/reflow.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import sys, os +from copy import deepcopy + +from lxml import etree + +class Font(object): + + def __init__(self, spec): + self.id = spec.get('id') + self.size = float(spec.get('size')) + self.color = spec.get('color') + self.family = spec.get('family') + +class Text(object): + + A = etree.XPath('descendant::a[@href]') + + def __init__(self, text, font_map, classes, opts, log): + self.opts, self.log = opts, log + self.font_map = font_map + self.top, self.left, self.width, self.height = map(float, map(text.get, + ('top', 'left', 'width', 'height'))) + self.font = self.font_map[text.get('font')] + self.font_size = self.font.size + self.color = self.font.color + self.font_family = self.font.family + + for a in self.A(text): + href = a.get('href') + if href.startswith('index.'): + href = href.split('#')[-1] + a.set('href', '#page'+href) + + self.text = etree.Element('span') + css = {'font_size':'%.1fpt'%self.font_size, 'color': self.color} + if css not in classes: + classes.append(css) + idx = classes.index(css) + self.text.set('class', 't%d'%idx) + if text.text: + self.text.text = text.text + for x in text: + self.text.append(deepcopy(x)) + #print etree.tostring(self.text, encoding='utf-8', with_tail=False) + +class Page(object): + + def __init__(self, page, font_map, classes, opts, log): + self.opts, self.log = opts, log + self.font_map = font_map + self.number = int(page.get('number')) + self.top, self.left, self.width, self.height = map(float, map(page.get, + ('top', 'left', 'width', 'height'))) + self.id = 'page%d'%self.number + + self.texts = [] + + for text in page.xpath('descendant::text'): + self.texts.append(Text(text, self.font_map, classes, self.opts, self.log)) + + +class PDFDocument(object): + + def __init__(self, xml, opts, log): + self.opts, self.log = opts, log + parser = etree.XMLParser(recover=True) + self.root = etree.fromstring(xml, parser=parser) + + self.fonts = [] + self.font_map = {} + + for spec in self.root.xpath('//fontspec'): + self.fonts.append(Font(spec)) + self.font_map[self.fonts[-1].id] = self.fonts[-1] + + self.pages = [] + self.page_map = {} + + self.classes = [] + + for page in self.root.xpath('//page'): + page = Page(page, self.font_map, self.classes, opts, log) + self.page_map[page.id] = page + self.pages.append(page) + + + + +def run(opts, pathtopdf, log): + from calibre.constants import plugins + pdfreflow, err = plugins['pdfreflow'] + if pdfreflow is None: + raise RuntimeError('Failed to load PDF Reflow plugin: '+err) + data = open(pathtopdf, 'rb').read() + pdfreflow.reflow(data) + index = os.path.join(os.getcwdu(), 'index.xml') + xml = open(index, 'rb').read() + #pdfdoc = PDFDocument(xml, opts, log) + +def option_parser(): + from optparse import OptionParser + p = OptionParser() + p.add_option('-v', '--verbose', action='count', default=0) + return p + +def main(args=sys.argv): + p = option_parser() + opts, args = p.parse_args(args) + from calibre.utils.logging import default_log + + if len(args) < 2: + p.print_help() + default_log('No input PDF file specified', file=sys.stderr) + return 1 + + + run(opts, args[1], default_log) + + return 0 diff --git a/src/calibre/ebooks/pdf/utils.h b/src/calibre/ebooks/pdf/utils.h new file mode 100644 index 0000000000..ae342d09da --- /dev/null +++ b/src/calibre/ebooks/pdf/utils.h @@ -0,0 +1,48 @@ +/** + * Copyright 2009 Kovid Goyal + * License: GNU GPL v3 + */ + + +#ifndef _CALIBRE_REFLOW_UTILS +#define _CALIBRE_REFLOW_UTILS + +#include +#include + +using namespace std; + +namespace calibre_reflow { + + class ReflowException : public exception { + const char *msg; + public: + ReflowException(const char *m) : msg(m) {} + virtual const char* what() const throw() { return msg; } + }; + +inline string encode_for_xml(const string &sSrc ) +{ + ostringstream sRet; + + for( string::const_iterator iter = sSrc.begin(); iter!=sSrc.end(); iter++ ) + { + unsigned char c = (unsigned char)*iter; + + switch( c ) + { + case '&': sRet << "&"; break; + case '<': sRet << "<"; break; + case '>': sRet << ">"; break; + case '"': sRet << """; break; + + default: sRet << c; + } + } + + return sRet.str(); +} + + +} +#endif From 4be28fb1fa978bbfdaa68b03b7e752ddb4911eba Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 09:13:15 -0600 Subject: [PATCH 3/8] New recipe for Smashing Magazine by Darko Miletic and improved recipe for Die Ziet --- resources/images/news/smashing.png | Bin 0 -> 843 bytes src/calibre/web/feeds/recipes/__init__.py | 2 +- .../web/feeds/recipes/recipe_smashing.py | 51 +++++++++++++++++ .../web/feeds/recipes/recipe_zeitde.py | 52 +++++++++++++++--- 4 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 resources/images/news/smashing.png create mode 100644 src/calibre/web/feeds/recipes/recipe_smashing.py diff --git a/resources/images/news/smashing.png b/resources/images/news/smashing.png new file mode 100644 index 0000000000000000000000000000000000000000..05d029512fe7d16bc7f06b1eec4b2f31c241e6e8 GIT binary patch literal 843 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87??^tT^vI!PM1zT=$%?Ba@>CZ{k`S)KHIv_ zPwZ`L5}dqZVsFrygo6zxE@xa$XLL-OnGkhNDeLLF*wtnS&m6iM@V{ZgYzOupC8adA zCJ$ffvvIFJzpVa#@BZHBVMUC*1)!VAeoE5kYG z$vMV0{ePQYz-h;Svb?y-|Jbker~kIEIL&#kKk-n9(#eG1Y#n-gojzPKc(+{sWZhjh z=|rh~8~saXChnb4P&4V2_4~l8mqsm$|HFUY;_Cf=-_`6Ir`?-Z^S=L7c^c7MuVFFws@+D7xw zYGTDw`G!}umfiCFciQ=z{SMdccV&j|B7$)|%cl*V%*Duc9lRqlxI9Z}7yyT&+*z*-J^A~>mDZ9(=Z%@W2 zSD&rISDdFM?%us}PpxqEy8R2L+h2S5KKruLZ{EPrz|fDLIzMl9Ejj&NCH9xwr~Bdm zXUo3-bFuBpdY_Z_8d?7FX0E3|iCMM8HKHUXu_Vn+Rhw8?85=+}e2C|>0cv3IboFyt=akR{0664s AumAu6 literal 0 HcmV?d00001 diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index ee5829948a..926580bba1 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -57,7 +57,7 @@ recipe_modules = ['recipe_' + r for r in ( 'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti', 'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga', 'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem', - 'the_new_republic', 'philly', 'salon', 'tweakers', + 'the_new_republic', 'philly', 'salon', 'tweakers', 'smashing', )] diff --git a/src/calibre/web/feeds/recipes/recipe_smashing.py b/src/calibre/web/feeds/recipes/recipe_smashing.py new file mode 100644 index 0000000000..cc4edd2c77 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_smashing.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Darko Miletic ' +''' +www.smashingmagazine.com +''' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag + +class SmashingMagazine(BasicNewsRecipe): + title = 'Smashing Magazine' + __author__ = 'Darko Miletic' + description = 'We smash you with the information that will make your life easier, really' + oldest_article = 20 + language = 'en' + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + publisher = 'Smashing Magazine' + category = 'news, web, IT, css, javascript, html' + encoding = 'utf-8' + + conversion_options = { + 'comments' : description + ,'tags' : category + ,'publisher' : publisher + } + + keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})] + remove_tags_after = dict(name='ul',attrs={'class':'social'}) + remove_tags = [ + dict(name=['link','object']) + ,dict(name='h1',attrs={'class':'logo'}) + ,dict(name='div',attrs={'id':'booklogosec'}) + ,dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}) + ] + + feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')] + + def preprocess_html(self, soup): + for iter in soup.findAll('div',attrs={'class':'leftframe'}): + it = iter.find('h1') + if it == None: + iter.extract() + for item in soup.findAll('img'): + oldParent = item.parent + if oldParent.name == 'a': + oldParent.name = 'div' + return soup diff --git a/src/calibre/web/feeds/recipes/recipe_zeitde.py b/src/calibre/web/feeds/recipes/recipe_zeitde.py index 1c00b74146..648e3f9148 100644 --- a/src/calibre/web/feeds/recipes/recipe_zeitde.py +++ b/src/calibre/web/feeds/recipes/recipe_zeitde.py @@ -13,18 +13,17 @@ class ZeitDe(BasicNewsRecipe): title = 'Die Zeit Nachrichten' description = 'Die Zeit - Online Nachrichten' language = 'de' + lang = 'de_DE' - __author__ = 'Kovid Goyal and Martin Pitt' + __author__ = 'Martin Pitt and Suajta Raman' use_embedded_content = False - timefmt = ' [%d %b %Y]' max_articles_per_feed = 40 + remove_empty_feeds = True no_stylesheets = True - encoding = 'utf8' + encoding = 'utf-8' - remove_tags = [{'class': 'adwrap'}] - keep_only_tags = [{'name': 'div', 'class': 'content'}] - feeds = [ ('Kurznachrichten', 'http://newsfeed.zeit.de/index'), + feeds = [ ('Politik', 'http://newsfeed.zeit.de/politik/index'), ('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'), ('Meinung', 'http://newsfeed.zeit.de/meinung/index'), @@ -33,6 +32,43 @@ class ZeitDe(BasicNewsRecipe): ('Wissen', 'http://newsfeed.zeit.de/wissen/index'), ] - def print_version(self,url): - return url.replace('http://www.zeit.de/', 'http://mobil.zeit.de/') + extra_css = ''' + .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} + .excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:large;} + .title{font-family:Arial,Helvetica,sans-serif;font-size:large} + .caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} + .copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} + .article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} + .headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small} + ''' + filter_regexps = [r'ad.de.doubleclick.net/'] + keep_only_tags = [ + dict(name='div', attrs={'class':["article"]}) , + ] + remove_tags = [ + dict(name='link'), dict(name='iframe'),dict(name='style'), + dict(name='div', attrs={'class':["pagination block","pagenav","inline link"] }), + dict(name='div', attrs={'id':["place_5","place_4"]}) + ] + + def get_article_url(self, article): + + url = article.get('guid', None) + + if 'video' in url or 'quiz' in url : + + url = None + + return url + + def preprocess_html(self, soup): + soup.html['xml:lang'] = self.lang + soup.html['lang'] = self.lang + mtag = '' + soup.head.insert(0,mtag) + + return soup + + #def print_version(self,url): + # return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '') From c24f507cc233db31ac1ff523283ad789a269b110 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 11:27:53 -0600 Subject: [PATCH 4/8] IGN:Restore PoDoFo based set pdf metadata functionality --- src/calibre/ebooks/metadata/pdf.py | 88 ++---------------------------- src/calibre/ebooks/pdf/main.cpp | 48 ++++++++++++++++ src/calibre/ebooks/pdf/reflow.cpp | 65 +++++++++++++++++++++- src/calibre/ebooks/pdf/reflow.h | 3 + 4 files changed, 120 insertions(+), 84 deletions(-) diff --git a/src/calibre/ebooks/metadata/pdf.py b/src/calibre/ebooks/metadata/pdf.py index e11197e4fe..147e3d2504 100644 --- a/src/calibre/ebooks/metadata/pdf.py +++ b/src/calibre/ebooks/metadata/pdf.py @@ -5,8 +5,9 @@ __copyright__ = '2008, Kovid Goyal ' from functools import partial -from calibre import plugins, prints -from calibre.ebooks.metadata import MetaInformation, string_to_authors#, authors_to_string +from calibre import prints +from calibre.constants import plugins +from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string pdfreflow, pdfreflow_error = plugins['pdfreflow'] @@ -44,64 +45,13 @@ def get_metadata(stream, cover=True): return mi - - get_quick_metadata = partial(get_metadata, cover=False) -''' -import sys, os, cStringIO +import cStringIO from threading import Thread -from calibre import StreamReadWrapper -from calibre.ptempfile import TemporaryDirectory -try: - from calibre.utils.PythonMagickWand import \ - NewMagickWand, MagickReadImage, MagickSetImageFormat, \ - MagickWriteImage, ImageMagick - _imagemagick_loaded = True -except: - _imagemagick_loaded = False -from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string from calibre.utils.pdftk import set_metadata as pdftk_set_metadata -from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ - set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick -from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable - -def get_quick_metadata(stream): - try: - return get_metadata_poppler(stream, False) - except NotAvailable: - pass - - return get_metadata_pypdf(stream) - raw = stream.read() - mi = get_metadata_quick(raw) - if mi.title == '_': - mi.title = getattr(stream, 'name', _('Unknown')) - mi.title = mi.title.rpartition('.')[0] - return mi - - -def get_metadata(stream, extract_cover=True): - try: - return get_metadata_poppler(stream, extract_cover) - except NotAvailable: - pass - try: - with TemporaryDirectory('_pdfmeta') as tdir: - cpath = os.path.join(tdir, 'cover.pdf') - if not extract_cover: - cpath = None - mi = podofo_get_metadata(stream, cpath=cpath) - if mi.cover is not None: - cdata = get_cover(mi.cover) - mi.cover = None - if cdata is not None: - mi.cover_data = ('jpg', cdata) - except Unavailable: - mi = get_metadata_pypdf(stream) - return mi - +from calibre.utils.podofo import set_metadata as podofo_set_metadata, Unavailable def set_metadata(stream, mi): stream.seek(0) @@ -116,25 +66,6 @@ def set_metadata(stream, mi): set_metadata_pypdf(stream, mi) -def get_metadata_pypdf(stream): - """ Return metadata as a L{MetaInfo} object """ - from pyPdf import PdfFileReader - mi = MetaInformation(_('Unknown'), [_('Unknown')]) - try: - with StreamReadWrapper(stream) as stream: - info = PdfFileReader(stream).getDocumentInfo() - if info.title: - mi.title = info.title - if info.author: - mi.author = info.author - mi.authors = string_to_authors(info.author) - if info.subject: - mi.category = info.subject - except Exception, err: - msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err)) - print >>sys.stderr, msg.encode('utf8') - return mi - class MetadataWriter(Thread): def __init__(self, out_pdf, buf): @@ -178,13 +109,4 @@ def set_metadata_pypdf(stream, mi): stream.write(out_str.read()) stream.seek(0) -def get_cover(cover_path): - with ImageMagick(): - wand = NewMagickWand() - MagickReadImage(wand, cover_path) - MagickSetImageFormat(wand, 'JPEG') - MagickWriteImage(wand, '%s.jpg' % cover_path) - return open('%s.jpg' % cover_path, 'rb').read() -''' - diff --git a/src/calibre/ebooks/pdf/main.cpp b/src/calibre/ebooks/pdf/main.cpp index 358f344c09..96bb5ed853 100644 --- a/src/calibre/ebooks/pdf/main.cpp +++ b/src/calibre/ebooks/pdf/main.cpp @@ -79,6 +79,50 @@ extern "C" { return ans; } + static PyObject * + pdfreflow_set_metadata(PyObject *self, PyObject *args) { + char *pdfdata; + Py_ssize_t size; + PyObject *info; + + if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info)) + return NULL; + + if (!PyDict_Check(info)) { + PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary."); + return NULL; + } + + char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords"; + char *keys[3] = { Title, Author, Keywords }; + map pinfo; + PyObject *val = NULL, *utf8 = NULL; + + for (int i = 0; i < 3; i++) { + val = PyDict_GetItemString(info, keys[i]); + if (!val || !PyUnicode_Check(val)) continue; + utf8 = PyUnicode_AsUTF8String(val); + if (!utf8) continue; + pinfo[keys[i]] = PyString_AS_STRING(utf8); + } + + PyObject *ans = NULL; + try { + Reflow reflow(pdfdata, static_cast(size)); + if (reflow.is_locked()) { + PyErr_SetString(PyExc_ValueError, "Setting metadata not possible in encrypeted PDFs"); + return NULL; + } + string result = reflow.set_info(pinfo); + ans = PyString_FromStringAndSize(result.c_str(), result.size()); + } catch (std::exception &e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, + "Unknown exception raised while getting metadata from PDF"); return NULL; + } + return ans; + } static PyMethodDef pdfreflow_methods[] = { @@ -90,6 +134,10 @@ extern "C" { "get_metadata(pdf_data, cover)\n\n" "Get metadata and (optionally) cover from the specified PDF." }, + {"set_metadata", pdfreflow_set_metadata, METH_VARARGS, + "get_metadata(info_dict)\n\n" + "Set metadata in the specified PDF. Currently broken." + }, {NULL, NULL, 0, NULL} }; diff --git a/src/calibre/ebooks/pdf/reflow.cpp b/src/calibre/ebooks/pdf/reflow.cpp index 0181194ea2..a494887bca 100644 --- a/src/calibre/ebooks/pdf/reflow.cpp +++ b/src/calibre/ebooks/pdf/reflow.cpp @@ -680,6 +680,16 @@ void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, colorMap, interpolate, maskColors, inlineImg); } +static char stream_pdf[15] = "stream.pdf"; + +class MemInStream : public MemStream { + public: + MemInStream(char *buf, size_t st, size_t sz, Object *obj) : + MemStream(buf, st, sz, obj) {} + ~MemInStream() {} + GooString *getFileName() { return new GooString(stream_pdf); } +}; + Reflow::Reflow(char *pdfdata, size_t sz) : pdfdata(pdfdata), current_font_size(-1), doc(NULL) { @@ -690,7 +700,7 @@ Reflow::Reflow(char *pdfdata, size_t sz) : if (!globalParams) throw ReflowException("Failed to allocate Globalparams"); } - MemStream *str = new MemStream(pdfdata, 0, sz, &obj); + MemInStream *str = new MemInStream(pdfdata, 0, sz, &obj); this->doc = new PDFDoc(str, NULL, NULL); if (!this->doc->isOk()) { @@ -909,3 +919,56 @@ char* Reflow::render_first_page(size_t *data_size, } return buffer; } + +class MemOutStream : public OutStream { + private: + ostringstream out; + + public: + MemOutStream() :OutStream() {} + ~MemOutStream() {} + void close() {} + int getPos() { return out.tellp(); } + void put(char c) { out.put(c); } + void printf (const char *format, ...) { + vector buf; + size_t written = strlen(format)*5; + va_list ap; + do { + buf.reserve(written + 20); + va_start(ap, format); + written = vsnprintf(&buf[0], buf.capacity(), format, ap); + va_end(ap); + } while (written >= buf.capacity()); + out.write(&buf[0], written); + } +}; + +string Reflow::set_info(map sinfo) { + XRef *xref = this->doc->getXRef(); + if (!xref) throw ReflowException("No XRef table"); + Object *trailer_dict = xref->getTrailerDict(); + if (!trailer_dict || !trailer_dict->isDict()) throw ReflowException("No trailer dictionary"); + Object tmp; + char INFO[5] = "Info"; + Object *info = trailer_dict->dictLookup(INFO, &tmp); + if (!info) { + info = new Object(); + info->initDict(xref); + } + if (!info->isDict()) throw ReflowException("Invalid info object"); + + for (map::iterator it = sinfo.begin(); it != sinfo.end(); it++) { + Object *tmp = new Object(); + tmp->initString(new GooString((*it).second)); + info->dictSet((*it).first, tmp); + } + + trailer_dict->dictSet(INFO, info); + char out[20] = "/t/out.pdf"; + this->doc->saveAs(new GooString(out), writeForceRewrite); + string ans; + return ans; +} + + diff --git a/src/calibre/ebooks/pdf/reflow.h b/src/calibre/ebooks/pdf/reflow.h index 2a672c6661..cf17cd15ae 100644 --- a/src/calibre/ebooks/pdf/reflow.h +++ b/src/calibre/ebooks/pdf/reflow.h @@ -74,6 +74,9 @@ class Reflow { /* Dump the PDF outline as the file outline.xml in the current directory */ void dump_outline(); + + /* Set the info dictionary. Currently broken. */ + string set_info(map info); }; class XMLString { From af79cbe591297ef730a586b485c4ebac89e88d96 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 12:29:43 -0600 Subject: [PATCH 5/8] New recipe for The Toronto Star by Darko Miletic --- resources/images/news/thestar.png | Bin 0 -> 629 bytes src/calibre/web/feeds/recipes/__init__.py | 1 + .../web/feeds/recipes/recipe_thestar.py | 47 ++++++++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 resources/images/news/thestar.png create mode 100644 src/calibre/web/feeds/recipes/recipe_thestar.py diff --git a/resources/images/news/thestar.png b/resources/images/news/thestar.png new file mode 100644 index 0000000000000000000000000000000000000000..94974bca7a3906d95555e6b7bb1bb95bc2989581 GIT binary patch literal 629 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87#Qbyx;TbdoIX0)*W*^A#DV(tv)nj&pC?`s z_+rFmCNr^EFgvzvKw;a z?r|eYd)m2Q`>jou%&>BApZECl$Lf6#SGm^Mp7PB9VLW&HYeUH|6^ElUu4uIgJXIEL zt=r9dwsfx_OS0z8xBU;wZ(In?{NS^;IgEFe(S$#my}WE(a-UCbDe+%tWS8*x!E^n# zW!LB zqTvwiGxOA+3Fi{#T=$9}`R=mD1n)9Z1a_l}1eRT4I6&CS~VBe^obd@Ti(fEoE~|aMLf-5<0j?U zg% Date: Tue, 22 Sep 2009 12:31:43 -0600 Subject: [PATCH 6/8] TXT Output: Fix line endings setting not being respected on windows --- src/calibre/ebooks/txt/newlines.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/newlines.py b/src/calibre/ebooks/txt/newlines.py index ae766a216f..d7e97654b4 100644 --- a/src/calibre/ebooks/txt/newlines.py +++ b/src/calibre/ebooks/txt/newlines.py @@ -19,7 +19,11 @@ class TxtNewlines(object): self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) def specified_newlines(newline, text): + # Convert all newlines to \n + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + if newline == '\n': return text - return text.replace(os.linesep, newline) + return text.replace('\n', newline) From 74db760282280e0d3023cc690890b77a96737c95 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 13:12:02 -0600 Subject: [PATCH 7/8] Fix #3452 (epubcheck1.0.4 errors - attribute file-as not allowed at this point) --- src/calibre/ebooks/metadata/opf2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index d0056b2deb..e4c6d19321 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -666,7 +666,7 @@ class OPF(object): for key in matches[0].attrib: if key.endswith('file-as'): matches[0].attrib.pop(key) - matches[0].set('file-as', unicode(val)) + matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], unicode(val)) return property(fget=fget, fset=fset) From be0a623e6589b04362e6cdd449a279bf7fc4110d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Sep 2009 13:47:14 -0600 Subject: [PATCH 8/8] Fix #3555 (UnicodeDecodeError: ... unexpected code byte in Russian fb2) --- src/calibre/ebooks/fb2/input.py | 2 +- src/calibre/ebooks/metadata/meta.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/fb2/input.py b/src/calibre/ebooks/fb2/input.py index efe2606141..12fa1da8e9 100644 --- a/src/calibre/ebooks/fb2/input.py +++ b/src/calibre/ebooks/fb2/input.py @@ -45,7 +45,7 @@ class FB2Input(InputFormatPlugin): log.debug('Parsing XML...') parser = etree.XMLParser(recover=True, no_network=True) - doc = etree.parse(stream, parser) + doc = etree.fromstring(stream.read()) self.extract_embedded_content(doc) log.debug('Converting XML to HTML...') ss = open(P('templates/fb2.xsl'), 'rb').read() diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py index 28dcbf5ae0..2fb70d71b8 100644 --- a/src/calibre/ebooks/metadata/meta.py +++ b/src/calibre/ebooks/metadata/meta.py @@ -130,7 +130,7 @@ def metadata_from_filename(name, pat=None): au = match.group('author') aus = string_to_authors(au) mi.authors = aus - except IndexError: + except (IndexError, ValueError): pass try: mi.series = match.group('series')