PDF metadata: Do not crash when reading malformed PDF files

This commit is contained in:
Kovid Goyal 2012-05-31 00:45:20 +05:30
parent d805a93282
commit e2148e812b
19 changed files with 86 additions and 2552 deletions

View File

@ -1,6 +1,9 @@
" Project wide builtins " Project wide builtins
let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext" let $PYFLAKES_BUILTINS = "_,dynamic_property,__,P,I,lopen,icu_lower,icu_upper,icu_title,ngettext"
" Include directories for C modules
let g:syntastic_c_include_dirs = [ '/usr/include/podofo']
fun! CalibreLog() fun! CalibreLog()
" Setup buffers to edit the calibre changelog and version info prior to " Setup buffers to edit the calibre changelog and version info prior to
" making a release. " making a release.

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, socket, struct, subprocess, glob import os, socket, struct, subprocess
from distutils.spawn import find_executable from distutils.spawn import find_executable
from PyQt4 import pyqtconfig from PyQt4 import pyqtconfig
@ -84,7 +84,6 @@ ft_lib_dirs = []
ft_libs = [] ft_libs = []
jpg_libs = [] jpg_libs = []
jpg_lib_dirs = [] jpg_lib_dirs = []
poppler_objs = []
fc_inc = '/usr/include/fontconfig' fc_inc = '/usr/include/fontconfig'
fc_lib = '/usr/lib' fc_lib = '/usr/lib'
podofo_inc = '/usr/include/podofo' podofo_inc = '/usr/include/podofo'
@ -114,12 +113,7 @@ if iswindows:
jpg_libs = ['jpeg'] jpg_libs = ['jpeg']
ft_lib_dirs = [sw_lib_dir] ft_lib_dirs = [sw_lib_dir]
ft_libs = ['freetype'] ft_libs = ['freetype']
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'%s\poppler;%s'%(sw_inc_dir, sw_inc_dir))
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR', sw_lib_dir)
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.7.6')] magick_inc_dirs = [os.path.join(prefix, 'build', 'ImageMagick-6.7.6')]
magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')] magick_lib_dirs = [os.path.join(magick_inc_dirs[0], 'VisualMagick', 'lib')]
magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_'] magick_libs = ['CORE_RL_wand_', 'CORE_RL_magick_']
@ -128,13 +122,6 @@ if iswindows:
elif isosx: elif isosx:
fc_inc = '/sw/include/fontconfig' fc_inc = '/sw/include/fontconfig'
fc_lib = '/sw/lib' fc_lib = '/sw/lib'
poppler = glob.glob('/sw/build/poppler-*')[-1]
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'{0}/poppler:{0}'.format(poppler))
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/sw/lib')
poppler_libs = ['poppler']
popplerqt4_lib_dirs = poppler_lib_dirs
podofo_inc = '/sw/podofo' podofo_inc = '/sw/podofo'
podofo_lib = '/sw/lib' podofo_lib = '/sw/lib'
magick_inc_dirs = consolidate('MAGICK_INC', magick_inc_dirs = consolidate('MAGICK_INC',
@ -147,22 +134,15 @@ elif isosx:
png_libs = ['png12'] png_libs = ['png12']
else: else:
# Include directories # Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR', png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include') '/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick') magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
# Library directories # Library directories
poppler_lib_dirs = popplerqt4_lib_dirs = pkgconfig_lib_dirs('poppler', 'POPPLER_LIB_DIR',
'/usr/lib')
png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib') png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib')
magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib') magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib')
# Libraries # Libraries
poppler_libs = pkgconfig_libs('poppler', '', '')
if not poppler_libs:
poppler_libs = ['poppler']
magick_libs = pkgconfig_libs('MagickWand', '', '') magick_libs = pkgconfig_libs('MagickWand', '', '')
if not magick_libs: if not magick_libs:
magick_libs = ['MagickWand', 'MagickCore'] magick_libs = ['MagickWand', 'MagickCore']
@ -176,26 +156,6 @@ fc_error = None if os.path.exists(os.path.join(fc_inc, 'fontconfig.h')) else \
'Try setting the FC_INC_DIR and FC_LIB_DIR environment ' 'Try setting the FC_INC_DIR and FC_LIB_DIR environment '
'variables.') 'variables.')
poppler_error = None
poppler_cflags = ['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
if not poppler_inc_dirs or not os.path.exists(
os.path.join(poppler_inc_dirs[0], 'OutputDev.h')):
poppler_error = \
('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and',
' POPPLER_LIB_DIR environment variables. calibre requires '
' the poppler XPDF headers. If your distro does not '
' include them you will have to re-compile poppler '
' by hand with --enable-xpdf-headers')
else:
lh = os.path.join(poppler_inc_dirs[0], 'Link.h')
if 'class AnnotLink' not in open(lh, 'rb').read():
poppler_cflags.append('-DPOPPLER_OLD_LINK_TYPE')
ph = os.path.join(poppler_inc_dirs[0], 'Page.h')
if 'getLinks(Catalog' in open(ph, 'rb').read():
poppler_cflags.append('-DPOPPLER_PRE_20')
magick_error = None magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0], if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
'wand')): 'wand')):

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
__all__ = [ __all__ = [
'pot', 'translations', 'get_translations', 'iso639', 'pot', 'translations', 'get_translations', 'iso639',
'build', 'build_pdf2xml', 'server', 'build', 'server',
'gui', 'gui',
'develop', 'install', 'develop', 'install',
'kakasi', 'coffee', 'resources', 'kakasi', 'coffee', 'resources',
@ -31,9 +31,8 @@ translations = Translations()
get_translations = GetTranslations() get_translations = GetTranslations()
iso639 = ISO639() iso639 = ISO639()
from setup.extensions import Build, BuildPDF2XML from setup.extensions import Build
build = Build() build = Build()
build_pdf2xml = BuildPDF2XML()
from setup.server import Server from setup.server import Server
server = Server() server = Server()

View File

@ -12,14 +12,11 @@ from distutils import sysconfig
from PyQt4.pyqtconfig import QtGuiModuleMakefile from PyQt4.pyqtconfig import QtGuiModuleMakefile
from setup import Command, islinux, isbsd, isosx, SRC, iswindows from setup import Command, islinux, isbsd, isosx, SRC, iswindows
from setup.build_environment import (fc_inc, fc_lib, chmlib_inc_dirs, from setup.build_environment import (fc_inc, fc_lib, chmlib_inc_dirs, fc_error,
fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, podofo_inc, podofo_lib, podofo_error, pyqt, OSX_SDK, NMAKE, QMAKE,
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, msvc, MT, win_inc, win_lib, win_ddk, magick_inc_dirs, magick_lib_dirs,
QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, win_ddk, magick_libs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, icu_lib_dirs)
magick_error, magick_libs, ft_lib_dirs, ft_libs, jpg_libs,
jpg_lib_dirs, chmlib_lib_dirs, sqlite_inc_dirs, icu_inc_dirs,
icu_lib_dirs, poppler_cflags)
MT MT
isunix = islinux or isosx or isbsd isunix = islinux or isosx or isbsd
@ -51,7 +48,6 @@ class Extension(object):
reflow_sources = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.cpp')) reflow_sources = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.cpp'))
reflow_headers = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.h')) reflow_headers = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.h'))
reflow_error = poppler_error if poppler_error else magick_error
pdfreflow_libs = [] pdfreflow_libs = []
if iswindows: if iswindows:
@ -107,16 +103,6 @@ extensions = [
inc_dirs=magick_inc_dirs inc_dirs=magick_inc_dirs
), ),
Extension('pdfreflow',
reflow_sources,
headers=reflow_headers,
libraries=poppler_libs+magick_libs+png_libs+ft_libs+jpg_libs+pdfreflow_libs,
lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs+ft_lib_dirs+jpg_lib_dirs,
inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs,
error=reflow_error,
cflags=poppler_cflags
),
Extension('lzx', Extension('lzx',
['calibre/utils/lzx/lzxmodule.c', ['calibre/utils/lzx/lzxmodule.c',
'calibre/utils/lzx/compressor.c', 'calibre/utils/lzx/compressor.c',
@ -445,48 +431,5 @@ class Build(Command):
shutil.rmtree(build_dir) shutil.rmtree(build_dir)
class BuildPDF2XML(Command):

    # Setup command that compiles the PDF reflow sources (everything except
    # python.cpp) with -DPDF2XML and links them into a stand-alone binary.
    description = 'Build command line pdf2xml utility'

    def run(self, opts):
        '''Compile out-of-date reflow objects and relink the pdf2xml binary.

        Objects are placed in build/objects/pdf2xml next to the source tree;
        the binary goes to ~/bin/pdf2xml (or a fixed cygwin path on windows).
        '''
        if iswindows:
            dest = r'C:\cygwin\home\kovid\sw\bin\pdf2xml.exe'
        else:
            dest = os.path.expanduser('~/bin/pdf2xml')

        odest = self.j(self.d(self.SRC), 'build', 'objects', 'pdf2xml')
        if not os.path.exists(odest):
            os.makedirs(odest)

        obj_suffix = '.obj' if iswindows else '.o'
        objects = []
        for src in reflow_sources:
            # python.cpp holds the Python bindings, not needed for the CLI tool
            if src.endswith('python.cpp'):
                continue
            obj = self.j(odest, self.b(src+obj_suffix))
            if self.newer(obj, [src]+reflow_headers):
                if iswindows:
                    compile_cmd = [cxx, '/c', '/MD', '/W3', '/EHsc', '/Zi', '/DPDF2XML']
                    compile_cmd += ['-I'+x for x in poppler_inc_dirs+magick_inc_dirs]
                    compile_cmd += ['/Fo'+obj, src]
                else:
                    compile_cmd = [cxx, '-pthread', '-pedantic', '-ggdb', '-c', '-Wall', '-I/usr/include/poppler',
                        '-I/usr/include/ImageMagick',
                        '-DPDF2XML', '-o', obj, src]
                self.info(*compile_cmd)
                self.check_call(compile_cmd)
            # Every object takes part in the link, rebuilt or not
            objects.append(obj)

        if self.newer(dest, objects):
            if iswindows:
                link_cmd = [msvc.linker] + '/INCREMENTAL:NO /DEBUG /NODEFAULTLIB:libcmt.lib'.split()
                link_cmd += ['/LIBPATH:'+x for x in magick_lib_dirs+poppler_lib_dirs]
                link_cmd += [x+'.lib' for x in
                        png_libs+magick_libs+poppler_libs+ft_libs+jpg_libs+pdfreflow_libs]
                link_cmd += ['/OUT:'+dest] + objects
            else:
                link_cmd = ['g++', '-ggdb', '-o', dest]+objects+['-lpoppler', '-lMagickWand',
                        '-lpng', '-lpthread']
            self.info(*link_cmd)
            self.check_call(link_cmd)

        self.info('Binary installed as', dest)

View File

@ -22,6 +22,8 @@ QTDLLS = ('QtCore', 'QtGui', 'QtNetwork', 'QtSvg', 'QtXml', 'QtWebKit',
MAGICK_PREFIX = '/usr' MAGICK_PREFIX = '/usr'
binary_includes = [ binary_includes = [
'/usr/bin/pdftohtml', '/usr/bin/pdftohtml',
'/usr/bin/pdfinfo',
'/usr/bin/pdftoppm',
'/usr/lib/libwmflite-0.2.so.7', '/usr/lib/libwmflite-0.2.so.7',
'/usr/lib/liblcms.so.1', '/usr/lib/liblcms.so.1',
'/usr/lib/liblzma.so.0', '/usr/lib/liblzma.so.0',

View File

@ -387,7 +387,8 @@ class Py2App(object):
info('\nAdding poppler') info('\nAdding poppler')
for x in ('libpoppler.25.dylib',): for x in ('libpoppler.25.dylib',):
self.install_dylib(os.path.join(SW, 'lib', x)) self.install_dylib(os.path.join(SW, 'lib', x))
self.install_dylib(os.path.join(SW, 'bin', 'pdftohtml'), False) for x in ('pdftohtml', 'pdftoppm', 'pdfinfo'):
self.install_dylib(os.path.join(SW, 'bin', x), False)
@flush @flush
def add_libjpeg(self): def add_libjpeg(self):

View File

@ -260,7 +260,8 @@ class Win32Freeze(Command, WixMixIn):
print '\tAdding misc binary deps' print '\tAdding misc binary deps'
bindir = os.path.join(SW, 'bin') bindir = os.path.join(SW, 'bin')
shutil.copy2(os.path.join(bindir, 'pdftohtml.exe'), self.base) for x in ('pdftohtml', 'pdfinfo', 'pdftoppm'):
shutil.copy2(os.path.join(bindir, x+'.exe'), self.base)
for pat in ('*.dll',): for pat in ('*.dll',):
for f in glob.glob(os.path.join(bindir, pat)): for f in glob.glob(os.path.join(bindir, pat)):
ok = True ok = True

View File

@ -293,9 +293,7 @@ In Cmake: disable GTK, Qt, OPenjpeg, cpp, lcms, gtk_tests, qt_tests. Enable qt4,
NOTE: poppler must be built as a static library, unless you build the qt4 bindings NOTE: poppler must be built as a static library, unless you build the qt4 bindings
Now do the same for the pdftohtml project cp build/utils/Release/*.exe ../../bin/
cp poppler/*.h ~/sw/include/poppler && cp goo/*.h ~/sw/include/poppler/goo && cp splash/*.h ~/sw/include/poppler/splash && cp build/Release/poppler.lib ../../lib/ && cp build/utils/Release/pdftohtml.exe ../../bin/
podofo podofo

View File

@ -82,7 +82,6 @@ class Plugins(collections.Mapping):
'podofo', 'podofo',
'cPalmdoc', 'cPalmdoc',
'fontconfig', 'fontconfig',
'pdfreflow',
'progress_indicator', 'progress_indicator',
'chmlib', 'chmlib',
'chm_extra', 'chm_extra',

View File

@ -4,25 +4,80 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files''' '''Read meta information from PDF files'''
#import re #import re
import os, subprocess, shutil
from functools import partial from functools import partial
from calibre import prints from calibre import prints
from calibre.constants import plugins from calibre.constants import iswindows
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import MetaInformation, string_to_authors from calibre.ebooks.metadata import MetaInformation, string_to_authors
from calibre.utils.ipc.simple_worker import fork_job
pdfreflow, pdfreflow_error = plugins['pdfreflow']
#_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)') #_isbn_pat = re.compile(r'ISBN[: ]*([-0-9Xx]+)')
def read_info(outputdir, get_cover):
    ''' Return the info dict of the PDF file named src.pdf inside outputdir
    and, when get_cover is true, render its first page to a cover JPEG there.

    The cwd is changed to outputdir, so this function is not thread safe; run
    it via fork_job. Working relative to the cwd avoids passing unicode paths
    on the command line (there is no safe way to do that), and it means that
    if poppler crashes the only stale file handles are for src.pdf, not for
    the original file.

    Returns None if pdfinfo fails or its output is not valid UTF-8.'''
    from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
    os.chdir(outputdir)

    # pdfinfo and pdftoppm are expected alongside the pdftohtml binary
    bindir = os.path.dirname(PDFTOHTML)
    exe = '.exe' if iswindows else ''
    pdfinfo = os.path.join(bindir, 'pdfinfo') + exe
    pdftoppm = os.path.join(bindir, 'pdftoppm') + exe

    try:
        output = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', 'src.pdf'])
    except subprocess.CalledProcessError as e:
        prints('pdfinfo errored out with return code: %d'%e.returncode)
        return None
    try:
        output = output.decode('utf-8')
    except UnicodeDecodeError:
        prints('pdfinfo returned no UTF-8 data')
        return None

    ans = {}
    for line in output.splitlines():
        # Lines look like "Field: value"; split on the first colon only
        field, sep, val = line.partition(u':')
        if not sep:
            continue
        val = val.strip()
        if field and val:
            ans[field] = val

    if get_cover:
        # A failed cover render is reported but does not discard the metadata
        try:
            subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
                'src.pdf', 'cover'])
        except subprocess.CalledProcessError as e:
            prints('pdftoppm errored out with return code: %d'%e.returncode)

    return ans
def get_metadata(stream, cover=True): def get_metadata(stream, cover=True):
if pdfreflow is None: with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
raise RuntimeError(pdfreflow_error) stream.seek(0)
stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
raw = stream.read() shutil.copyfileobj(stream, f)
#isbn = _isbn_pat.search(raw) res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath,
#if isbn is not None: bool(cover)))
# isbn = isbn.group(1).replace('-', '').replace(' ', '') info = res['result']
info = pdfreflow.get_metadata(raw, cover) with open(res['stdout_stderr'], 'rb') as f:
raw = f.read().strip()
if raw:
prints(raw)
if not info:
raise ValueError('Could not read info dict from PDF')
covpath = os.path.join(pdfpath, 'cover.jpg')
cdata = None
if cover and os.path.exists(covpath):
with open(covpath, 'rb') as f:
cdata = f.read()
title = info.get('Title', None) title = info.get('Title', None)
au = info.get('Author', None) au = info.get('Author', None)
if au is None: if au is None:
@ -46,12 +101,8 @@ def get_metadata(stream, cover=True):
if subject: if subject:
mi.tags.insert(0, subject) mi.tags.insert(0, subject)
if cover and 'cover' in info: if cdata:
data = info['cover'] mi.cover_data = ('jpeg', cdata)
if data is None:
prints(title, 'has no pages, cover extraction impossible.')
else:
mi.cover_data = ('png', data)
return mi return mi

View File

@ -1,151 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#include "fonts.h"
#include "utils.h"
using namespace calibre_reflow;
using namespace std;
// Build a colour from a GfxRGB: every channel is rescaled as
// value / 65535 * 255 and truncated. If any resulting channel fails ok(),
// the whole colour falls back to black.
XMLColor::XMLColor(GfxRGB rgb) {
    const int red   = static_cast<int>(rgb.r/65535.0*255.0);
    const int green = static_cast<int>(rgb.g/65535.0*255.0);
    const int blue  = static_cast<int>(rgb.b/65535.0*255.0);
    this->r = red;
    this->g = green;
    this->b = blue;

    const bool valid = this->ok(this->r) && this->ok(this->b) && this->ok(this->g);
    if (!valid) {
        this->r = 0;
        this->g = 0;
        this->b = 0;
    }
}
// Format the colour as "rgb(R,G,B)" using decimal channel values.
string XMLColor::str() const {
    ostringstream out;
    out << "rgb(" << this->r << ",";
    out << this->g << ",";
    out << this->b << ")";
    return out.str();
}
// NULL-terminated list of style suffixes that family_name() strips from a
// font name. family_name() stops at the first entry that matches, so the
// longer "-bold..." variants are listed before plain "-bold".
static const char *FONT_MODS[7] = {
"-bolditalic", "-boldoblique", "-bold", "-italic", "-oblique", "-roman",
NULL
};
#ifdef _WIN32
#define ap_toupper(c) (toupper(((unsigned char)(c))))
// Case-insensitive substring search, compiled only for _WIN32 builds.
// Returns a pointer to the first occurrence of n inside h, or 0 when either
// argument is NULL or empty or when n does not occur in h.
static inline
const char *strcasestr(const char *h, const char *n )
{
    if (!h || !*h || !n || !*n) return 0;

    const char *start = h;      // where the current candidate match begins
    const char *hay = h;        // cursor inside the haystack
    const char *needle = n;     // cursor inside the needle

    while (*hay && *needle) {
        if (ap_toupper(*hay) == ap_toupper(*needle)) {
            ++hay;
            ++needle;
        } else {
            // Mismatch: restart the comparison one character further on
            ++start;
            hay = start;
            needle = n;
        }
    }
    // The needle was fully consumed only if a complete match was found
    return *needle ? 0 : start;
}
#endif
// Return a newly allocated copy of *font_name with the first matching
// FONT_MODS suffix (compared case-insensitively) removed. Returns NULL when
// font_name is NULL. The caller owns the returned string.
static string* family_name(const string *font_name) {
    if (font_name == NULL) return NULL;

    string *family = new string(*font_name);
    for (const char * const *mod = FONT_MODS; *mod != NULL; ++mod) {
        const char *hit = strcasestr(family->c_str(), *mod);
        if (hit == NULL) continue;
        const size_t offset = hit - family->c_str();
        family->replace(offset, strlen(*mod), "");
        break;  // only one modifier is ever stripped
    }
    return family;
}
// Construct a font from a (possibly NULL) font name, a size and a colour.
// The object takes ownership of font_name. A NULL name is replaced by
// DEFAULT_FONT_FAMILY. The stored size is size-1. Bold and italic are
// inferred from the name containing "bold", "italic" or "oblique"
// (case-insensitive).
XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) :
    size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name),
    font_family(NULL), color(rgb) {
    if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY);
    this->font_family = family_name(this->font_name);

    // Inspect the member, not the constructor parameter of the same name:
    // the parameter may be NULL even after the default has been applied.
    const char *name = this->font_name->c_str();
    if (strcasestr(name, "bold")) this->bold = true;
    if (strcasestr(name, "italic") ||
        strcasestr(name, "oblique")) this->italic = true;
}
// Copy assignment: copies the scalar attributes and replaces both owned
// strings with fresh copies of x's strings. Self-assignment is a no-op.
XMLFont& XMLFont::operator=(const XMLFont& x){
    if (this != &x) {
        this->size = x.size;
        this->line_size = x.line_size;
        this->italic = x.italic;
        this->bold = x.bold;
        this->color = x.color;

        // delete on a NULL pointer is a no-op, so no explicit check is needed
        delete this->font_name;
        this->font_name = new string(*x.font_name);
        delete this->font_family;
        this->font_family = new string(*x.font_family);
    }
    return *this;
}
// Two fonts are equal when size and line size agree to within 0.1 and the
// italic flag, bold flag, colour and font family all match. The font name
// itself is not compared.
bool XMLFont::operator==(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (this->italic != f.italic) return false;
    if (this->bold != f.bold) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Like operator== but ignoring the bold and italic flags: size and line
// size must agree to within 0.1, and colour and font family must match.
bool XMLFont::eq_upto_inline(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Serialise the font as a self-closing <font .../> element carrying the
// given id, the XML-encoded family, the colour and the size printed with
// two fixed decimals.
string XMLFont::str(Fonts::size_type id) const {
    ostringstream xml;
    xml << "<font id=\"" << id << "\" ";
    xml << "family=\"" << encode_for_xml(*this->font_family) << "\" ";
    xml << "color=\"" << this->color.str() << "\" ";
    xml << setiosflags(ios::fixed) << setprecision(2);
    xml << "size=\"" << this->size << "\"";
    xml << "/>";
    return xml.str();
}
// Return the index of the first stored font equal to *f. If there is none,
// append f and return its new index.
// NOTE(review): when an equal font already exists, f is neither stored nor
// freed here - confirm whether callers are expected to release it.
Fonts::size_type Fonts::add_font(XMLFont *f) {
    const size_type count = this->size();
    for (size_type idx = 0; idx < count; ++idx) {
        if (*(*this)[idx] == *f) return idx;
    }
    this->push_back(f);
    return count;
}
// Create an XMLFont from the given name, size and colour and register it,
// returning the index of the matching (possibly pre-existing) entry.
// A NULL name is replaced by a new "Unknown" string.
Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) {
    // The name string must not be deleted here: it is handed to the XMLFont
    string *name = (font_name != NULL) ? font_name : new string("Unknown");
    // NOTE(review): if an equal font is already registered, the XMLFont
    // created here appears to be leaked - confirm intended ownership.
    return this->add_font(new XMLFont(name, size, rgb));
}
// Delete every XMLFont held by the container, then empty it.
Fonts::~Fonts() {
    for (size_type idx = 0; idx < this->size(); ++idx) {
        delete (*this)[idx];
    }
    this->clear();
}

View File

@ -1,102 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#pragma once
#include <vector>
#include <sstream>
#include <iomanip>
#include <ctype.h>
#include <math.h>
#include <GfxState.h>
using namespace std;
#define DEFAULT_FONT_FAMILY "Times New Roman"
namespace calibre_reflow {
// An RGB colour with integer channels, printable as "rgb(R,G,B)".
class XMLColor {
private:
    unsigned int r;
    unsigned int g;
    unsigned int b;

    // A channel is valid when it is at most 255; the value is unsigned, so
    // a lower-bound check would always be true.
    inline bool ok(unsigned int xcol) const {
        return xcol <= 255;
    }

public:
    // Black
    XMLColor() : r(0), g(0), b(0) {}
    XMLColor(GfxRGB rgb);
    XMLColor(const XMLColor& x) : r(x.r), g(x.g), b(x.b) {}
    XMLColor& operator=(const XMLColor &x) {
        this->r = x.r;
        this->g = x.g;
        this->b = x.b;
        return *this;
    }
    ~XMLColor() {}

    string str() const;

    // Channel-wise equality
    bool operator==(const XMLColor &col) const {
        return (r == col.r) && (g == col.g) && (b == col.b);
    }
};
// Description of a font: size, line size, bold/italic flags, colour, and
// the font name and family. The object owns font_name and font_family and
// deletes both in its destructor.
class XMLFont {
private:
    double size;
    double line_size;
    bool italic;
    bool bold;
    string *font_name;      // owned
    string *font_family;    // owned
    XMLColor color;

public:
    // Default font: name and family are separate copies of font_family.
    XMLFont(const char *font_family=DEFAULT_FONT_FAMILY, double size=12.0) :
        size(size), line_size(-1.0), italic(false), bold(false),
        font_name(new string(font_family)), font_family(new string(font_family)),
        color() {}

    XMLFont(string* font_name, double size, GfxRGB rgb);

    // Deep copy. Both strings must be duplicated: sharing the font_family
    // pointer would make the two destructors delete the same string.
    XMLFont(const XMLFont& other) :
        size(other.size), line_size(other.line_size), italic(other.italic),
        bold(other.bold), font_name(new string(*other.font_name)),
        font_family(new string(*other.font_family)), color(other.color) {}

    XMLColor get_color() { return this->color; }
    // Returns the internally owned string; the caller must not delete it.
    string* get_font_name() { return this->font_name; }
    double get_size() const { return this->size; }
    double get_line_size() { return this->line_size; }
    void set_line_size(double ls) { this->line_size = ls; }
    bool is_italic() const { return this->italic; }
    bool is_bold() const { return this->bold; }

    ~XMLFont() { delete this->font_name; delete this->font_family; }

    XMLFont& operator=(const XMLFont& other);
    bool operator==(const XMLFont &other) const;
    bool eq_upto_inline(const XMLFont &f) const;
    // Serialise as a <font/> element carrying the given id.
    string str(vector<XMLFont*>::size_type id) const;
};
// A vector of XMLFont pointers with helpers for registering fonts.
// add_font() returns the index of the font within the vector.
// NOTE(review): this derives publicly from std::vector, which has no
// virtual destructor - confirm a Fonts is never deleted through a
// vector<XMLFont*> pointer.
class Fonts : public vector<XMLFont*> {
public:
Fonts::size_type add_font(XMLFont *f);
Fonts::size_type add_font(string* font_name, double size, GfxRGB rgb);
~Fonts();
};
}

View File

@ -1,433 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#include <stdio.h>
#include <errno.h>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include <math.h>
#include <iostream>
#include <wand/MagickWand.h>
#include <zlib.h>
#include "images.h"
#include "utils.h"
#ifdef _WIN32
// Replacement for C99 round(), defined only for _WIN32 builds.
// Rounds to the nearest integer with exact halves going away from zero, so
// results match round() on the other platforms (e.g. 1.5 -> 2, -1.5 -> -2).
// The fraction is compared on |x| rather than computing floor(x + 0.5),
// which would round values just below 0.5 up to 1.
inline double round(double x) {
    double magnitude = floor(fabs(x));
    if (fabs(x) - magnitude >= 0.5) magnitude += 1.0;
    return (x < 0.0) ? -magnitude : magnitude;
}
#endif
#define xoutRound(x) ( static_cast<int>(round(x)) )
using namespace std;
using namespace calibre_reflow;
// Derive the image's position, size, rotation and flip flags from the
// graphics state.
calibre_reflow::ImageInfo::ImageInfo(GfxState *state) {
    // Transform the origin and the (1, 1) delta through the graphics state
    state->transform(0, 0, &xt, &yt);
    state->transformDelta(1, 1, &wt, &ht);

    // Normalise so that (x0, y0) is the lower coordinate and w0/h0 are
    // non-negative extents
    const bool width_positive = (wt > 0);
    x0 = width_positive ? xoutRound(xt) : xoutRound(xt + wt);
    w0 = width_positive ? xoutRound(wt) : xoutRound(-wt);

    const bool height_positive = (ht > 0);
    y0 = height_positive ? xoutRound(yt) : xoutRound(yt + ht);
    h0 = height_positive ? xoutRound(ht) : xoutRound(-ht);

    // xt/yt are reused here for the transformed (1, 0) delta: the image is
    // treated as rotated when that delta is mostly vertical
    state->transformDelta(1, 0, &xt, &yt);
    rotate = fabs(xt) < fabs(yt);

    // When rotated, width/height and the flip tests swap roles
    w1 = rotate ? h0 : w0;
    h1 = rotate ? w0 : h0;
    x_flip = rotate ? (ht < 0) : (wt < 0);
    y_flip = rotate ? (wt > 0) : (ht > 0);
}
void XMLImages::clear() {
vector<XMLImage*>::iterator it;
for (it = this->masks.begin(); it < this->masks.end(); it++)
delete *it;
for (it = this->images.begin(); it < this->images.end(); it++)
delete *it;
this->masks.clear();
this->images.clear();
}
// Intentionally empty: all arguments are ignored and nothing is recorded,
// so image masks are never added to this->masks by this method.
void XMLImages::add_mask(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, bool invert,
bool interpolate, bool inline_img) {
}
// Convert the pending error on wand into a ReflowException. The wand is
// destroyed and MagickWandTerminus() is called before throwing, so the
// caller must not touch the wand afterwards. This function never returns
// normally.
static void throw_magick_exception(MagickWand *wand) {
    ExceptionType severity;
    char *text = MagickGetException(wand, &severity);

    // Copy the message before the library buffer is released
    ostringstream message;
    message << text << endl;
    MagickRelinquishMemory(text);

    DestroyMagickWand(wand);
    MagickWandTerminus();
    throw ReflowException(message.str().c_str());
}
// Read the image at file_name, apply MagickFlipImage when y_flip is set and
// MagickFlopImage when x_flip is set, then write the result out.
// Any failing step throws a ReflowException via throw_magick_exception().
static void flip_image(string file_name, bool x_flip, bool y_flip) {
    MagickWandGenesis();
    MagickWand *wand = NewMagickWand();

    if (MagickReadImage(wand, file_name.c_str()) == MagickFalse)
        throw_magick_exception(wand);

    if (y_flip && MagickFlipImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    if (x_flip && MagickFlopImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    // NULL file name: presumably writes back to the file that was read -
    // confirm against the MagickWand documentation
    if (MagickWriteImage(wand, NULL) == MagickFalse)
        throw_magick_exception(wand);

    DestroyMagickWand(wand);
    MagickWandTerminus();
}
// Register a new image and write its pixel data to a file.
//
// A new XMLImage is created from state and appended to this->images. The
// output file name comes from file_name(img). Streams of kind strDCT are
// copied byte for byte as JPEG; everything else is rendered row by row
// through colorMap into an 8-bit RGB PNG. Finally the written file is
// flipped if img->info requests it.
//
// ref, interpolate, maskColors and inline_img are not used by this body.
// Throws ReflowException if the output file cannot be opened.
// NOTE(review): the result of malloc() is not checked, and neither `of` nor
// `row` is released if an exception propagates out of the PNG branch.
void XMLImages::add(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, GfxImageColorMap *colorMap,
bool interpolate, int *maskColors, bool inline_img) {
XMLImage *img = new XMLImage(state);
this->images.push_back(img);
img->width = width; img->height = height;
img->type = (str->getKind() == strDCT) ? jpeg : png;
// file_name() derives the name from img's position in this->images, so
// img must already have been pushed above
string file_name = this->file_name(img);
FILE *of = fopen(file_name.c_str(), "wb");
if (!of) throw ReflowException(strerror(errno));
if (img->type == jpeg) {
int c;
str = str->getNextStream();
str->reset();
// copy the stream
while ((c = str->getChar()) != EOF) fputc(c, of);
} else { //Render as PNG
Guchar *p;
GfxRGB rgb;
png_byte *row = (png_byte *) malloc(3 * width); // 3 bytes/pixel: RGB
png_bytep *row_pointer= &row;
PNGWriter *writer = new PNGWriter();
writer->init(of, width, height);
// Initialize the image stream
ImageStream *imgStr = new ImageStream(str, width,
colorMap->getNumPixelComps(), colorMap->getBits());
imgStr->reset();
// For each line...
for (unsigned int y = 0; y < height; y++) {
// Convert into a PNG row
p = imgStr->getLine();
for (unsigned int x = 0; x < width; x++) {
colorMap->getRGB(p, &rgb);
// Write the RGB pixels into the row
row[3*x]= colToByte(rgb.r);
row[3*x+1]= colToByte(rgb.g);
row[3*x+2]= colToByte(rgb.b);
p += colorMap->getNumPixelComps();
}
writer->writeRow(row_pointer);
}
writer->close();
delete writer;
free(row);
imgStr->close();
delete imgStr;
}
fclose(of);
img->written = true;
// The flip is applied to the file on disk, after it has been closed
if (img->info.x_flip || img->info.y_flip)
flip_image(file_name, img->info.x_flip, img->info.y_flip);
}
// Build the output file name for img: "image-N.ext" when img is found in
// this->images, otherwise "mask-N.ext" with N taken from its position in
// this->masks. N is the 1-based position and ext is "jpg" for jpeg images,
// "png" otherwise.
string XMLImages::file_name(const XMLImage *img) const {
    vector<XMLImage*>::const_iterator pos =
        find(this->images.begin(), this->images.end(), img);
    const bool mask = (pos == this->images.end());

    size_t idx = 0;
    if (mask) {
        pos = find(this->masks.begin(), this->masks.end(), img);
        idx = pos - this->masks.begin();
    } else {
        idx = pos - this->images.begin();
    }

    const char *prefix = mask ? "mask" : "image";
    const char *extension = (img->type == jpeg) ? "jpg" : "png";

    ostringstream name;
    name << prefix << "-" << idx+1 << '.';
    name << extension;
    return name.str();
}
// Return one newly allocated <img/> string per written mask, followed by
// one per written image. Entries whose `written` flag is not set are
// skipped. The strings are allocated with new and handed to the caller.
vector<string*> XMLImages::str() const {
    vector<string*> ans;

    for (size_t i = 0; i < this->masks.size(); ++i) {
        const XMLImage *mask = this->masks[i];
        if (!mask->written) continue;
        ans.push_back(new string(mask->str(i, true, this->file_name(mask))));
    }

    for (size_t i = 0; i < this->images.size(); ++i) {
        const XMLImage *image = this->images[i];
        if (!image->written) continue;
        ans.push_back(new string(image->str(i, false, this->file_name(image))));
    }

    return ans;
}
// Serialise this image as a self-closing <img/> element: type (mask or
// image), source file, intrinsic size (iwidth/iheight), rendered size
// (rwidth/rheight) and position (top/left). num is not used in the output.
string XMLImage::str(size_t num, bool mask, string file_name) const {
    const char *kind = mask ? "mask" : "image";

    ostringstream xml;
    xml << "<img type=\"" << kind << "\" ";
    xml << "src=\"" << file_name << "\" ";
    xml << "iwidth=\"" << this->width << "\" iheight=\"" << this->height << "\" ";
    xml << "rwidth=\"" << this->info.w1 << "\" rheight=\"" << this->info.h1 << "\" ";
    xml << setiosflags(ios::fixed) << setprecision(2);
    xml << "top=\"" << this->info.y0 << "\" left=\"" << this->info.x0 << "\"/>";
    return xml.str();
}
// Release the libpng write and info structures held by this writer.
PNGWriter::~PNGWriter()
{
    png_destroy_write_struct(&this->png_ptr, &this->info_ptr);
}
// Prepare this writer to emit an 8-bit RGB, non-interlaced PNG of the given
// dimensions to f, using Z_BEST_COMPRESSION, and write the PNG header.
// Throws ReflowException if the libpng structures cannot be created or if a
// setjmp() checkpoint returns non-zero.
// NOTE(review): each setjmp() below is armed after the libpng call named in
// its error message, so a failure in a call is reported by the previous
// checkpoint - confirm the intended order against the libpng manual.
void PNGWriter::init(FILE *f, int width, int height)
{
/* initialize stuff */
png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr)
throw ReflowException("png_create_write_struct failed");
info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr)
throw ReflowException("png_create_info_struct failed");
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("png_jmpbuf failed");
/* write header */
png_init_io(png_ptr, f);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("Error during writing header");
// Set up the type of PNG image and the compression level
png_set_compression_level(png_ptr, Z_BEST_COMPRESSION);
png_byte bit_depth = 8;
png_byte color_type = PNG_COLOR_TYPE_RGB;
png_byte interlace_type = PNG_INTERLACE_NONE;
png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, interlace_type, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("error during writing png info bytes");
}
// Write a whole image in one call from an array of row pointers.
// NOTE(review): the setjmp() checkpoint is armed after png_write_image()
// has already run, and this function returns right afterwards - confirm
// whether the checkpoint was meant to precede the write.
void PNGWriter::writePointers(png_bytep *rowPointers)
{
png_write_image(png_ptr, rowPointers);
/* write bytes */
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("Error during writing bytes");
}
// Write a single row; row points to the pointer to the row's pixel data.
// NOTE(review): the setjmp() checkpoint is armed after png_write_rows()
// has already run - confirm whether it was meant to precede the write.
void PNGWriter::writeRow(png_bytep *row)
{
// Write the row to the file
png_write_rows(png_ptr, row, 1);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("error during png row write");
}
// Finish the PNG stream by calling png_write_end(). The FILE passed to
// init() is not closed here.
// NOTE(review): the setjmp() checkpoint is armed after png_write_end() has
// already run - confirm whether it was meant to precede the call.
void PNGWriter::close()
{
/* end write */
png_write_end(png_ptr, info_ptr);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("Error during end of write");
}
// Write every row of a splash bitmap through writePointers().
// A table of row pointers is built over the bitmap's data, one entry per
// row and spaced getRowSize() bytes apart. The table lives in a vector so
// it is released even when writePointers() throws.
void PNGWriter::write_splash_bitmap(SplashBitmap *bitmap) {
    SplashColorPtr row = bitmap->getDataPtr();
    const int height = bitmap->getHeight();
    const int row_size = bitmap->getRowSize();

    // A non-positive height yields an empty table
    vector<png_bytep> row_pointers(height > 0 ? height : 0);
    for (int y = 0; y < height; ++y) {
        row_pointers[y] = row;
        row += row_size;
    }

    this->writePointers(row_pointers.empty() ? NULL : &row_pointers[0]);
}
// libpng write callback that appends `length` bytes from `data` to the
// vector<char> registered as the io pointer of png_ptr.
// Does nothing when png_ptr is NULL or length is zero.
void calibre_png_mem_write(png_structp png_ptr, png_bytep data, png_size_t length) {
    if (!png_ptr || length < 1) return;
    vector<char> *buf = static_cast< vector<char>* >(png_get_io_ptr(png_ptr));
    // A single range insert lets the vector apply its own growth policy,
    // instead of forcing a reallocation on every callback invocation.
    buf->insert(buf->end(), data, data + length);
}
// Flush callback paired with calibre_png_mem_write; intentionally a no-op.
void calibre_png_mem_flush(png_structp png_ptr) {}
// Prepare this writer to emit an 8-bit RGB, non-interlaced PNG of the given
// dimensions into buf instead of a file. Output is routed through
// calibre_png_mem_write / calibre_png_mem_flush, and the PNG header is
// written before returning.
// Throws ReflowException if the libpng structures cannot be created or if a
// setjmp() checkpoint returns non-zero.
// NOTE(review): each setjmp() below is armed after the libpng call named in
// its error message - confirm the intended order against the libpng manual.
void PNGMemWriter::init(vector<char> *buf, int width, int height) {
/* initialize stuff */
this->png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!this->png_ptr)
throw ReflowException("png_create_write_struct failed");
this->info_ptr = png_create_info_struct(png_ptr);
if (!this->info_ptr)
throw ReflowException("png_create_info_struct failed");
if (setjmp(png_jmpbuf(this->png_ptr)))
throw ReflowException("png_jmpbuf failed");
// buf becomes the io pointer that calibre_png_mem_write reads back
png_set_write_fn(this->png_ptr, static_cast<void *>(buf),
calibre_png_mem_write, calibre_png_mem_flush);
if (setjmp(png_jmpbuf(this->png_ptr)))
throw ReflowException("png_set_write failed");
// Set up the type of PNG image and the compression level
png_set_compression_level(this->png_ptr, Z_BEST_COMPRESSION);
png_byte bit_depth = 8;
png_byte color_type = PNG_COLOR_TYPE_RGB;
png_byte interlace_type = PNG_INTERLACE_NONE;
png_set_IHDR(this->png_ptr, this->info_ptr, width, height,
bit_depth, color_type, interlace_type,
PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("error during writing png info bytes");
}
/*
void calibre_jpeg_error_exit (j_common_ptr cinfo)
{
// cinfo->err really points to a my_error_mgr struct, so coerce pointer
calibre_jpeg_err_mgr *err = (calibre_jpeg_err_mgr *)(cinfo->err);
// Always display the message.
// We could postpone this until after returning, if we chose.
//(*cinfo->err->output_message) (cinfo);
// Return control to the setjmp point
longjmp(err->setjmp_buffer, 1);
}
JPEGWriter::JPEGWriter() {
this->cinfo.err = jpeg_std_error(&this->jerr.pub);
jpeg_create_compress(&this->cinfo);
this->jerr.pub.error_exit = calibre_jpeg_error_exit;
this->check();
this->outfile = NULL;
}
void JPEGWriter::init(int width, int height) {
cinfo.image_width = width;
cinfo.image_height = height;
cinfo.input_components = 3; // # of color components per pixel
cinfo.in_color_space = JCS_RGB;
jpeg_set_defaults(&this->cinfo);
this->check();
jpeg_start_compress(&this->cinfo, TRUE);
this->check();
}
void JPEGWriter::init_io(FILE *f) {
jpeg_stdio_dest(&this->cinfo, f);
this->check();
this->outfile = f;
}
void JPEGWriter::check() {
if (setjmp(jerr.setjmp_buffer)) this->raise();
}
void JPEGWriter::raise() {
char buffer[JMSG_LENGTH_MAX];
// Create the message
(*this->cinfo.err->format_message) ((jpeg_common_struct *)(&this->cinfo), buffer);
jpeg_destroy_compress(&this->cinfo);
throw ReflowException(buffer);
}
void JPEGWriter::write_image(JSAMPARRAY image_buffer, JDIMENSION num) {
size_t num_written = jpeg_write_scanlines(&this->cinfo, image_buffer, num);
this->check();
if (num_written != num) {
jpeg_destroy_compress(&this->cinfo);
throw ReflowException("Failed to write all JPEG scanlines.");
}
}
void JPEGWriter::write_splash_bitmap(SplashBitmap *bitmap) {
SplashColorPtr row = bitmap->getDataPtr();
int height = bitmap->getHeight();
int row_size = bitmap->getRowSize();
JSAMPARRAY row_pointers = new JSAMPLE*[height];
for (int y = 0; y < height; ++y) {
row_pointers[y] = row;
row += row_size;
}
this->write_image(row_pointers, height);
delete[] row_pointers;
jpeg_finish_compress(&this->cinfo);
this->check();
fclose(this->outfile);
}
JPEGWriter::~JPEGWriter() {
jpeg_destroy_compress(&this->cinfo);
}
*/

View File

@ -1,135 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#pragma once
#include <vector>
#include <GfxState.h>
#include <splash/SplashBitmap.h>
#include <png.h>
#include <jpeglib.h>
#include "utils.h"
using namespace std;
namespace calibre_reflow {
// Encodings an extracted image can be stored in.
enum ImageType { jpeg, png };
class PNGWriter
{
public:
PNGWriter() {}
~PNGWriter();
void init(FILE *f, int width, int height);
void writePointers(png_bytep *rowPointers);
void writeRow(png_bytep *row);
void write_splash_bitmap(SplashBitmap *bitmap);
void close();
protected:
png_structp png_ptr;
png_infop info_ptr;
};
// PNGWriter variant whose output goes to a byte vector instead of a file.
class PNGMemWriter : public PNGWriter
{
    public:
        // Set up encoding of a width x height image into *buf.
        void init(vector<char> *buf, int width, int height);
};
// Placement data for one image, filled in from the graphics state.
// All fields are private; XMLImage and XMLImages read them as friends.
class ImageInfo {
    friend class XMLImage;
    friend class XMLImages;

    public:
        ImageInfo(GfxState *state);

    private:
        int x0, y0;             // top left corner of image
        int w0, h0, w1, h1;     // size of image
        double xt, yt, wt, ht;
        bool rotate, x_flip, y_flip;
};
// One image found on a page, together with its placement information.
class XMLImage {
    friend class XMLImages;

    private:
        double x, y;
        unsigned int width, height;
        ImageType type;
        bool written;
        ImageInfo info;

    public:
        // Starts as an unwritten, zero-sized jpeg; placement comes from state.
        XMLImage(GfxState *state) :
            x(0.), y(0.),
            width(0), height(0),
            type(jpeg), written(false),
            info(state)
        {}

        ~XMLImage() {}

        // Serialized form of this image for the XML output.
        string str(size_t num, bool mask, string file_name) const;
};
// Collection of the images and image masks gathered for the current page.
class XMLImages {
    private:
        vector<XMLImage*> images;
        vector<XMLImage*> masks;

    public:
        // Releases everything through clear().
        ~XMLImages() {
            this->clear();
        }

        void add_mask(GfxState *state, Object *ref, Stream *str,
                      unsigned int width, unsigned int height, bool invert,
                      bool interpolate, bool inline_img);

        void add(GfxState *state, Object *ref, Stream *str,
                 unsigned int width, unsigned int height,
                 GfxImageColorMap *colorMap,
                 bool interpolate, int *maskColors, bool inline_img);

        string file_name(const XMLImage *img) const;

        vector<string*> str() const;

        void clear();
};
/*
struct calibre_jpeg_err_mgr {
struct jpeg_error_mgr pub; // "public" fields
jmp_buf setjmp_buffer; // for return to caller
};
class JPEGWriter {
private:
FILE *outfile;
protected:
struct jpeg_compress_struct cinfo;
struct calibre_jpeg_err_mgr jerr;
void raise();
void check();
public:
JPEGWriter();
~JPEGWriter();
void init_io(FILE *f);
void init(int width, int height);
void write_image(JSAMPARRAY image_buffer, JDIMENSION number_of_scanlines);
void write_splash_bitmap(SplashBitmap *bitmap);
};
*/
}

View File

@ -1,56 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#include "links.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
// Copy-assign the rectangle and a private copy of the destination string.
// Self-assignment is a no-op. A source without a destination (dest == NULL,
// as produced by the default constructor) yields a NULL destination instead
// of dereferencing a null pointer.
XMLLink& XMLLink::operator=(const XMLLink &x) {
    if (this == &x) return *this;

    // Build the copy before releasing the old value so that a failed
    // allocation leaves *this unchanged.
    string *copy = (x.dest != NULL) ? new string(*x.dest) : NULL;
    delete this->dest;
    this->dest = copy;

    this->x_min = x.x_min;
    this->y_min = x.y_min;
    this->x_max = x.x_max;
    this->y_max = x.y_max;
    return *this;
}
// True when the vertical midpoint of the box (xmin, ymin)-(xmax, ymax) lies
// inside this link's vertical extent (above y_min, not above y_max) and the
// box overlaps the link horizontally.
bool XMLLink::in_link(double xmin,double ymin,double xmax,double ymax) const {
    const double mid_y = (ymin + ymax)/2;
    const bool below_top = !(mid_y > this->y_max);
    const bool above_bottom = mid_y > this->y_min;
    const bool overlaps_x = (xmin < this->x_max) && (xmax > this->x_min);
    return below_top && above_bottom && overlaps_x;
}
// Build the opening anchor tag for this link. The href is the destination
// passed through encode_for_xml, or empty when there is no destination.
string XMLLink::get_link_start() {
    ostringstream tag;
    tag << "<a href=\"";
    if (this->dest != NULL) {
        tag << encode_for_xml(*this->dest);
    }
    tag << "\">";
    return tag.str();
}
// The container owns its links: delete every element, then empty the vector.
XMLLinks::~XMLLinks() {
    const XMLLinks::size_type count = this->size();
    for (XMLLinks::size_type idx = 0; idx < count; ++idx) {
        delete (*this)[idx];
    }
    this->clear();
}
bool XMLLinks::in_link(double xmin, double ymin, double xmax,
double ymax, XMLLinks::size_type &p) const {
for(XMLLinks::const_iterator i = this->begin(); i != this->end(); i++) {
if ( (*i)->in_link(xmin, ymin, xmax, ymax) ) {
p = (i - this->begin());
return true;
}
}
return false;
}

View File

@ -1,66 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#pragma once
#include <vector>
#include <sstream>
using namespace std;
namespace calibre_reflow {
// A rectangular hyperlink region on a page plus its destination string.
// The destination is owned by the link and may be NULL ("no destination").
class XMLLink {
    private:
        double x_min;
        double y_min;
        double x_max;
        double y_max;
        std::string* dest;   // owned; NULL when the link has no destination

    public:
        // Empty link: zero-sized rectangle, no destination. The coordinates
        // are zeroed so that copies never read indeterminate values.
        XMLLink() : x_min(0), y_min(0), x_max(0), y_max(0), dest(NULL) {}

        // Deep copy. A source without a destination stays without one
        // (previously this dereferenced the null pointer).
        XMLLink(const XMLLink& x) :
            x_min(x.x_min), y_min(x.y_min), x_max(x.x_max),
            y_max(x.y_max),
            dest(x.dest ? new std::string(*x.dest) : NULL) {}

        // Build a link from two opposite corners given in any order; the
        // stored rectangle is normalized so that min <= max on both axes.
        // Inside the initializers the names refer to the PARAMETERS, not
        // the members. A NULL dest gives a link without destination.
        XMLLink(double x_min, double y_min, double x_max,
                double y_max, const char *dest) :
            x_min((x_min < x_max) ? x_min : x_max),
            y_min((y_min < y_max) ? y_min : y_max),
            x_max((x_max > x_min) ? x_max : x_min),
            y_max((y_max > y_min) ? y_max : y_min),
            dest(dest ? new std::string(dest) : NULL) {}

        ~XMLLink() { delete this->dest; }

        // Destination string (still owned by the link); may be NULL.
        std::string* get_dest() { return this->dest; }

        double get_x1() const {return x_min;}
        double get_x2() const {return x_max;}
        double get_y1() const {return y_min;}
        double get_y2() const {return y_max;}

        XMLLink& operator=(const XMLLink &x);

        // Links are equal when both have a destination and the destination
        // strings match; the rectangles are not compared.
        bool operator==(const XMLLink &x) const {
            return (this->dest != NULL) && (x.dest != NULL) &&
                this->dest->compare(*x.dest) == 0;
        }

        bool in_link(double xmin, double ymin, double xmax, double ymax) const;

        std::string get_link_start();
};
// Vector of heap-allocated links; the destructor deletes the elements.
class XMLLinks : public vector<XMLLink*> {
    public:
        ~XMLLinks();

        // Look up the first link matching the given box; on success the
        // index of that link is stored in p.
        bool in_link(double xmin, double ymin,
                     double xmax, double ymax,
                     XMLLinks::size_type &p) const;
};
}

View File

@ -1,251 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#ifndef PDF2XML
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#endif
#include "reflow.h"
using namespace std;
using namespace calibre_reflow;
#ifndef PDF2XML
extern "C" {
// Python: reflow(pdf_data, first_page, last_page) -> int
// Runs Reflow::render over the given page range and returns its result.
// Any C++ exception is converted into a Python RuntimeError.
static PyObject *
pdfreflow_reflow(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    int first_page, last_page;

    if (!PyArg_ParseTuple(args, "s#ii", &pdfdata, &size, &first_page, &last_page)) {
        return NULL;
    }

    int num = 0;
    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        num = reflow.render(first_page, last_page);
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        return NULL;
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while rendering PDF");
        return NULL;
    }

    return Py_BuildValue("i", num);
}
// Python: get_metadata(pdf_data, cover) -> dict
// Returns a dict mapping the document's info entries (decoded as UTF-8 with
// replacement) to unicode strings. When `cover` is true the dict also gets
// a "cover" entry: the bytes from Reflow::render_first_page, or None when
// the document has no pages.
// C++ exceptions become RuntimeError. Every error path releases the result
// dict, the rendered cover buffer and all temporary references (the
// original leaked them).
static PyObject *
pdfreflow_get_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    PyObject *cover;
    map<string,string> info;

    // Parse first so a bad call cannot leak a freshly created dict.
    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &cover))
        return NULL;

    PyObject *ans = PyDict_New();
    if (!ans) return PyErr_NoMemory();

    Reflow *reflow = NULL;
    vector<char> *data = NULL;
    bool failed = false;

    try {
        reflow = new Reflow(pdfdata, size);
        info = reflow->get_info();
        if (PyObject_IsTrue(cover)) {
            if (reflow->numpages() > 0) {
                data = reflow->render_first_page();
                if (data && data->size() > 0) {
                    PyObject *d = PyBytes_FromStringAndSize(&((*data)[0]), data->size());
                    if (d == NULL) {
                        PyErr_NoMemory();
                        failed = true;
                    } else {
                        if (PyDict_SetItemString(ans, "cover", d) == -1) failed = true;
                        // The dict holds its own reference on success.
                        Py_DECREF(d);
                    }
                }
            } else {
                if (PyDict_SetItemString(ans, "cover", Py_None) == -1) failed = true;
            }
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        failed = true;
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while getting metadata from PDF");
        failed = true;
    }

    // Single cleanup point for the C++ side, reached on success and failure.
    delete data;
    delete reflow;
    if (failed) {
        Py_DECREF(ans);
        return NULL;
    }

    for (map<string,string>::const_iterator it = info.begin(); it != info.end(); ++it) {
        PyObject *key = PyUnicode_Decode(it->first.c_str(), it->first.size(), "UTF-8", "replace");
        PyObject *val = key ? PyUnicode_Decode(it->second.c_str(), it->second.size(), "UTF-8", "replace") : NULL;
        const bool ok = key && val && PyDict_SetItem(ans, key, val) != -1;
        // PyDict_SetItem does not steal references; drop ours either way.
        Py_XDECREF(key);
        Py_XDECREF(val);
        if (!ok) {
            Py_DECREF(ans);
            return NULL;
        }
    }
    return ans;
}
// Python: get_numpages(pdf_data) -> int
// Opens the document and returns Reflow::numpages(). C++ exceptions are
// converted into a Python RuntimeError.
static PyObject *
pdfreflow_get_numpages(PyObject *self, PyObject *args) {
    char *pdfdata;
    int num = 0;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size))
        return NULL;

    try {
        // Automatic storage: the document is released on every path
        // without the manual delete calls the original needed.
        Reflow reflow(pdfdata, size);
        num = reflow.numpages();
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        return NULL;
    } catch (...) {
        // The original message was copied from get_metadata.
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while getting number of pages from PDF");
        return NULL;
    }
    return Py_BuildValue("i", num);
}
// Python: set_metadata(pdf_data, info_dict) -> str
// Passes the unicode values found under "Title", "Author" and "Keywords" in
// info_dict (UTF-8 encoded) to Reflow::set_info and returns its result as a
// byte string. Missing or non-unicode entries are skipped.
// Raises ValueError for a non-dict argument or a locked document and
// RuntimeError for C++ exceptions.
static PyObject *
pdfreflow_set_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    PyObject *info;

    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info))
        return NULL;

    if (!PyDict_Check(info)) {
        PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary.");
        return NULL;
    }

    char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords";
    char *keys[3] = { Title, Author, Keywords };
    map<char *, char *> pinfo;
    // pinfo stores pointers INTO these byte strings, so they must stay
    // alive until set_info() has run; they are released at the end (the
    // original never released them).
    PyObject *utf8[3] = { NULL, NULL, NULL };

    for (int i = 0; i < 3; i++) {
        PyObject *val = PyDict_GetItemString(info, keys[i]);   // borrowed
        if (!val || !PyUnicode_Check(val)) continue;
        utf8[i] = PyUnicode_AsUTF8String(val);
        if (!utf8[i]) {
            // Still best-effort, but do not leave a pending exception
            // behind while going on to return a normal result.
            PyErr_Clear();
            continue;
        }
        pinfo[keys[i]] = PyString_AS_STRING(utf8[i]);
    }

    PyObject *ans = NULL;
    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        if (reflow.is_locked()) {
            PyErr_SetString(PyExc_ValueError, "Setting metadata not possible in encrypted PDFs");
        } else {
            string result = reflow.set_info(pinfo);
            ans = PyString_FromStringAndSize(result.c_str(), result.size());
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while setting metadata in PDF");
    }

    for (int i = 0; i < 3; i++) Py_XDECREF(utf8[i]);
    // ans is NULL with an exception set on every failure path.
    return ans;
}
// Method table of the pdfreflow module (name, C function, calling
// convention, docstring); terminated by the all-NULL sentinel entry.
static
PyMethodDef pdfreflow_methods[] = {
    {"reflow", pdfreflow_reflow, METH_VARARGS,
        "reflow(pdf_data, first_page, last_page)\n\n"
            "Reflow the specified PDF. Returns the number of pages in the PDF. If last_page is -1 renders to end of document."
    },
    {"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
        "get_metadata(pdf_data, cover)\n\n"
            "Get metadata and (optionally) cover from the specified PDF."
    },
    // The signature line used to read "get_metadata(info_dict)", which is
    // neither this function's name nor its argument list ("s#O").
    {"set_metadata", pdfreflow_set_metadata, METH_VARARGS,
        "set_metadata(pdf_data, info_dict)\n\n"
            "Set metadata in the specified PDF. Currently broken."
    },
    {"get_numpages", pdfreflow_get_numpages, METH_VARARGS,
        "get_numpages(pdf_data)\n\n"
            "Get number of pages in the PDF."
    },
    {NULL, NULL, 0, NULL}
};
// Module entry point: registers the "pdfreflow" module and its method table.
PyMODINIT_FUNC
initpdfreflow(void)
{
    PyObject *module = Py_InitModule3("pdfreflow", pdfreflow_methods,
                                      "Reflow a PDF file");
    if (module == NULL) {
        // Nothing else to set up; just return with the failure as-is.
        return;
    }
}
}
#else
// Stand-alone driver (PDF2XML build): prints the document's info entries
// and writes the rendered first page to cover.png.
// Exit status is 0 on success, 1 on usage, read, render or PDF errors.
int main(int argc, char **argv) {
    char *memblock;
    ifstream::pos_type size;
    int ret = 0;
    map<string,string> info;
    Reflow *reflow = NULL;

    if (argc != 2) {
        cerr << "Usage: " << argv[0] << " file.pdf" << endl;
        return 1;
    }

    // Opened at the end (ios::ate) so tellg() gives the file size.
    ifstream file (argv[1], ios::in|ios::binary|ios::ate);
    if (file.is_open()) {
        size = file.tellg();
        memblock = new char[size];
        file.seekg (0, ios::beg);
        file.read (memblock, size);
        file.close();
    } else {
        cerr << "Unable to read from: " << argv[1] << endl;
        return 1;
    }

    try {
        reflow = new Reflow(memblock, size);
        info = reflow->get_info();
        for (map<string,string>::const_iterator it = info.begin() ; it != info.end(); it++ ) {
            cout << (*it).first << " : " << (*it).second << endl;
        }
        //reflow->render();
        vector<char> *data = reflow->render_first_page();
        // Guard against a missing or empty buffer before taking &(*data)[0]
        // (the Python binding performs the same check).
        if (data && data->size() > 0) {
            ofstream cover("cover.png", ios::binary);
            cover.write(&((*data)[0]), data->size());
            cover.close();
        } else {
            cerr << "Failed to render the first page" << endl;
            ret = 1;
        }
        delete data;
    } catch(exception &e) {
        cerr << e.what() << endl;
        ret = 1;
    }

    delete reflow;
    delete[] memblock;
    return ret;
}
#endif

View File

@ -1,976 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
*/
#include <Outline.h>
#include <PDFDocEncoding.h>
#include <poppler/ErrorCodes.h>
#include <goo/GooList.h>
#include <SplashOutputDev.h>
#include <splash/SplashBitmap.h>
#include <splash/SplashErrorCodes.h>
#include "reflow.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
// Number of entries in info_keys below.
static const size_t num_info_keys = 8;
// Names of the document-information entries this file knows about.
// NOTE(review): presumably iterated by Reflow::get_info (its body is not
// shown here) - confirm.
static const char* info_keys[num_info_keys] = {
"Title", "Subject", "Keywords", "Author", "Creator", "Producer",
"CreationDate", "ModDate"
};
// Text encoding name handed to globalParams->setTextEncoding() in
// Reflow::render. Kept as a writable char array rather than a literal.
static char encoding[10] = "UTF-8";
// NOTE(review): no use of `yes` is visible in this part of the file; it may
// be used further down or be a leftover - confirm before removing.
static char yes[10] = "yes";
//------------------------------------------------------------------------
// XMLString
//------------------------------------------------------------------------
// Start a new, empty text string at the current text position.
// The vertical extent is derived from the current font's ascent/descent
// (clamped to 1.05 / -0.4) scaled by current_font_size, and the font is
// registered with `fonts`. Without a current font fixed factors are used.
XMLString::XMLString(GfxState *state, GooString *s, double current_font_size,
        Fonts *fonts) :
    text(new vector<Unicode>(0)), x_right(new vector<double>(0)),
    yx_next(NULL), xy_next(NULL), fonts(fonts), font_idx(0), xml_text(NULL),
    link(NULL), x_min(0), x_max(0), y_min(0), y_max(0), col(0), dir(text_dir_unknown)
{
    double dev_x = 0, dev_y = 0;
    state->transform(state->getCurX(), state->getCurY(), &dev_x, &dev_y);

    GfxFont *font = state->getFont();
    if (!font) {
        // this means that the PDF file draws text without a current font,
        // which should never happen
        this->y_min = dev_y - 0.95 * current_font_size;
        this->y_max = dev_y + 0.35 * current_font_size;
    } else {
        double ascent = font->getAscent();
        double descent = font->getDescent();
        // Clamp implausible font metrics.
        if (ascent > 1.05) {
            ascent = 1.05;
        }
        if (descent < -0.4) {
            descent = -0.4;
        }
        this->y_min = dev_y - ascent * current_font_size;
        this->y_max = dev_y - descent * current_font_size;

        GfxRGB rgb;
        state->getFillRGB(&rgb);
        GooString *name = state->getFont()->getName();
        if (name) {
            this->font_idx = this->fonts->add_font(
                    new string(name->getCString()), current_font_size-1, rgb);
        } else {
            this->font_idx = this->fonts->add_font(NULL, current_font_size-1, rgb);
        }
    }

    if (this->y_min == this->y_max) {
        // this is a sanity check for a case that shouldn't happen -- but
        // if it does happen, we want to avoid dividing by zero later
        this->y_min = dev_y;
        this->y_max = dev_y + 1;
    }
}
// Append one character whose left edge is at x and whose advance is dx.
// The first character fixes x_min; every character moves x_max to its right
// edge, which is also recorded in x_right. An undecided text direction
// becomes left-to-right. state, y and dy are not used.
void XMLString::add_char(GfxState *state, double x, double y,
        double dx, double dy, Unicode u) {
    if (dir == text_dir_unknown) {
        // direction detection via UnicodeMap::getDirection(u) is disabled
        dir = text_dir_left_right;
    }

    // Grow both vectors in steps of 16 once the text vector is full.
    const bool full = (this->text->size() == this->text->capacity());
    if (full) {
        this->text->reserve(this->text->size() + 16);
        this->x_right->reserve(this->x_right->size() + 16);
    }

    this->text->push_back(u);
    if (this->length() == 1) {
        this->x_min = x;
    }

    this->x_max = x + dx;
    this->x_right->push_back(this->x_max);
}
// Finish the string: right-to-left text with more than one character is
// reversed in place; anything else is left as it is.
void XMLString::end_string()
{
    if (this->dir != text_dir_right_left) return;
    if (!(this->length() > 1)) return;
    reverse(this->text->begin(), this->text->end());
}
// Convert `num` Unicode code points starting at `u` to a string in the
// globally configured text encoding, replacing '&', '<' and '>' by their
// XML entities. Characters the encoding map cannot represent are dropped.
// Throws ReflowException when no text encoding map is available.
static string encode_unicode_chars(const Unicode *u, size_t num) {
    ostringstream oss;
    UnicodeMap *uMap;
    char buf[10];
    int n;

    if (!(uMap = globalParams->getTextEncoding())) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    for (size_t i = 0; i < num; i++) {
        switch (u[i]) {
            case '&': oss << "&amp;"; break;
            case '<': oss << "&lt;"; break;
            case '>': oss << "&gt;"; break;
            default:
                {
                    // convert unicode to string. One byte is held back for
                    // the terminator: with the full sizeof(buf) the map was
                    // allowed to return n == sizeof(buf), which made the
                    // buf[n] = 0 below write past the end of buf.
                    if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf) - 1)) > 0) {
                        buf[n] = 0;
                        oss << buf;
                    }
                }
        }
    }

    uMap->decRefCnt();
    return oss.str();
}
void XMLString::encode() {
delete this->xml_text;
this->xml_text = new string(encode_unicode_chars(&((*this->text)[0]), this->text->size()));
}
// Serialize the string as a <text> element carrying the font index and the
// bounding box (top, left, width, height; two decimals) around xml_text.
// encode() must have been called, since xml_text is dereferenced here.
string XMLString::str() const {
    ostringstream element;

    element << "<text font=\"" << this->font_idx << "\" ";

    element << setiosflags(ios::fixed) << setprecision(2);
    element << "top=\"" << this->y_min << "\" left=\"" << this->x_min;
    element << "\" width=\"" << this->x_max - this->x_min << "\" ";
    element << "height=\"" << this->y_max - this->y_min << "\">";

    element << *this->xml_text << "</text>";
    return element.str();
}
// Frees the three heap objects this string owns. The link pointer and the
// list neighbours are not touched.
XMLString::~XMLString() {
    delete this->text;
    delete this->x_right;
    delete this->xml_text;
}
//------------------------------------------------------------------------
// XMLPage
//------------------------------------------------------------------------
// Begin a page: remember the output stream and font table, create an empty
// link list and write the opening <page> tag with number and page size.
// Throws ReflowException(strerror(errno)) if the stream is in a failed state.
XMLPage::XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts) :
    current_string(NULL), num(num), output(output), current_font_size(0.0),
    yx_strings(NULL), xy_strings(NULL), yx_cur1(NULL), yx_cur2(NULL),
    fonts(fonts), links(new XMLLinks())
{
    ofstream &out = *this->output;

    out << setiosflags(ios::fixed) << setprecision(2)
        << "\t\t<page number=\"" << this->num
        << "\" width=\"" << state->getPageWidth()
        << "\" height=\"" << state->getPageHeight()
        << "\">" << endl;

    if (!out) {
        throw ReflowException(strerror(errno));
    }
}
// End a page: free all strings in the y-major list and the link list, then
// write the closing </page> tag.
// Throws ReflowException(strerror(errno)) if the stream has failed. As
// before the exception leaves the destructor (callers rely on it reaching
// them), which is only safe while destructors are not implicitly noexcept.
XMLPage::~XMLPage() {
    // Release memory first so a failed write below cannot leak it. Fetch
    // the successor BEFORE deleting the node: the original loop read
    // tmp->yx_next from the node it had just deleted.
    XMLString *node = this->yx_strings;
    while (node) {
        XMLString *next = node->yx_next;
        delete node;
        node = next;
    }
    this->yx_strings = NULL;

    delete this->links;
    this->links = NULL;

    (*this->output) << "\t\t</page>" << endl;
    if (!(*this->output)) throw ReflowException(strerror(errno));
}
// Refresh current_font_size from the graphics state. For Type 3 fonts the
// size is additionally scaled by a guess of the font's coordinate system.
void XMLPage::update_font(GfxState *state) {
    current_font_size = state->getTransformedFontSize();

    GfxFont *font = state->getFont();
    if (!font || font->getType() != fontType3) {
        return;
    }

    // This is a hack which makes it possible to deal with some Type 3
    // fonts. The problem is that it's impossible to know what the
    // base coordinate system used in the font is without actually
    // rendering the font. This code tries to guess by looking at the
    // width of the character 'm' (which breaks if the font is a
    // subset that doesn't contain 'm').
    Gfx8BitFont *font8 = (Gfx8BitFont *)font;
    int code = 0;
    while (code < 256) {
        char *name = font8->getCharName(code);
        if (name && name[0] == 'm' && name[1] == '\0') break;
        ++code;
    }

    if (code < 256) {
        const double w = font8->getWidth(code);
        if (w != 0) {
            // 600 is a generic average 'm' width -- yes, this is a hack
            current_font_size *= w / 0.6;
        }
    }

    double *fm = font->getFontMatrix();
    if (fm[0] != 0) {
        current_font_size *= fabs(fm[3] / fm[0]);
    }
}
// Add the uLen Unicode values of one drawn glyph to the current string.
// Hidden text (render mode bits == 3) is ignored. The glyph advance, minus
// the character spacing, is divided evenly between the Unicode values.
void XMLPage::draw_char(GfxState *state, double x, double y,
        double dx, double dy,
        double originX, double originY,
        CharCode code, int nBytes, Unicode *u, int uLen) {
    if ((state->getRender() & 3) == 3) return; //Hidden text

    double dev_x, dev_y;
    state->transform(x, y, &dev_x, &dev_y);

    // check that new character is in the same direction as current string
    // and is not too far away from it before adding
    if (this->current_string->character_does_not_belong_to_string(state, dev_x)) {
        this->end_string();
        this->begin_string(state, NULL);
    }

    // Take the character spacing out of the advance.
    double space_dx, space_dy;
    state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
            0, &space_dx, &space_dy);
    dx -= space_dx;
    dy -= space_dy;

    double step_w, step_h;
    state->transformDelta(dx, dy, &step_w, &step_h);
    if (uLen != 0) {
        step_w /= uLen;
        step_h /= uLen;
    }

    for (int k = 0; k < uLen; ++k) {
        this->current_string->add_char(state, dev_x + k*step_w, dev_y + k*step_h,
                step_w, step_h, u[k]);
    }
}
// Finish the string being built and insert it into the page's singly linked
// y-major list (yx_strings / yx_next). yx_cur1 and yx_cur2 cache the
// neighbours of the previous insertion so that the next string can usually
// be placed without scanning the list from its head.
// On return current_string is NULL.
void XMLPage::end_string() {
XMLString *p1 = NULL, *p2 = NULL;
double h, y1, y2;
// throw away zero-length strings -- they don't have valid xMin/xMax
// values, and they're useless anyway
if (this->current_string->length() == 0) {
delete this->current_string;
this->current_string = NULL;
return;
}
this->current_string->end_string();
// insert string in y-major list
// y1 and y2 are probe heights at 50% and 80% of the string's height,
// measured from y_min; they are used for the ordering comparisons below.
h = this->current_string->height();
y1 = this->current_string->y_min + 0.5 * h;
y2 = this->current_string->y_min + 0.8 * h;
// The first branch is disabled (constant gFalse): it would append after
// yx_cur1 without any comparison.
if (gFalse) { //rawOrder
p1 = this->yx_cur1;
p2 = NULL;
} else if (
// Fast path: the new string sorts at or after the cached predecessor
// yx_cur1 (or there is none) ...
(!this->yx_cur1 ||
(y1 >= this->yx_cur1->y_min &&
(y2 >= this->yx_cur1->y_max ||
this->current_string->x_max >= this->yx_cur1->x_min))) &&
// ... and before the cached successor yx_cur2 (or there is none).
(!this->yx_cur2 ||
(y1 < this->yx_cur2->y_min ||
(y2 < this->yx_cur2->y_max &&
this->current_string->x_max < this->yx_cur2->x_min)))
) {
p1 = this->yx_cur1;
p2 = this->yx_cur2;
} else {
// Slow path: walk from the head until the first element the new string
// has to precede; p1 trails one node behind p2.
for (p1 = NULL, p2 = this->yx_strings; p2; p1 = p2, p2 = p2->yx_next) {
if (y1 < p2->y_min || (y2 < p2->y_max && this->current_string->x_max < p2->x_min))
break;
}
this->yx_cur2 = p2;
}
// The new string becomes the cached predecessor for the next insertion.
this->yx_cur1 = this->current_string;
// Splice between p1 and p2; a NULL p1 means the string is the new head.
if (p1)
p1->yx_next = this->current_string;
else
this->yx_strings = this->current_string;
this->current_string->yx_next = p2;
this->current_string = NULL;
}
void XMLPage::end() {
XMLLinks::size_type link_index = 0;
Fonts::size_type pos = 0;
XMLFont* h;
for (XMLString *tmp = this->yx_strings; tmp; tmp = tmp->yx_next) {
pos = tmp->font_idx;
h = this->fonts->at(pos);
tmp->encode();
if (this->links->in_link(
tmp->x_min, tmp->y_min, tmp->x_max, tmp->y_max, link_index)) {
tmp->link = links->at(link_index);
}
}
this->coalesce();
for (XMLString *tmp = yx_strings; tmp; tmp=tmp->yx_next) {
if (tmp->xml_text && tmp->xml_text->size() > 0) {
(*this->output) << "\t\t\t" << tmp->str() << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
}
}
}
// Return a pointer to the LAST occurrence of ss inside s, or NULL when ss
// does not occur. Overlapping occurrences count ("aa" in "aaa" -> s + 1).
// An empty ss matches at the end of s; the original loop kept stepping past
// the terminating NUL in that case.
static const char *strrstr( const char *s, const char *ss )
{
    if( *ss == '\0' )
        return s + strlen( s );

    const char *last = NULL;
    for( const char *hit = strstr( s, ss ); hit != NULL; hit = strstr( hit + 1, ss ) ){
        last = hit;
    }
    return last;
}
// Position of the last occurrence of tag in text, or -1 if it is absent.
static long last_tag_pos( const std::string &text, const char *tag )
{
    const std::string::size_type pos = text.rfind( tag );
    return ( pos == std::string::npos ) ? -1L : static_cast<long>( pos );
}

// Append the closing tags for the requested inline elements to *xml_text,
// closing the most recently opened element first so the markup nests.
//
// finish_a / finish_italic / finish_bold select which of </a>, </em> and
// </strong> to emit. A flag is cleared when its tag is emitted early because
// that element was opened last; flags whose tag is emitted in the fixed
// trailing order (</strong>, </em>, </a>) are left unchanged.
//
// Opening-tag positions are compared as string offsets (-1 = not looked up
// or not present). The original compared raw pointers, including NULL,
// with '>', which has no specified result.
static void close_tags( std::string *xml_text, bool &finish_a, bool &finish_italic, bool &finish_bold )
{
    // A position only matters when at least two kinds of tag are closed.
    const long last_italic = finish_italic && ( finish_bold || finish_a )
        ? last_tag_pos( *xml_text, "<em>" ) : -1L;
    const long last_bold = finish_bold && ( finish_italic || finish_a )
        ? last_tag_pos( *xml_text, "<strong>" ) : -1L;
    const long last_a = finish_a && ( finish_italic || finish_bold )
        ? last_tag_pos( *xml_text, "<a " ) : -1L;

    if( finish_a && ( finish_italic || finish_bold ) &&
            last_a > ( last_italic > last_bold ? last_italic : last_bold ) ) {
        xml_text->append("</a>");
        finish_a = false;
    }
    if( finish_italic && finish_bold && last_italic > last_bold ){
        xml_text->append("</em>");
        finish_italic = false;
    }
    if( finish_bold )
        xml_text->append("</strong>");
    if( finish_italic )
        xml_text->append("</em>");
    if( finish_a )
        xml_text->append("</a>");
}
// Post-process the page's y-major string list:
//   1. drop strings that duplicate a nearby string (fake boldface, drop
//      shadows);
//   2. merge horizontally adjacent strings whose fonts agree up to inline
//      style and whose direction matches, inserting <em>/<strong>/<a> tags
//      into xml_text wherever the style or link changes.
// Requires encode() to have been called on every string (xml_text is used).
void XMLPage::coalesce() {
    XMLString *str1, *str2, *str3;
    XMLFont *hfont1, *hfont2;
    double space, hor_space, size, x_limit;
    bool add_space, found;
    int n, i;
    double cur_x, cur_y;

    str1 = this->yx_strings;
    if( !str1 ) return;

    //----- discard duplicated text (fake boldface, drop shadows)
    while (str1)
    {
        size = str1->y_max - str1->y_min;
        x_limit = str1->x_min + size * 0.2;
        found = false;
        for (str2 = str1, str3 = str1->yx_next;
                str3 && str3->x_min < x_limit;
                str2 = str3, str3 = str2->yx_next)
        {
            // Compare the CONTENTS of the two code point vectors. `text`
            // is a pointer to a vector, so the original memcmp() compared
            // the bytes of the vector objects themselves (and read past
            // them) rather than the characters.
            if (str3->length() == str1->length() &&
                    *str3->text == *str1->text &&
                    fabs(str3->y_min - str1->y_min) < size * 0.2 &&
                    fabs(str3->y_max - str1->y_max) < size * 0.2 &&
                    fabs(str3->x_max - str1->x_max) < size * 0.2)
            {
                found = true;
                break;
            }
        }
        if (found)
        {
            // Unlink the duplicate; str1 stays put so it is checked again
            // against the strings that follow.
            str2->xy_next = str3->xy_next;
            str2->yx_next = str3->yx_next;
            delete str3;
        }
        else
        {
            str1 = str1->yx_next;
        }
    }

    //----- open the tags of the first string
    str1 = yx_strings;
    hfont1 = this->fonts->at(str1->font_idx);
    if( hfont1->is_bold() )
        str1->xml_text->insert(0, "<strong>");
    if( hfont1->is_italic() )
        str1->xml_text->insert(0, "<em>");
    if (str1->get_link())
        str1->xml_text->insert(0, str1->get_link()->get_link_start());

    // Top-left corner of the run currently being built; restored into str1
    // when the run is closed.
    cur_x = str1->x_min; cur_y = str1->y_min;

    while (str1 && (str2 = str1->yx_next)) {
        hfont2 = this->fonts->at(str2->font_idx);
        space = str1->y_max - str1->y_min;
        hor_space = str2->x_min - str1->x_max;

        if (
                (
                 (
                  (str2->y_min < str1->y_max)
                  &&
                  (hor_space > -0.5 * space && hor_space < space)
                 )
                ) &&
                (hfont1->eq_upto_inline(*hfont2)) &&
                str1->dir == str2->dir // text direction the same
           )
        {
            // Merge str2 into str1.
            n = str1->length() + str2->length();
            if ((add_space = hor_space > 0.1 * space)) {
                ++n;
            }
            // Round the capacity up to a multiple of 16.
            str1->text->reserve((n + 15) & ~15);
            str1->x_right->reserve((n + 15) & ~15);
            if (add_space) {
                str1->text->push_back(0x20);
                str1->xml_text->push_back(' ');
                str1->x_right->push_back(str2->x_min);
            }
            for (i = 0; i < str2->length(); i++) {
                str1->text->push_back(str2->text->at(i));
                str1->x_right->push_back(str2->x_right->at(i));
            }

            /* fix <i>, <b> if str1 and str2 differ and handle switch of links */
            XMLLink *hlink1 = str1->get_link();
            XMLLink *hlink2 = str2->get_link();
            bool switch_links = !hlink1 || !hlink2 || !((*hlink1) == (*hlink2));
            bool finish_a = switch_links && hlink1 != NULL;
            bool finish_italic = hfont1->is_italic() && ( !hfont2->is_italic() || finish_a );
            bool finish_bold = hfont1->is_bold() &&
                ( !hfont2->is_bold() || finish_a || finish_italic );
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
            if( switch_links && hlink2 != NULL ) {
                string ls = hlink2->get_link_start();
                str1->xml_text->append(ls);
            }
            if( ( !hfont1->is_italic() || finish_italic ) && hfont2->is_italic() )
                str1->xml_text->append("<em>");
            if( ( !hfont1->is_bold() || finish_bold ) && hfont2->is_bold() )
                str1->xml_text->append("<strong>");

            str1->xml_text->append(*str2->xml_text);
            // str1 now contains href for link of str2 (if it is defined)
            str1->link = str2->link;
            hfont1 = hfont2;
            if (str2->x_max > str1->x_max) {
                str1->x_max = str2->x_max;
            }
            if (str2->y_max > str1->y_max) {
                str1->y_max = str2->y_max;
            }
            str1->yx_next = str2->yx_next;
            delete str2;
        } else { // keep strings separate
            bool finish_a = str1->get_link() != NULL;
            bool finish_bold = hfont1->is_bold();
            bool finish_italic = hfont1->is_italic();
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );

            str1->x_min = cur_x; str1->y_min = cur_y;
            str1 = str2;
            cur_x = str1->x_min; cur_y = str1->y_min;
            hfont1 = hfont2;
            if ( hfont1->is_bold() )
                str1->xml_text->insert(0, "<strong>");
            if( hfont1->is_italic() )
                str1->xml_text->insert(0, "<em>");
            if( str1->get_link() != NULL ) {
                str1->xml_text->insert(0, str1->get_link()->get_link_start());
            }
        }
    }

    //----- close the tags of the last run
    str1->x_min = cur_x; str1->y_min = cur_y;
    bool finish_bold = hfont1->is_bold();
    bool finish_italic = hfont1->is_italic();
    bool finish_a = str1->get_link() != NULL;
    close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
}
//------------------------------------------------------------------------
// XMLOutputDev
//------------------------------------------------------------------------
// Create (truncating) index.xml in the working directory and write the
// opening <pdfreflow> and <pages> tags.
// Throws ReflowException(strerror(errno)) if the file cannot be opened or
// written.
XMLOutputDev::XMLOutputDev(PDFDoc *doc) :
    current_page(NULL), output(new ofstream("index.xml", ios::trunc)),
    fonts(new Fonts()), catalog(NULL), images(new XMLImages()), doc(doc)
{
    ofstream &out = *this->output;

    if (!out) {
        throw ReflowException(strerror(errno));
    }

    out << "<pdfreflow>" << endl;
    out << "\t<pages>" << endl;
    if (!out) {
        throw ReflowException(strerror(errno));
    }
}
// Close the document: end the <pages> section, write the <fonts> table with
// one line per registered font, close <pdfreflow>, then close the file and
// free the owned stream, font table and image list.
// Throws ReflowException(strerror(errno)) after any failed write; in that
// case the owned objects are not freed.
XMLOutputDev::~XMLOutputDev() {
    ofstream &out = *this->output;

    out << "\t</pages>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "\t<fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    const Fonts::const_iterator first = this->fonts->begin();
    for (Fonts::const_iterator font = first; font < this->fonts->end(); font++) {
        // Every font is serialized together with its index in the table.
        out << "\t\t" << (*font)->str(font - this->fonts->begin()) << endl;
        if (!out) throw ReflowException(strerror(errno));
    }

    out << "\t</fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "</pdfreflow>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    this->output->close();

    delete this->output;
    delete this->fonts;
    delete this->images;
}
// Turn a link action into the destination string used in the XML output:
//   GoTo   -> "#<page>:l=<left>t=<top>"
//   GoToR  -> "<file>" followed by "#<page>" when a destination exists
//   URI    -> the URI itself
//   Launch -> the file name
// All other actions, and actions lacking the needed data, give "".
// Malformed PDFs can yield URI and Launch actions without a URI or file
// name; those are skipped instead of dereferencing a null pointer.
static string get_link_dest(LinkAction *link, PDFDoc *doc) {
    unsigned int page = 1;
    ostringstream oss;

    switch(link->getKind())
    {
        case actionGoTo:
            {
                LinkGoTo *ha = (LinkGoTo *)link;
                LinkDest *dest = NULL;
                // Both branches leave us owning dest: a copy of the direct
                // destination or the result of the named lookup.
                if (ha->getDest() != NULL)
                    dest = ha->getDest()->copy();
                else if (ha->getNamedDest() != NULL) {
                    dest = doc->findDest(ha->getNamedDest());
                }
                if (dest) {
                    if (dest->isPageRef()) {
                        Ref pageref = dest->getPageRef();
                        page = doc->findPage(pageref.num, pageref.gen);
                    }
                    else {
                        page = dest->getPageNum();
                    }

                    oss << "#" << page
                        << setiosflags(ios::fixed) << setprecision(2)
                        << ":l=" << dest->getLeft()
                        << "t=" << dest->getTop();
                    //<< "r=" << dest->getRight()
                    //<< "b=" << dest->getBottom();
                    delete dest;
                }
                break;
            }

        case actionGoToR:
            {
                LinkGoToR *ha = (LinkGoToR *) link;
                LinkDest *dest = NULL;
                bool has_file = false;
                if (ha->getFileName()) {
                    oss << ha->getFileName()->getCString();
                    has_file = true;
                }
                if (ha->getDest() != NULL) dest=ha->getDest()->copy();
                if (dest && has_file) {
                    if (!(dest->isPageRef())) page = dest->getPageNum();
                    delete dest;
                    oss << '#' << page;
                }
                break;
            }

        case actionURI:
            {
                LinkURI *ha=(LinkURI *) link;
                if (ha->getURI() != NULL)
                    oss << ha->getURI()->getCString();
                break;
            }

        case actionLaunch:
            {
                LinkLaunch *ha = (LinkLaunch *) link;
                if (ha->getFileName() != NULL)
                    oss << ha->getFileName()->getCString();
                break;
            }

        case actionNamed: break;
        case actionMovie: break;
        case actionRendition: break;
        case actionSound: break;
        case actionJavaScript: break;
        case actionUnknown: break;
        default: break;
    }
    return oss.str();
}
// Convert one link annotation into an XMLLink on the current page.
// Links without an action, or whose action yields no destination string,
// are ignored.
void XMLOutputDev::process_link(AnnotLink* link){
    // Annotation rectangle in user space ...
    double ux1, uy1, ux2, uy2;
    link->getRect(&ux1, &uy1, &ux2, &uy2);

    // ... converted to device coordinates.
    int x1, y1, x2, y2;
    cvtUserToDev(ux1, uy1, &x1, &y1);
    cvtUserToDev(ux2, uy2, &x2, &y2);

    LinkAction *action = link->getAction();
    if (action == NULL) return;

    const string dest = get_link_dest(action, this->doc);
    if (!(dest.length() > 0)) return;

    // The y values are passed swapped (y2 then y1); XMLLink orders the
    // corners itself.
    XMLLink *xml_link = new XMLLink((double)x1, (double)y2, (double)x2, (double)y1,
            dest.c_str());
    this->current_page->add_link(xml_link);
}
void XMLOutputDev::endPage() {
#ifdef POPPLER_PRE_20
Links *slinks = catalog->getPage(current_page->number())->getLinks(catalog);
#else
Links *slinks = catalog->getPage(current_page->number())->getLinks();
#endif
for (int i = 0; i < slinks->getNumLinks(); i++)
{
this->process_link(slinks->getLink(i));
}
delete slinks;
this->current_page->end();
vector<string*> images = this->images->str();
for (vector<string*>::iterator it = images.begin(); it < images.end(); it++) {
(*this->output) << "\t\t\t" << *(*it) << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
delete *it;
}
this->images->clear();
delete this->current_page;
this->current_page = NULL;
}
// Image masks are not collected: the call is forwarded to the base class
// and a notice is printed on stderr.
void XMLOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
        int width, int height, GBool invert,
        GBool interpolate, GBool inlineImg) {
    OutputDev::drawImageMask(state, ref, str,
            width, height,
            invert, interpolate, inlineImg);
    // this->images->add_mask() is intentionally not called here.
    cerr << "mask requested" << endl;
}
// Record an image drawn on the current page in the page's image list.
void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
        int width, int height, GfxImageColorMap *colorMap,
        GBool interpolate, int *maskColors, GBool inlineImg) {
    // XMLImages::add takes unsigned dimensions.
    const unsigned int img_width = static_cast<unsigned int>(width);
    const unsigned int img_height = static_cast<unsigned int>(height);

    this->images->add(state, ref, str, img_width, img_height,
            colorMap, interpolate, maskColors, inlineImg);
}
// Open a PDF held in memory (sz bytes at pdfdata). The buffer is wrapped in
// a MemStream rather than copied here. globalParams is created on first use.
// Throws ReflowException when the document cannot be opened; the message
// distinguishes password protected files from other error codes.
Reflow::Reflow(char *pdfdata, size_t sz) :
    pdfdata(pdfdata), current_font_size(-1), doc(NULL), obj()
{
    this->obj.initNull();

    if (globalParams == NULL) {
        globalParams = new GlobalParams();
        if (!globalParams)
            throw ReflowException("Failed to allocate Globalparams");
    }

    MemStream *stream = new MemStream(pdfdata, 0, sz, &this->obj);
    this->doc = new PDFDoc(stream, NULL, NULL);
    if (this->doc->isOk()) {
        return;
    }

    const int err = this->doc->getErrorCode();
    ostringstream msg;
    if (err == errEncrypted) {
        msg << "PDF is password protected.";
    } else {
        msg << "Failed to open PDF file";
        msg << " with error code: " << err;
    }

    delete this->doc;
    this->doc = NULL;
    throw ReflowException(msg.str().c_str());
}
// Render pages first_page..last_page through an XMLOutputDev and return the
// total number of pages in the document.
// last_page outside 1..page count means "to the end"; first_page is clamped
// into 1..last_page. The outline is dumped only when the whole document was
// rendered. A set copy-protection flag only produces a warning on stdout.
int
Reflow::render(int first_page, int last_page) {
    if (!this->doc->okToCopy()) {
        cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
    }

    globalParams->setTextEncoding(encoding);

    const int doc_pages = doc->getNumPages();
    if (last_page < 1 || last_page > doc_pages) {
        last_page = doc_pages;
    }
    if (first_page < 1) {
        first_page = 1;
    }
    if (first_page > last_page) {
        first_page = last_page;
    }

    XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
    doc->displayPages(xml_out, first_page, last_page,
            96,    //hDPI
            96,    //vDPI
            0,     //rotate
            true,  //UseMediaBox
            true,  //Crop
            false  //Printing
            );

    const bool whole_document = (last_page - first_page == doc_pages - 1);
    if (whole_document) {
        this->dump_outline();
    }

    delete xml_out;
    return doc_pages;
}
/**
 * Write the document outline to the file "outline.xml" (truncating any
 * existing file) in the current working directory.
 *
 * Does nothing when the document has no outline or the outline has no
 * items. Throws ReflowException when the file stream is in a failed
 * state after the write.
 */
void Reflow::dump_outline() {
    Outline *outline = this->doc->getOutline();
    if (!outline) return;

    GooList *items = outline->getItems();
    if (!items || items->getLength() < 1)
        return;

    // Automatic storage: the buffer is released on every exit path,
    // including the exception thrown below.
    ostringstream output;
    output << "<outline>" << endl;
    this->outline_level(&output, items);
    output << "</outline>" << endl;

    ofstream of("outline.xml", ios::trunc);
    of << output.str();
    if (!of) throw ReflowException("Error writing outline file");
    of.close();
}
/* Append `level` tab characters to *o; a level of zero or less appends nothing. */
static inline void outline_tabs(ostringstream *o, int level) {
    int remaining = level;
    while (remaining > 0) {
        (*o) << "\t";
        --remaining;
    }
}
/**
 * Serialise one level of the outline tree into *oss.
 *
 * For a non-empty item list a <links level="N"> element is emitted that
 * holds one <link> per titled item; items whose encoded title is empty
 * are skipped together with their children. Each item's children are
 * written by a recursive call at level+1, directly after the item's own
 * <link> element.
 *
 * The <links> element is closed before returning so that the generated
 * document is well-formed XML.
 */
void Reflow::outline_level(ostringstream *oss, GooList *items, int level)
{
    int num_of_items = items->getLength();
    if (num_of_items < 1) return;

    outline_tabs(oss, level);
    (*oss) << "<links level=\"" << level << "\">" << endl;

    for (int i = 0; i < num_of_items; i++) {
        OutlineItem* item = (OutlineItem *)items->get(i);
        Unicode *u = item->getTitle();
        string title = encode_unicode_chars(u, item->getTitleLength());
        if (title.size() < 1) continue;

        outline_tabs(oss, level+1);
        (*oss) << "<link open=\"" << (item->isOpen()?"yes":"no") << "\"";
        LinkAction *a = item->getAction();
        if (a != NULL)
            (*oss) << " dest=\"" << get_link_dest(a, this->doc) << "\"";
        (*oss) << ">" << title << "</link>" << endl;

        // open() is called before getKids(), as in the original code.
        item->open();
        GooList *children = item->getKids();
        if (children)
            outline_level(oss, children, level+1);
    }

    // Balance the opening tag written above.
    outline_tabs(oss, level);
    (*oss) << "</links>" << endl;
}
/* Destroy the PDFDoc held in this->doc. */
Reflow::~Reflow() {
    PDFDoc *owned_doc = this->doc;
    delete owned_doc;
}
/**
 * Read the document Info dictionary.
 *
 * Every key listed in info_keys (num_info_keys entries, defined elsewhere
 * in this file) is decoded with decode_info_string(); keys whose decoded
 * value is empty are left out of the result. When the document has no
 * Info dictionary an empty map is returned.
 *
 * Exceptions from decode_info_string() are propagated after the Info
 * object has been released.
 */
map<string, string> Reflow::get_info() {
    Object info;
    map<string, string> ans;

    globalParams->setTextEncoding(encoding);
    this->doc->getDocInfo(&info);

    try {
        if (info.isDict()) {
            for (size_t i = 0; i < num_info_keys; i++) {
                const string val = this->decode_info_string(info.getDict(), info_keys[i]);
                if (!val.empty())
                    ans[string(info_keys[i])] = val;
            }
        }
    } catch (...) {
        info.free();
        throw;
    }

    // Release the Object filled by getDocInfo(), matching the
    // lookup/free pairing used in decode_info_string().
    info.free();
    return ans;
}
/**
 * Decode one string entry of the Info dictionary.
 *
 * info  dictionary to look the key up in
 * key   name of the entry
 *
 * A value starting with the bytes 0xFE 0xFF is read as a sequence of
 * big-endian 16-bit units; any other value is mapped byte by byte through
 * pdfDocEncoding. Each resulting code point is converted with the unicode
 * map returned by globalParams->getTextEncoding() and appended to the
 * result. Returns an empty string when the entry is missing or is not a
 * string.
 *
 * Throws ReflowException when no unicode map is available.
 */
string Reflow::decode_info_string(Dict *info, const char *key) const {
    // Check for the map first so that nothing is allocated when we throw.
    UnicodeMap *umap = globalParams->getTextEncoding();
    if (!umap) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    // lookup() is given a writable copy of the key, as in the original
    // code; the vector releases it on every exit path.
    vector<char> key_copy(key, key + strlen(key) + 1);

    Object obj;
    ostringstream oss;
    char buf[8];

    if (info->lookup(&key_copy[0], &obj)->isString()) {
        GooString *s1 = obj.getString();
        const int len = s1->getLength();

        // Only inspect the two marker bytes when they are really there.
        bool is_unicode = false;
        int i = 0;
        if (len >= 2 &&
                (s1->getChar(0) & 0xff) == 0xfe &&
                (s1->getChar(1) & 0xff) == 0xff) {
            is_unicode = true;
            i = 2;
        }

        while (i < len) {
            Unicode u;
            if (is_unicode) {
                const int hi = s1->getChar(i) & 0xff;
                // A lone trailing byte is paired with a zero low byte
                // instead of reading past the end of the string.
                const int lo = (i + 1 < len) ? (s1->getChar(i + 1) & 0xff) : 0;
                u = (hi << 8) | lo;
                i += 2;
            } else {
                u = pdfDocEncoding[s1->getChar(i) & 0xff];
                ++i;
            }
            // Leave room for the terminator: with the full buffer size a
            // return value of sizeof(buf) would make buf[n] overflow.
            const int n = umap->mapUnicode(u, buf, sizeof(buf) - 1);
            buf[n] = 0;
            oss << buf;
        }
    }

    obj.free();
    return oss.str();
}
/**
 * Render page 1 of the document to a PNG held in memory.
 *
 * use_crop_box  size the page from the crop box when true, otherwise from
 *               the media box
 * x_res, y_res  resolution in dots per inch (page sizes are scaled by
 *               res/72)
 *
 * Returns a heap allocated vector filled by PNGMemWriter; the caller owns
 * it and must delete it.
 *
 * Throws ReflowException when the document has no pages. Exceptions raised
 * while rendering or encoding are propagated after the output device,
 * bitmap and result buffer have been released.
 */
vector<char>* Reflow::render_first_page(bool use_crop_box, double x_res,
        double y_res) {
    if (this->numpages() < 1) throw ReflowException("Document has no pages.");

    globalParams->setTextEncoding(encoding);
    globalParams->setEnableFreeType(yes);
    globalParams->setAntialias(yes);
    globalParams->setVectorAntialias(yes);

    // White paper.
    SplashColor paper_color;
    paper_color[0] = 255;
    paper_color[1] = 255;
    paper_color[2] = 255;

    SplashOutputDev *out = new SplashOutputDev(splashModeRGB8, 4, false, paper_color, true, true);
    // Validate the pointer before it is used for the first time.
    if (!out) {
        throw ReflowException("Failed to allocate SplashOutputDev");
    }
    out->setVectorAntialias(true);

    try {
#ifdef POPPLER_PRE_20
        out->startDoc(doc->getXRef());
#else
        out->startDoc(doc);
#endif
        out->startPage(1, NULL);

        double pg_w, pg_h;
        int pg = 1;
        if (use_crop_box) {
            pg_w = this->doc->getPageCropWidth(pg);
            pg_h = this->doc->getPageCropHeight(pg);
        } else {
            pg_w = this->doc->getPageMediaWidth(pg);
            pg_h = this->doc->getPageMediaHeight(pg);
        }

        // Page sizes are scaled from 72 per inch to the requested resolution.
        pg_w *= x_res/72.;
        pg_h *= y_res/72.;

        int x=0, y=0;
        this->doc->displayPageSlice(out, pg, x_res, y_res, 0,
                !use_crop_box, false, false, x, y, pg_w, pg_h);
    } catch(...) { delete out; throw; }

    // The bitmap is taken from the device before the page is ended and
    // the device destroyed; from here on it is released by this function.
    SplashBitmap *bmp = out->takeBitmap();
    out->endPage();
    delete out; out = NULL;

    PNGMemWriter writer;
    vector<char> *buf = NULL;
    try {
        // Allocated inside the try block so that bmp is not leaked when
        // the allocation itself fails.
        buf = new vector<char>();
        writer.init(buf, bmp->getWidth(), bmp->getHeight());
        writer.write_splash_bitmap(bmp);
        writer.close();
    } catch(...) { delete buf; delete bmp; throw; }

    delete bmp;
    return buf;
}
/**
 * OutStream implementation that collects everything written to it in an
 * in-memory ostringstream.
 */
class MemOutStream : public OutStream {

    private:
        ostringstream out;  // accumulated output

    public:
        MemOutStream() :OutStream() {}
        ~MemOutStream() {}

        /* Nothing to release: the buffer lives as long as the object. */
        void close() {}

        /* Current write position of the buffer. */
        int getPos() { return out.tellp(); }

        /* Append a single character. */
        void put(char c) { out.put(c); }

        /**
         * Append printf-style formatted text.
         *
         * The text is formatted into a scratch buffer that is enlarged
         * until the whole result fits. The buffer is sized with resize()
         * so vsnprintf only ever writes to elements that exist, and the
         * int result of vsnprintf is checked for negative values before it
         * is used as a length.
         */
        void printf (const char *format, ...) {
            // Upper bound for blind growth when vsnprintf cannot tell us
            // how much space it needs.
            const size_t max_blind_size = 1 << 20;

            vector<char> buf(strlen(format) * 5 + 20);
            va_list ap;
            int written;

            for (;;) {
                va_start(ap, format);
                written = vsnprintf(&buf[0], buf.size(), format, ap);
                va_end(ap);

                if (written < 0) {
                    // Either a formatting error or a runtime that reports
                    // truncation with a negative value: retry with a
                    // larger buffer, but give up instead of looping
                    // forever.
                    if (buf.size() >= max_blind_size) return;
                    buf.resize(buf.size() * 2);
                } else if (static_cast<size_t>(written) < buf.size()) {
                    break;
                } else {
                    // Room for the reported length plus the terminator.
                    buf.resize(static_cast<size_t>(written) + 1);
                }
            }

            out.write(&buf[0], written);
        }
};
/**
 * Store the key/value pairs of sinfo as strings in the "Info" dictionary
 * of the document trailer, then save the document.
 *
 * The output path is hard-coded to "/t/out.pdf" and the returned string
 * is always empty.
 *
 * Throws ReflowException when the document has no XRef table, when the
 * trailer is missing or is not a dictionary, or when the Info entry is
 * not a dictionary.
 */
string Reflow::set_info(map<char *, char *> sinfo) {
XRef *xref = this->doc->getXRef();
if (!xref) throw ReflowException("No XRef table");
Object *trailer_dict = xref->getTrailerDict();
if (!trailer_dict || !trailer_dict->isDict()) throw ReflowException("No trailer dictionary");
Object tmp;
// Writable copy of the key name; it is passed to dictLookup() and dictSet().
char INFO[5] = "Info";
Object *info = trailer_dict->dictLookup(INFO, &tmp);
// NOTE(review): the fallback below only runs if dictLookup() can return
// NULL -- confirm against the poppler version in use.
if (!info) {
info = new Object();
info->initDict(xref);
}
if (!info->isDict()) throw ReflowException("Invalid info object");
for (map<char *, char *>::iterator it = sinfo.begin(); it != sinfo.end(); it++) {
// This `tmp` shadows the Object declared above.
// NOTE(review): neither this Object nor the GooString is deleted in this
// function; whether dictSet()/initString() take ownership cannot be told
// from here -- verify.
Object *tmp = new Object();
tmp->initString(new GooString((*it).second));
info->dictSet((*it).first, tmp);
}
trailer_dict->dictSet(INFO, info);
// Hard-coded output location.
char out[20] = "/t/out.pdf";
// NOTE(review): the GooString passed to saveAs() is not deleted here either.
this->doc->saveAs(new GooString(out), writeForceRewrite);
// Nothing is ever written to `ans`, so callers always get "".
string ans;
return ans;
}

View File

@ -1,253 +0,0 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v2+
* Based on pdftohtml from the poppler project.
*/
#pragma once
#define UNICODE
#ifdef _WIN32
#include <poppler/Object.h>
#elif defined(_OSX)
#include <poppler/Object.h>
#else
#include <Object.h>
#endif
#include <PDFDoc.h>
#include <GlobalParams.h>
#include <GfxState.h>
#include <GfxFont.h>
#include <OutputDev.h>
#include <Link.h>
#include <UnicodeMap.h>
#include <cmath>
#include <exception>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <map>
#include <errno.h>
#include "fonts.h"
#include "links.h"
#include "images.h"
using namespace std;
namespace calibre_reflow {
// When POPPLER_OLD_LINK_TYPE is defined, AnnotLink becomes an alias for
// Link, so declarations written in terms of AnnotLink (for example
// XMLOutputDev::process_link) compile unchanged.
// NOTE(review): presumably defined by the build for poppler releases that
// have no AnnotLink class -- confirm.
#ifdef POPPLER_OLD_LINK_TYPE
#define AnnotLink Link
#endif
/* Writing direction recorded for a run of text (see XMLString::dir). */
enum UnicodeTextDirection {
    text_dir_unknown    = 0,
    text_dir_left_right = 1,
    text_dir_right_left = 2,
    text_dir_top_bottom = 3
};
/**
 * Front end for working with a single PDF document that has been loaded
 * into memory: conversion to XML, metadata access and rendering of the
 * first page.
 */
class Reflow {

    private:
        char *pdfdata;             // buffer passed to the constructor
        double current_font_size;
        PDFDoc *doc;               // the opened document
        Object obj;

        // Decode a single string entry of an Info dictionary.
        string decode_info_string(Dict *info, const char *key) const;

        // Serialise one level of the outline tree into *oss.
        void outline_level(ostringstream *oss, GooList *items,
                           int level=1);

    public:
        Reflow (char *xpdfdata, size_t sz);
        ~Reflow();

        // Convert the PDF to XML. All files are output to the current
        // directory.
        int render(int first_page, int last_page);

        // Get the PDF Info Dictionary.
        map<string, string> get_info();

        // True if the PDF is encrypted, and also when no document is
        // loaded.
        bool is_locked() const {
            if (this->doc == NULL) return true;
            return this->doc->isEncrypted() ? true : false;
        }

        // Return the first page of the PDF, rendered as a PNG image.
        vector<char>* render_first_page(bool use_crop_box=true, double x_res=150.0,
                                        double y_res = 150.0);

        // Dump the PDF outline as the file outline.xml in the current
        // directory.
        void dump_outline();

        // Set the info dictionary. Currently broken.
        string set_info(map<char *, char *> info);

        // Number of pages in the document.
        int numpages() { return this->doc->getNumPages(); }
};
/**
 * A run of characters together with its bounding box, font reference and
 * optional link. XMLPage is a friend and works on the members directly.
 */
class XMLString {

    private:
        vector<Unicode> *text;     // the text
        vector<double> *x_right;   // right-hand x coord of each char
        XMLString *yx_next;        // next string in y-major order
        XMLString *xy_next;        // next string in x-major order
        Fonts *fonts;
        Fonts::size_type font_idx;
        string *xml_text;
        XMLLink *link;
        double x_min, x_max;       // bounding box x coordinates
        double y_min, y_max;       // bounding box y coordinates
        int col;                   // starting column
        UnicodeTextDirection dir;  // direction (left to right/right to left)

        friend class XMLPage;

    public:
        XMLString(GfxState *state, GooString *s, double current_font_size, Fonts *fonts);
        ~XMLString();

        // True when the string already holds characters and x1 lies
        // further than a tenth of the string height from the right edge
        // of the last character.
        bool character_does_not_belong_to_string(GfxState *state, double x1) {
            const int count = this->length();
            if (!(count > 0)) return false;
            const double gap = fabs(x1 - x_right->at(count - 1));
            return gap > 0.1 * (y_max - y_min);
        }

        void add_char(GfxState *state, double x, double y,
                      double dx, double dy, Unicode u);
        void end_string();

        // Number of characters collected so far.
        inline int length() const { return text->size(); }

        // Height of the bounding box.
        inline double height() const { return y_max - y_min; }

        void encode();

        // Link attached to this string.
        XMLLink* get_link() { return link; }

        string str() const;
};
class XMLPage {
private:
XMLString *current_string;
unsigned int num;
ofstream *output;
double current_font_size;
XMLString *yx_strings; // strings in y-major order
XMLString *xy_strings; // strings in x-major order
XMLString *yx_cur1, *yx_cur2; // cursors for yxStrings list
Fonts *fonts;
XMLLinks *links;
void coalesce();
public:
XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts);
~XMLPage();
void update_font(GfxState *state);
void begin_string(GfxState *state, GooString *s) {
this->current_string = new XMLString(state, s,
this->current_font_size, this->fonts);
}
void draw_char(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen);
void end_string();
void end();
void add_link(XMLLink *t) { this->links->push_back(t); }
unsigned int number() const { return this->num; }
};
class XMLOutputDev : public OutputDev {
public:
XMLOutputDev(PDFDoc *doc);
virtual ~XMLOutputDev();
//---- get info about output device
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gTrue; }
// Does this device use drawChar() or drawString()?
virtual GBool useDrawChar() { return gTrue; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
virtual GBool needNonText() { return gTrue; }
//----- initialization and control
virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI,
int rotate, GBool useMediaBox, GBool crop,
int sliceX, int sliceY, int sliceW, int sliceH,
GBool printing, Catalog * catalogA,
GBool (* abortCheckCbk)(void *data) = NULL,
void * abortCheckCbkData = NULL)
{
this->catalog = catalogA;
return gTrue;
}
// Start a page.
virtual void startPage(int page_num, GfxState *state) {
this->current_page = new XMLPage(page_num, state, this->output, this->fonts);
}
// End a page.
virtual void endPage();
//----- update text state
virtual void updateFont(GfxState *state) {current_page->update_font(state);}
//----- text drawing
virtual void beginString(GfxState *state, GooString *s) {
this->current_page->begin_string(state, s);
}
virtual void endString(GfxState *state) {
this->current_page->end_string();
}
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen) {
this->current_page->draw_char(state, x, y, dx, dy, originX,
originY, code, nBytes, u, uLen);
}
virtual void drawImageMask(GfxState *state, Object *ref,
Stream *str,
int width, int height, GBool invert,
GBool interpolate, GBool inlineImg);
virtual void drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
GBool interpolate, int *maskColors, GBool inlineImg);
//new feature
virtual int DevType() {return 1234;}
private:
XMLPage *current_page;
ofstream *output; // xml file
Fonts *fonts;
Catalog *catalog;
XMLImages *images;
PDFDoc *doc;
void process_link(AnnotLink* link);
};
}