Sync to trunk.

This commit is contained in:
John Schember 2009-09-22 17:14:13 -04:00
commit a6886b0acd
32 changed files with 2725 additions and 522 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 843 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 629 B

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, socket, struct
import os, socket, struct, subprocess
from distutils.spawn import find_executable
from PyQt4 import pyqtconfig
@ -42,6 +42,39 @@ elif find_executable('qmake'):
QMAKE = find_executable('qmake')
QMAKE = os.environ.get('QMAKE', QMAKE)
PKGCONFIG = find_executable('pkg-config')
PKGCONFIG = os.environ.get('PKG_CONFIG', PKGCONFIG)
def run_pkgconfig(name, envvar, default, flag, prefix):
    '''
    Locate build settings for the package `name`.

    If `envvar` is non-empty, the value of that environment variable (or
    `default` when it is unset) is split on os.pathsep and used. If that
    yields nothing usable, pkg-config is run as ``PKGCONFIG flag name`` and
    its output is split on `prefix` (for example ``-I``).

    Entries are stripped and empty ones dropped. Unless `prefix` is ``-l``
    (library names), entries that do not exist on the filesystem are dropped
    as well.

    Returns a (possibly empty) list of strings. Failure to run pkg-config is
    reported on stdout and results in an empty list, it is never raised.
    '''
    def usable(items):
        # Library names (-l) are not paths, so only paths are checked
        return [x for x in items
                if x and (prefix == '-l' or os.path.exists(x))]

    ans = []
    if envvar:
        val = os.environ.get(envvar, default)
        ans = usable([x.strip() for x in val.split(os.pathsep)])
    if not ans:
        try:
            proc = subprocess.Popen([PKGCONFIG, flag, name],
                    stdout=subprocess.PIPE, universal_newlines=True)
            # communicate() drains the pipe, closes it and reaps the child
            raw = proc.communicate()[0]
            ans = usable([x.strip() for x in raw.split(prefix)])
        except Exception:
            # Best effort: a missing or broken pkg-config must not abort
            # the build, but KeyboardInterrupt/SystemExit still propagate
            print('Failed to run pkg-config: %s for: %s' % (PKGCONFIG, name))
    return ans
def pkgconfig_include_dirs(name, envvar, default):
    '''
    Return the list of include directories for the package `name`, as found
    by run_pkgconfig() using the ``--cflags-only-I`` flag.
    '''
    return run_pkgconfig(name, envvar, default, flag='--cflags-only-I',
            prefix='-I')
def pkgconfig_lib_dirs(name, envvar, default):
    '''
    Return the list of library directories for the package `name`, as found
    by run_pkgconfig() using the ``--libs-only-L`` flag.
    '''
    return run_pkgconfig(name, envvar, default, flag='--libs-only-L',
            prefix='-L')
def pkgconfig_libs(name, envvar, default):
    '''
    Return the list of library names for the package `name`, as found by
    run_pkgconfig() using the ``--libs-only-l`` flag.
    '''
    return run_pkgconfig(name, envvar, default, flag='--libs-only-l',
            prefix='-l')
def consolidate(envvar, default):
    '''
    Return the existing paths listed in the environment variable `envvar`.

    The value (or `default` when the variable is unset) is split on
    os.pathsep. Entries are stripped, and empty or non-existent paths are
    dropped, so the result may be an empty list.
    '''
    val = os.environ.get(envvar, default)
    # os.pathsep is a plain string constant, it must not be called
    ans = [x.strip() for x in val.split(os.pathsep)]
    return [x for x in ans if x and os.path.exists(x)]
pyqt = pyqtconfig.Configuration()
@ -50,28 +83,62 @@ qt_lib = pyqt.qt_lib_dir
fc_inc = '/usr/include/fontconfig'
fc_lib = '/usr/lib'
poppler_inc = '/usr/include/poppler/qt4'
poppler_lib = '/usr/lib'
poppler_libs = []
podofo_inc = '/usr/include/podofo'
podofo_lib = '/usr/lib'
if iswindows:
fc_inc = r'C:\cygwin\home\kovid\fontconfig\include\fontconfig'
fc_lib = r'C:\cygwin\home\kovid\fontconfig\lib'
poppler_inc = r'C:\cygwin\home\kovid\poppler\include\poppler\qt4'
poppler_lib = r'C:\cygwin\home\kovid\poppler\lib'
poppler_libs = ['QtCore4', 'QtGui4']
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
r'C:\cygwin\home\kovid\poppler\include\poppler')
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
r'C:\cygwin\home\kovid\poppler\lib')
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
popplerqt4_libs = poppler_libs + ['QtCore4', 'QtGui4']
podofo_inc = 'C:\\podofo\\include\\podofo'
podofo_lib = r'C:\podofo'
if isosx:
elif isosx:
fc_inc = '/Users/kovid/fontconfig/include/fontconfig'
fc_lib = '/Users/kovid/fontconfig/lib'
poppler_inc = '/Volumes/sw/build/poppler-0.10.7/qt4/src'
poppler_lib = '/Users/kovid/poppler/lib'
poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
'/Volumes/sw/build/poppler-0.10.7/poppler')
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/Users/kovid/poppler/lib')
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = popplerqt4_libs = ['poppler']
podofo_inc = '/usr/local/include/podofo'
podofo_lib = '/usr/local/lib'
else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
if not popplerqt4_inc_dirs:
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
# Library directories
poppler_lib_dirs = popplerqt4_lib_dirs = pkgconfig_lib_dirs('poppler', 'POPPLER_LIB_DIR',
'/usr/lib')
png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib')
magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib')
# Libraries
poppler_libs = pkgconfig_libs('poppler', '', '')
if not poppler_libs:
poppler_libs = ['poppler']
popplerqt4_libs = pkgconfig_libs('poppler-qt4', '', '')
if not popplerqt4_libs:
popplerqt4_libs = ['poppler-qt4', 'poppler']
magick_libs = pkgconfig_libs('MagickWand', '', '')
if not magick_libs:
magick_libs = ['MagickWand', 'MagickCore']
png_libs = ['png']
fc_inc = os.environ.get('FC_INC_DIR', fc_inc)
@ -82,14 +149,27 @@ fc_error = None if os.path.exists(os.path.join(fc_inc, 'fontconfig.h')) else \
'variables.')
poppler_inc = os.environ.get('POPPLER_INC_DIR', poppler_inc)
poppler_lib = os.environ.get('POPPLER_LIB_DIR', poppler_lib)
poppler_error = None if os.path.exists(os.path.join(poppler_inc,
'poppler-qt4.h')) else \
# Error message for the core poppler library. None means the OutputDev.h
# header was found in the first poppler include directory.
poppler_error = None
if not poppler_inc_dirs or not os.path.exists(
        os.path.join(poppler_inc_dirs[0], 'OutputDev.h')):
    # Adjacent literals without commas, so that this is a single message
    # string (like magick_error) rather than a tuple of fragments
    poppler_error = (
        'Poppler not found on your system. Various PDF related'
        ' functionality will not work. Use the POPPLER_INC_DIR and'
        ' POPPLER_LIB_DIR environment variables.')
popplerqt4_error = None
if not popplerqt4_inc_dirs or not os.path.exists(
os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h')):
popplerqt4_error = \
('Poppler Qt4 bindings not found on your system.')
# Error message for ImageMagick. None means a 'wand' entry exists inside the
# first ImageMagick include directory.
magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
        'wand')):
    magick_error = ('ImageMagick not found on your system. '
            'Try setting the environment variables MAGICK_INC '
            'and MAGICK_LIB to help calibre locate the include and library '
            'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
podofo_inc = os.environ.get('PODOFO_INC_DIR', podofo_inc)
@ -116,3 +196,5 @@ except:
HOST='unknown'
PROJECT=os.path.basename(os.path.abspath('.'))

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
__all__ = [
'pot', 'translations', 'get_translations', 'iso639',
'build',
'build', 'build_pdf2xml',
'gui',
'develop', 'install',
'resources',
@ -30,8 +30,9 @@ translations = Translations()
get_translations = GetTranslations()
iso639 = ISO639()
from setup.extensions import Build
from setup.extensions import Build, BuildPDF2XML
build = Build()
build_pdf2xml = BuildPDF2XML()
from setup.install import Develop, Install, Sdist
develop = Develop()

View File

@ -12,10 +12,12 @@ from distutils import sysconfig
from PyQt4.pyqtconfig import QtGuiModuleMakefile
from setup import Command, islinux, isosx, SRC, iswindows
from setup.build_environment import fc_inc, fc_lib, qt_inc, qt_lib, \
fc_error, poppler_libs, poppler_lib, poppler_inc, podofo_inc, \
from setup.build_environment import fc_inc, fc_lib, \
fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \
leopard_build, QMAKE, msvc, MT, win_inc, win_lib
leopard_build, QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, \
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
magick_error, magick_libs
MT
isunix = islinux or isosx
@ -43,6 +45,10 @@ class Extension(object):
self.ldflags = kwargs.get('ldflags', [])
self.optional = kwargs.get('optional', False)
reflow_sources = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.cpp'))
reflow_headers = glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', '*.h'))
reflow_error = poppler_error if poppler_error else magick_error
extensions = [
Extension('lzx',
['calibre/utils/lzx/lzxmodule.c',
@ -76,15 +82,6 @@ extensions = [
Extension('cPalmdoc',
['calibre/ebooks/compression/palmdoc.c']),
Extension('calibre_poppler',
['calibre/utils/poppler/poppler.cpp'],
libraries=(['poppler', 'poppler-qt4']+poppler_libs),
lib_dirs=[os.environ.get('POPPLER_LIB_DIR',
poppler_lib), qt_lib],
inc_dirs=[poppler_inc, qt_inc],
error=poppler_error,
optional=True),
Extension('podofo',
['calibre/utils/podofo/podofo.cpp'],
libraries=['podofo'],
@ -97,10 +94,20 @@ extensions = [
inc_dirs = ['calibre/gui2/pictureflow'],
headers = ['calibre/gui2/pictureflow/pictureflow.h'],
sip_files = ['calibre/gui2/pictureflow/pictureflow.sip']
)
),
Extension('pdfreflow',
reflow_sources,
headers=reflow_headers,
libraries=poppler_libs+magick_libs+png_libs,
lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs,
inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs,
error=reflow_error,
cflags=['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
)
]
if iswindows:
extensions.append(Extension('winutil',
['calibre/utils/windows/winutil.c'],
@ -346,10 +353,36 @@ class Build(Command):
# Command that compiles the PDF reflow C++ sources into a standalone
# command line binary, using g++ directly.
class BuildPDF2XML(Command):
description = 'Build command line pdf2xml utility'
def run(self, opts):
# The finished binary is written to ~/bin/pdf2xml
dest = os.path.expanduser('~/bin/pdf2xml')
# Object files are kept in build/objects/pdf2xml next to the source tree
odest = self.j(self.d(self.SRC), 'build', 'objects', 'pdf2xml')
if not os.path.exists(odest):
os.makedirs(odest)
objects = []
for src in reflow_sources:
# Sources ending in python.cpp are left out of the standalone binary
if src.endswith('python.cpp'):
continue
obj = self.j(odest, self.b(src+'.o'))
# Recompile only when self.newer() reports the object as out of date
# with respect to its source file or any of the reflow headers
if self.newer(obj, [src]+reflow_headers):
# NOTE(review): include paths are hard coded here instead of using the
# include directories computed in setup.build_environment
cmd = ['g++', '-pthread', '-pedantic', '-g', '-c', '-Wall', '-I/usr/include/poppler',
'-I/usr/include/ImageMagick',
'-DPDF2XML', '-o', obj, src]
self.info(*cmd)
subprocess.check_call(cmd)
objects.append(obj)
# Relink only when self.newer() reports the binary as out of date
if self.newer(dest, objects):
cmd = ['g++', '-g', '-o', dest]+objects+['-lpoppler', '-lMagickWand',
'-lpng', '-lpthread']
self.info(*cmd)
subprocess.check_call(cmd)
self.info('Binary installed as', dest)

View File

@ -192,6 +192,10 @@ class Install(Develop):
x = self.j(dest, x)
if os.path.exists(dest):
shutil.rmtree(x)
for x in os.walk(dest):
for f in x[-1]:
if os.path.splitext(f)[1] in ('.c', '.cpp', '.h'):
os.remove(self.j(x[0], f))
dest = self.root + self.resources
if os.path.exists(dest):
shutil.rmtree(dest)
@ -241,4 +245,3 @@ class Sdist(Command):
os.remove(self.DEST)

View File

@ -38,6 +38,7 @@ class LinuxFreeze(Command):
binary_includes = [
'/usr/bin/pdftohtml',
'/usr/lib/libwmflite-0.2.so.7',
'/usr/lib/liblcms.so.1',
'/tmp/calibre-mount-helper',
'/usr/lib/libunrar.so',
'/usr/lib/libsqlite3.so.0',

View File

@ -55,7 +55,7 @@ if plugins is None:
sys.path.insert(0, plugin_path)
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc',
'fontconfig', 'calibre_poppler'] + \
'fontconfig', 'pdfreflow'] + \
(['winutil'] if iswindows else []) + \
(['usbobserver'] if isosx else []):
try:

View File

@ -161,6 +161,7 @@ quick_metadata = QuickMetadata()
def get_file_type_metadata(stream, ftype):
mi = MetaInformation(None, None)
ftype = ftype.lower().strip()
if _metadata_readers.has_key(ftype):
for plugin in _metadata_readers[ftype]:
@ -168,6 +169,8 @@ def get_file_type_metadata(stream, ftype):
with plugin:
try:
plugin.quick = quick_metadata.quick
if hasattr(stream, 'seek'):
stream.seek(0)
mi = plugin.get_metadata(stream, ftype.lower().strip())
break
except:

View File

@ -10,6 +10,7 @@ import sys, os, re, shutil
from calibre.utils.config import OptionParser
from calibre.constants import iswindows, isosx
from calibre.libunzip import update
from calibre import prints
def option_parser():
parser = OptionParser(usage='''\
@ -28,6 +29,8 @@ Run an embedded python interpreter.
help='Debug the specified device driver.')
parser.add_option('-g', '--gui', default=False, action='store_true',
help='Run the GUI',)
parser.add_option('--paths', default=False, action='store_true',
help='Output the paths necessary to setup the calibre environment')
parser.add_option('--migrate', action='store_true', default=False,
help='Migrate old database. Needs two arguments. Path '
'to library1.db and path to new library folder.')
@ -35,6 +38,9 @@ Run an embedded python interpreter.
help='Add a simple plugin (i.e. a plugin that consists of only a '
'.py file), by specifying the path to the py file containing the '
'plugin code.')
parser.add_option('--pdfreflow', default=None,
help='Path to PDF file to try and reflow. Output will be placed in '
'current directory. ')
return parser
@ -203,6 +209,15 @@ def main(args=sys.argv):
migrate(args[1], args[2])
elif opts.add_simple_plugin is not None:
add_simple_plugin(opts.add_simple_plugin)
elif opts.paths:
prints('CALIBRE_RESOURCES_LOCATION='+sys.resources_location)
prints('CALIBRE_EXTENSIONS_LOCATION='+sys.extensions_location)
prints('CALIBRE_PYTHON_PATH='+os.pathsep.join(sys.path))
elif opts.pdfreflow:
from calibre.ebooks.pdf.reflow import option_parser as px, run
from calibre.utils.logging import default_log
opts2, args = px().parse_args(['xxxx', '-vvvv', opts.pdfreflow])
run(opts2, opts.pdfreflow, default_log)
else:
from IPython.Shell import IPShellEmbed
ipshell = IPShellEmbed()

View File

@ -45,7 +45,7 @@ class FB2Input(InputFormatPlugin):
log.debug('Parsing XML...')
parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.parse(stream, parser)
doc = etree.fromstring(stream.read())
self.extract_embedded_content(doc)
log.debug('Converting XML to HTML...')
ss = open(P('templates/fb2.xsl'), 'rb').read()

View File

@ -130,7 +130,7 @@ def metadata_from_filename(name, pat=None):
au = match.group('author')
aus = string_to_authors(au)
mi.authors = aus
except IndexError:
except (IndexError, ValueError):
pass
try:
mi.series = match.group('series')

View File

@ -666,7 +666,7 @@ class OPF(object):
for key in matches[0].attrib:
if key.endswith('file-as'):
matches[0].attrib.pop(key)
matches[0].set('file-as', unicode(val))
matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], unicode(val))
return property(fget=fget, fset=fset)

View File

@ -3,59 +3,55 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files'''
import sys, os, cStringIO
from functools import partial
from calibre import prints
from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
pdfreflow, pdfreflow_error = plugins['pdfreflow']
def get_metadata(stream, cover=True):
    '''
    Read metadata from the PDF in `stream` using the pdfreflow plugin.

    :param stream: File like object, its entire contents are read.
    :param cover: If True, also ask the plugin for the cover image.
    :return: A MetaInformation object. The PDF Subject (if any) becomes the
             first tag, followed by the comma separated Keywords.
    :raises RuntimeError: If the pdfreflow plugin could not be loaded.
    '''
    if pdfreflow is None:
        raise RuntimeError(pdfreflow_error)
    info = pdfreflow.get_metadata(stream.read(), cover)

    title = info.get('Title', None)
    au = info.get('Author', None)
    if au is None:
        au = [_('Unknown')]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        # Skip blank entries, e.g. from a trailing or doubled comma, so that
        # no empty tags are created
        mi.tags = [x.strip() for x in keywords.split(',') if x.strip()]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if cover and 'cover' in info:
        data = info['cover']
        if data is None:
            prints(title, 'is an encrypted document, cover extraction not allowed.')
        else:
            mi.cover_data = ('png', data)

    return mi
get_quick_metadata = partial(get_metadata, cover=False)
import cStringIO
from threading import Thread
from calibre import StreamReadWrapper
from calibre.ptempfile import TemporaryDirectory
try:
from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True
except:
_imagemagick_loaded = False
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \
set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable
def get_quick_metadata(stream):
try:
return get_metadata_poppler(stream, False)
except NotAvailable:
pass
return get_metadata_pypdf(stream)
raw = stream.read()
mi = get_metadata_quick(raw)
if mi.title == '_':
mi.title = getattr(stream, 'name', _('Unknown'))
mi.title = mi.title.rpartition('.')[0]
return mi
def get_metadata(stream, extract_cover=True):
try:
return get_metadata_poppler(stream, extract_cover)
except NotAvailable:
pass
try:
with TemporaryDirectory('_pdfmeta') as tdir:
cpath = os.path.join(tdir, 'cover.pdf')
if not extract_cover:
cpath = None
mi = podofo_get_metadata(stream, cpath=cpath)
if mi.cover is not None:
cdata = get_cover(mi.cover)
mi.cover = None
if cdata is not None:
mi.cover_data = ('jpg', cdata)
except Unavailable:
mi = get_metadata_pypdf(stream)
return mi
from calibre.utils.podofo import set_metadata as podofo_set_metadata, Unavailable
def set_metadata(stream, mi):
stream.seek(0)
@ -70,25 +66,6 @@ def set_metadata(stream, mi):
set_metadata_pypdf(stream, mi)
def get_metadata_pypdf(stream):
""" Return metadata as a L{MetaInfo} object """
from pyPdf import PdfFileReader
mi = MetaInformation(_('Unknown'), [_('Unknown')])
try:
with StreamReadWrapper(stream) as stream:
info = PdfFileReader(stream).getDocumentInfo()
if info.title:
mi.title = info.title
if info.author:
mi.author = info.author
mi.authors = string_to_authors(info.author)
if info.subject:
mi.category = info.subject
except Exception, err:
msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread):
def __init__(self, out_pdf, buf):
@ -132,13 +109,4 @@ def set_metadata_pypdf(stream, mi):
stream.write(out_str.read())
stream.seek(0)
def get_cover(cover_path):
with ImageMagick():
wand = NewMagickWand()
MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path)
return open('%s.jpg' % cover_path, 'rb').read()

View File

@ -0,0 +1,143 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include "fonts.h"
#include "utils.h"
using namespace calibre_reflow;
using namespace std;
// Build a colour from a poppler GfxRGB value. Each component is rescaled
// by 255/65535; if any rescaled component fails ok(), the colour is black.
XMLColor::XMLColor(GfxRGB rgb) {
    this->r = static_cast<int>(rgb.r/65535.0*255.0);
    this->g = static_cast<int>(rgb.g/65535.0*255.0);
    this->b = static_cast<int>(rgb.b/65535.0*255.0);

    const bool valid = this->ok(this->r) && this->ok(this->b) && this->ok(this->g);
    if (!valid) {
        this->r = 0;
        this->g = 0;
        this->b = 0;
    }
}
// Serialise the colour in the form rgb(R,G,B) using decimal components.
string XMLColor::str() const {
    ostringstream out;
    out << "rgb(";
    out << this->r << "," << this->g << "," << this->b;
    out << ")";
    return out.str();
}
static const char *FONT_MODS[7] = {
"-bolditalic", "-boldoblique", "-bold", "-italic", "-oblique", "-roman",
NULL
};
#define ap_toupper(c) (toupper(((unsigned char)(c))))

// Case insensitive substring search.
// Returns a pointer to the first occurrence of n ("needle") inside
// h ("haystack"), or 0 if there is none or if either string is NULL/empty.
static inline
char *strcasestr( char *h, char *n )
{
    if( !h || !*h || !n || !*n ) { return 0; }

    for( ; *h; ++h ) {
        char *a = h, *e = n;
        // Advance over the run of characters that match at this position
        while( *a && *e && ap_toupper(*a) == ap_toupper(*e) ) {
            ++a; ++e;
        }
        if( !*e ) { return h; }  // the whole needle matched
        if( !*a ) { return 0; }  // haystack exhausted, no later match possible
    }
    return 0;
}
// Derive a font family name from a full font name.
// Returns a newly allocated copy of *font_name from which the first entry
// of FONT_MODS (in table order) that occurs in it, ignoring case, has been
// removed. Returns NULL when font_name is NULL. The caller owns the result.
static string* family_name(const string *font_name) {
    if (!font_name) return NULL;

    string *family = new string(*font_name);
    for (const char **mod = FONT_MODS; *mod != NULL; mod++) {
        const char *hit = strcasestr(family->c_str(), *mod);
        if (hit == NULL) continue;
        family->replace(hit - family->c_str(), strlen(*mod), "");
        break;
    }
    return family;
}
// Construct a font from a full font name, a size and a colour.
// font_name may be NULL, in which case DEFAULT_FONT_FAMILY is used. The
// stored size is size-1. Bold and italic are detected from the
// (case insensitive) presence of "bold" and "italic"/"oblique" in the name.
XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) :
    size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name),
    font_family(NULL), color(rgb) {
    if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY);
    this->font_family = family_name(this->font_name);

    // Inspect the member, not the parameter: the parameter is still NULL
    // when the default family name was substituted above
    const char *name = this->font_name->c_str();
    if (strcasestr(name, "bold")) this->bold = true;
    if (strcasestr(name, "italic") ||
        strcasestr(name, "oblique")) this->italic = true;
}
// Assignment: copies all scalar fields and replaces both owned strings with
// fresh copies of the ones held by x. Self assignment is a no-op.
XMLFont& XMLFont::operator=(const XMLFont& x){
    if (this != &x) {
        this->size = x.size;
        this->line_size = x.line_size;
        this->italic = x.italic;
        this->bold = x.bold;
        this->color = x.color;

        // delete on a NULL pointer is a no-op, so no explicit check is needed
        delete this->font_name;
        this->font_name = new string(*x.font_name);
        delete this->font_family;
        this->font_family = new string(*x.font_family);
    }
    return *this;
}
// Two fonts are equal when size and line size each differ by less than 0.1
// and italic, bold, colour and font family are all identical.
bool XMLFont::operator==(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (this->italic != f.italic) return false;
    if (this->bold != f.bold) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Like operator== but ignoring the bold and italic flags.
bool XMLFont::eq_upto_inline(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Serialise this font as a <font .../> element carrying the given id, the
// XML-encoded family name, the colour and the size with two decimals.
string XMLFont::str(Fonts::size_type id) const {
    ostringstream out;
    out << "<font id=\"" << id << "\" "
        << "family=\"" << encode_for_xml(*this->font_family) << "\" "
        << "color=\"" << this->color.str() << "\" "
        << setiosflags(ios::fixed) << setprecision(2)
        << "size=\"" << this->size << "\""
        << "/>";
    return out.str();
}
// Return the index of the first stored font equal to *f. If there is none,
// f itself is appended and the index of the new entry is returned.
Fonts::size_type Fonts::add_font(XMLFont *f) {
    const size_type count = this->size();
    for (size_type idx = 0; idx < count; idx++) {
        if (*(*this)[idx] == *f) return idx;
    }
    this->push_back(f);
    return this->size()-1;
}
// Convenience overload: build an XMLFont from its parts and register it.
// NOTE(review): when an equal font is already stored, add_font(XMLFont*)
// returns the existing index without storing the new object, so the
// XMLFont allocated here appears to be leaked in that case -- confirm.
Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) {
    return this->add_font(new XMLFont(font_name, size, rgb));
}
// The container owns the fonts it stores: delete each one, then empty it.
Fonts::~Fonts() {
    for (size_type idx = 0; idx < this->size(); idx++)
        delete (*this)[idx];
    this->clear();
}

View File

@ -0,0 +1,105 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef CALIBRE_REFLOW_FONTS
#define CALIBRE_REFLOW_FONTS
#include <vector>
#include <sstream>
#include <iomanip>
#include <ctype.h>
#include <math.h>
#include <GfxState.h>
using namespace std;
#define DEFAULT_FONT_FAMILY "Times New Roman"
namespace calibre_reflow {
// An RGB colour with integer components, serialisable as rgb(R,G,B).
class XMLColor {
    private:
        unsigned int r;
        unsigned int g;
        unsigned int b;

        // True if xcol is a valid component value. The components are
        // unsigned, so only the upper bound can ever fail.
        inline bool ok(unsigned int xcol) const {
            return xcol <= 255;
        }

    public:
        // Black
        XMLColor() : r(0), g(0), b(0) {}
        XMLColor(GfxRGB rgb);
        XMLColor(const XMLColor& x) : r(x.r), g(x.g), b(x.b) {}
        XMLColor& operator=(const XMLColor &x) {
            r = x.r;
            g = x.g;
            b = x.b;
            return *this;
        }
        ~XMLColor() {}

        string str() const;

        // Component-wise equality
        bool operator==(const XMLColor &col) const {
            return (r == col.r) && (g == col.g) && (b == col.b);
        }
};
// Description of a font: size, line size, bold/italic flags, colour and the
// full and family names. Both name strings are owned by the object and are
// deleted by its destructor.
class XMLFont {
    private:
        double size;
        double line_size;
        bool italic;
        bool bold;
        string *font_name;    // owned
        string *font_family;  // owned
        XMLColor color;

    public:
        XMLFont(const char *font_family=DEFAULT_FONT_FAMILY, double size=12.0) :
            size(size), line_size(-1.0), italic(false), bold(false),
            font_name(new string(font_family)), font_family(new string(font_family)),
            color() {}

        // Takes ownership of font_name
        XMLFont(string* font_name, double size, GfxRGB rgb);

        // Deep copy. font_family must be duplicated just like font_name:
        // sharing the pointer would make both destructors delete the same
        // string.
        XMLFont(const XMLFont& other) :
            size(other.size), line_size(other.line_size), italic(other.italic),
            bold(other.bold), font_name(new string(*other.font_name)),
            font_family(new string(*other.font_family)), color(other.color) {}

        XMLColor get_color() { return this->color; }
        // The returned string remains owned by this object
        string* get_font_name() { return this->font_name; }
        double get_size() const { return this->size; }
        double get_line_size() { return this->line_size; }
        void set_line_size(double ls) { this->line_size = ls; }
        bool is_italic() const { return this->italic; }
        bool is_bold() const { return this->bold; }

        ~XMLFont() { delete this->font_name; delete this->font_family; }

        XMLFont& operator=(const XMLFont& other);
        bool operator==(const XMLFont &other) const;
        // Equality ignoring the bold and italic flags
        bool eq_upto_inline(const XMLFont &f) const;
        string str(vector<XMLFont*>::size_type id) const;
};
// Collection of XMLFont pointers. The destructor (see fonts.cpp) deletes
// every stored font, so the collection owns its entries.
class Fonts : public vector<XMLFont*> {
public:
// Returns the index of an equal, already stored font, or appends f and
// returns its new index.
Fonts::size_type add_font(XMLFont *f);
// Builds an XMLFont from the given parts and registers it as above.
Fonts::size_type add_font(string* font_name, double size, GfxRGB rgb);
~Fonts();
};
}
#endif

View File

@ -0,0 +1,289 @@
#include <stdio.h>
#include <errno.h>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include <math.h>
#include <iostream>
#include <wand/MagickWand.h>
#include "images.h"
#include "utils.h"
#define xoutRound(x) ( static_cast<int>(round(x)) )
using namespace std;
using namespace calibre_reflow;
// Compute the placement of an image from the current graphics state:
// top-left corner (x0, y0), unrotated size (w0, h0), size after accounting
// for rotation (w1, h1) and whether the image has to be flipped.
calibre_reflow::ImageInfo::ImageInfo(GfxState *state) {
    // Position and extent of the unit square in transformed coordinates
    state->transform(0, 0, &xt, &yt);
    state->transformDelta(1, 1, &wt, &ht);

    // A negative extent means the origin is at the far edge
    const bool w_positive = wt > 0;
    const bool h_positive = ht > 0;
    x0 = xoutRound(w_positive ? xt : xt + wt);
    w0 = xoutRound(w_positive ? wt : -wt);
    y0 = xoutRound(h_positive ? yt : yt + ht);
    h0 = xoutRound(h_positive ? ht : -ht);

    // The image is treated as rotated when the unit x vector maps mostly
    // onto the y axis
    state->transformDelta(1, 0, &xt, &yt);
    rotate = fabs(xt) < fabs(yt);

    w1 = rotate ? h0 : w0;
    h1 = rotate ? w0 : h0;
    x_flip = rotate ? (ht < 0) : (wt < 0);
    y_flip = rotate ? (wt > 0) : (ht > 0);
}
void XMLImages::clear() {
vector<XMLImage*>::iterator it;
for (it = this->masks.begin(); it < this->masks.end(); it++)
delete *it;
for (it = this->images.begin(); it < this->images.end(); it++)
delete *it;
this->masks.clear();
this->images.clear();
}
// Stub: the body is empty, so calling this does nothing and no mask is
// recorded. NOTE(review): presumably a placeholder for image mask support
// -- confirm.
void XMLImages::add_mask(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, bool invert,
bool interpolate, bool inline_img) {
}
// Turn the pending error of wand into a ReflowException. The wand is
// destroyed and the MagickWand environment is shut down before throwing,
// so this function never returns normally.
static void throw_magick_exception(MagickWand *wand) {
    ExceptionType severity;
    char *description = MagickGetException(wand, &severity);

    // Copy the text (plus a newline) before releasing ImageMagick's buffer
    ostringstream message;
    message << description << endl;

    MagickRelinquishMemory(description);
    DestroyMagickWand(wand);
    MagickWandTerminus();
    throw ReflowException(message.str().c_str());
}
// Mirror the image stored in file_name, vertically when y_flip is set and
// horizontally when x_flip is set, then write it back out.
// Throws ReflowException if any ImageMagick call fails.
static void flip_image(string file_name, bool x_flip, bool y_flip) {
    MagickWandGenesis();
    MagickWand *wand = NewMagickWand();

    if (MagickReadImage(wand, file_name.c_str()) == MagickFalse)
        throw_magick_exception(wand);

    if (y_flip && MagickFlipImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    if (x_flip && MagickFlopImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    // A NULL filename is passed, as in the read/modify/write-back idiom
    if (MagickWriteImage(wand, NULL) == MagickFalse)
        throw_magick_exception(wand);

    DestroyMagickWand(wand);
    MagickWandTerminus();
}
// Record a new image and write its pixel data to a file named by
// file_name(). DCT (JPEG) streams are copied out unchanged; every other
// stream is decoded through colorMap and written as an 8 bit RGB PNG.
// Afterwards the file is flipped in place if the image placement needs it.
// Throws ReflowException when the output file cannot be opened. The ref,
// interpolate, maskColors and inline_img parameters are not used here.
void XMLImages::add(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, GfxImageColorMap *colorMap,
bool interpolate, int *maskColors, bool inline_img) {
// The image is registered first, because file_name() derives the name from
// the position of the image in the list
XMLImage *img = new XMLImage(state);
this->images.push_back(img);
img->width = width; img->height = height;
img->type = (str->getKind() == strDCT) ? jpeg : png;
string file_name = this->file_name(img);
FILE *of = fopen(file_name.c_str(), "wb");
if (!of) throw ReflowException(strerror(errno));
if (img->type == jpeg) {
int c;
str = ((DCTStream *)str)->getRawStream();
str->reset();
// copy the stream
while ((c = str->getChar()) != EOF) fputc(c, of);
} else { //Render as PNG
Guchar *p;
GfxRGB rgb;
// NOTE(review): the malloc result is not checked, and an exception thrown
// by the PNGWriter calls below would leave row, writer, imgStr and the
// open file unreleased -- confirm whether cleanup is needed here
png_byte *row = (png_byte *) malloc(3 * width); // 3 bytes/pixel: RGB
png_bytep *row_pointer= &row;
PNGWriter *writer = new PNGWriter();
writer->init(of, width, height);
// Initialize the image stream
ImageStream *imgStr = new ImageStream(str, width,
colorMap->getNumPixelComps(), colorMap->getBits());
imgStr->reset();
// For each line...
for (unsigned int y = 0; y < height; y++) {
// Convert into a PNG row
p = imgStr->getLine();
for (unsigned int x = 0; x < width; x++) {
colorMap->getRGB(p, &rgb);
// Write the RGB pixels into the row
row[3*x]= colToByte(rgb.r);
row[3*x+1]= colToByte(rgb.g);
row[3*x+2]= colToByte(rgb.b);
p += colorMap->getNumPixelComps();
}
writer->writeRow(row_pointer);
}
writer->close();
delete writer;
free(row);
imgStr->close();
delete imgStr;
}
fclose(of);
// Only images marked as written are reported by XMLImages::str()
img->written = true;
if (img->info.x_flip || img->info.y_flip)
flip_image(file_name, img->info.x_flip, img->info.y_flip);
}
// Build the output file name for img: "image-N" or "mask-N", where N is the
// 1-based position of img in the corresponding list, followed by ".jpg"
// or ".png" according to the image type. An image that is not in the image
// list is looked up in (and named after) the mask list.
string XMLImages::file_name(const XMLImage *img) const {
    vector<XMLImage*>::const_iterator pos =
        find(this->images.begin(), this->images.end(), img);
    const bool mask = (pos == this->images.end());

    size_t idx;
    if (mask) {
        pos = find(this->masks.begin(), this->masks.end(), img);
        idx = pos - this->masks.begin();
    } else {
        idx = pos - this->images.begin();
    }

    ostringstream name;
    name << ((mask) ? "mask" : "image") << "-" << idx+1 << '.'
         << ((img->type == jpeg) ? "jpg" : "png");
    return name.str();
}
// Serialise every written mask, then every written image, as an <img/>
// element. Each returned string is heap allocated and owned by the caller.
vector<string*> XMLImages::str() const {
    vector<string*> ans;

    for (size_t i = 0; i < this->masks.size(); i++) {
        const XMLImage *mask = this->masks[i];
        if (mask->written)
            ans.push_back(new string(mask->str(i, true, this->file_name(mask))));
    }

    for (size_t i = 0; i < this->images.size(); i++) {
        const XMLImage *image = this->images[i];
        if (image->written)
            ans.push_back(new string(image->str(i, false, this->file_name(image))));
    }

    return ans;
}
// Serialise this image as an <img/> element: kind (mask or image), source
// file, pixel size (iwidth/iheight), rendered size (rwidth/rheight) and
// position (top/left). The num parameter is not used.
string XMLImage::str(size_t num, bool mask, string file_name) const {
    ostringstream out;
    out << "<img type=\"" << ((mask) ? "mask" : "image") << "\" ";
    out << "src=\"" << file_name << "\" ";
    out << "iwidth=\"" << this->width << "\" iheight=\"" << this->height << "\" ";
    out << "rwidth=\"" << this->info.w1 << "\" rheight=\"" << this->info.h1 << "\" ";
    out << setiosflags(ios::fixed) << setprecision(2);
    out << "top=\"" << this->info.y0 << "\" left=\"" << this->info.x0 << "\"/>";
    return out.str();
}
// Release the libpng write and info structures held by this writer.
PNGWriter::~PNGWriter()
{
    png_destroy_write_struct(&this->png_ptr, &this->info_ptr);
}
// Prepare to write an 8 bit, non-interlaced RGB PNG of width x height
// pixels to the already opened file f, and write the PNG header.
// Throws ReflowException if libpng fails to allocate its structures or
// signals an error.
void PNGWriter::init(FILE *f, int width, int height)
{
/* initialize stuff */
png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr)
throw ReflowException("png_create_write_struct failed");
info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr)
throw ReflowException("png_create_info_struct failed");
// libpng reports errors by longjmp()ing to the jump buffer saved here
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("png_jmpbuf failed");
/* write header */
png_init_io(png_ptr, f);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("Error during writing header");
// Set up the type of PNG image and the compression level
png_set_compression_level(png_ptr, Z_BEST_COMPRESSION);
png_byte bit_depth = 8;
png_byte color_type = PNG_COLOR_TYPE_RGB;
png_byte interlace_type = PNG_INTERLACE_NONE;
png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, interlace_type, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
// NOTE(review): this setjmp runs after png_write_info has returned, so it
// cannot catch errors from that call; it only re-arms the jump buffer, and
// does so in a frame that is about to return -- confirm whether intended
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("error during writing png info bytes");
}
// Write a complete image from an array with one row pointer per image row.
// Throws ReflowException if libpng signals an error.
void PNGWriter::writePointers(png_bytep *rowPointers)
{
    // libpng reports errors by longjmp()ing to the saved jump buffer, so
    // the buffer has to be armed in this frame before libpng is called
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("Error during writing bytes");
    /* write bytes */
    png_write_image(png_ptr, rowPointers);
}
// Write a single row of pixel data to the file.
// Throws ReflowException if libpng signals an error.
void PNGWriter::writeRow(png_bytep *row)
{
    // libpng reports errors by longjmp()ing to the saved jump buffer, so
    // the buffer has to be armed in this frame before libpng is called
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("error during png row write");
    // Write the row to the file
    png_write_rows(png_ptr, row, 1);
}
// Finish the PNG by writing its trailing data.
// Throws ReflowException if libpng signals an error.
void PNGWriter::close()
{
    // libpng reports errors by longjmp()ing to the saved jump buffer, so
    // the buffer has to be armed in this frame before libpng is called
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("Error during end of write");
    /* end write */
    png_write_end(png_ptr, info_ptr);
}
// Write the whole of a SplashBitmap by handing libpng one pointer per
// bitmap row. The bitmap data itself is not copied.
void PNGWriter::write_splash_bitmap(SplashBitmap *bitmap) {
    SplashColorPtr cursor = bitmap->getDataPtr();
    const int height = bitmap->getHeight();
    const int row_size = bitmap->getRowSize();

    png_bytep *row_pointers = new png_bytep[height];
    int y = 0;
    while (y < height) {
        row_pointers[y++] = cursor;
        cursor += row_size;
    }

    this->writePointers(row_pointers);
    delete[] row_pointers;
}

View File

@ -0,0 +1,94 @@
#ifndef _CALIBRE_REFLOW_IMAGES
#define _CALIBRE_REFLOW_IMAGES
#include <vector>
#include <GfxState.h>
#include <splash/SplashBitmap.h>
#include <png.h>
using namespace std;
namespace calibre_reflow {
// Output file format of an extracted image.
enum ImageType {
jpeg, png
};
class PNGWriter
{
public:
PNGWriter() {}
~PNGWriter();
void init(FILE *f, int width, int height);
void writePointers(png_bytep *rowPointers);
void writeRow(png_bytep *row);
void write_splash_bitmap(SplashBitmap *bitmap);
void close();
private:
png_structp png_ptr;
png_infop info_ptr;
};
// Placement of an image on the page, computed from the graphics state by
// the constructor. All fields are private; XMLImage and XMLImages read them
// as friends.
class ImageInfo {
public:
ImageInfo(GfxState *state);
private:
int x0, y0; // top left corner of image
int w0, h0, w1, h1; // size of image
// Scratch values filled in by the GfxState transform calls
double xt, yt, wt, ht;
// Whether the image is rotated, and whether it has to be mirrored
// horizontally (x_flip) or vertically (y_flip)
bool rotate, x_flip, y_flip;
friend class XMLImage;
friend class XMLImages;
};
// A single image extracted from the PDF. Instances are filled in by
// XMLImages, which is a friend of this class.
class XMLImage {
private:
// Initialised to 0 by the constructor
double x, y;
// Size of the image data in pixels
unsigned int width, height;
ImageType type;
// Set once the image file has been written out
bool written;
ImageInfo info;
friend class XMLImages;
public:
XMLImage(GfxState *state) :
x(0.), y(0.), width(0), height(0), type(jpeg), written(false), info(state)
{}
~XMLImage() {}
// Serialise as an <img/> element
string str(size_t num, bool mask, string file_name) const;
};
// Collection of the images and image masks gathered so far, stored as raw
// XMLImage pointers. The destructor calls clear().
// NOTE(review): clear() presumably deletes the stored XMLImage objects --
// its definition is not visible here, confirm.
class XMLImages {
private:
vector<XMLImage*> images;
vector<XMLImage*> masks;
public:
~XMLImages() { this->clear(); }
// Record an image mask drawn with the given graphics state and stream.
void add_mask(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, bool invert,
bool interpolate, bool inline_img);
// Record an image drawn with the given graphics state, stream and colour map.
void add(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, GfxImageColorMap *colorMap,
bool interpolate, int *maskColors, bool inline_img);
// File name associated with the given image.
string file_name(const XMLImage *img) const;
// Textual form of every stored image. The strings are heap allocated and
// returned by pointer; XMLOutputDev::endPage() deletes each one after use.
vector<string*> str() const;
void clear();
};
}
#endif

View File

@ -0,0 +1,56 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include "links.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
/**
 * Copy-assign another link: rectangle plus a deep copy of the destination.
 *
 * Two defects of the previous version are fixed:
 *  - a source link without a destination (dest == NULL, as produced by the
 *    default constructor) was dereferenced unconditionally;
 *  - the old destination was deleted before the new one was allocated, so a
 *    failed allocation left this->dest dangling.
 *
 * @param x link to copy from
 * @return *this
 */
XMLLink& XMLLink::operator=(const XMLLink &x) {
    if (this == &x) return *this;

    // Allocate first: if this throws, *this is still untouched.
    string *dest_copy = (x.dest != NULL) ? new string(*x.dest) : NULL;
    delete this->dest;
    this->dest = dest_copy;

    this->x_min = x.x_min;
    this->y_min = x.y_min;
    this->x_max = x.x_max;
    this->y_max = x.y_max;
    return *this;
}
/**
 * Test whether a box lies on this link.
 *
 * The box counts as inside when the vertical midpoint of [ymin, ymax] falls
 * strictly above y_min and not above y_max, and its horizontal extent
 * overlaps the link's horizontal extent.
 */
bool XMLLink::in_link(double xmin, double ymin, double xmax, double ymax) const {
    const double mid_y = (ymin + ymax) / 2;
    const bool vertical_hit = !(mid_y > this->y_max) && (mid_y > this->y_min);
    const bool horizontal_hit = (xmin < this->x_max) && (xmax > this->x_min);
    return vertical_hit && horizontal_hit;
}
/**
 * Build the opening anchor tag for this link.
 *
 * The destination, when there is one, is passed through encode_for_xml()
 * before being placed in the href attribute; without a destination the
 * attribute is left empty.
 */
string XMLLink::get_link_start() {
    ostringstream tag;
    tag << "<a href=\"";
    if (this->dest != NULL) {
        tag << encode_for_xml(*this->dest);
    }
    tag << "\">";
    return tag.str();
}
/**
 * Delete every XMLLink held by pointer, then empty the container.
 */
XMLLinks::~XMLLinks() {
    const XMLLinks::size_type count = this->size();
    for (XMLLinks::size_type idx = 0; idx < count; ++idx) {
        delete (*this)[idx];
    }
    this->clear();
}
bool XMLLinks::in_link(double xmin, double ymin, double xmax,
double ymax, XMLLinks::size_type &p) const {
for(XMLLinks::const_iterator i = this->begin(); i != this->end(); i++) {
if ( (*i)->in_link(xmin, ymin, xmax, ymax) ) {
p = (i - this->begin());
return true;
}
}
return false;
}

View File

@ -0,0 +1,69 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef _CALIBRE_XML_LINKS
#define _CALIBRE_XML_LINKS
#include <vector>
#include <sstream>
using namespace std;
namespace calibre_reflow {
class XMLLink {
private:
double x_min;
double y_min;
double x_max;
double y_max;
string* dest;
public:
XMLLink() : dest(NULL) {}
XMLLink(const XMLLink& x) :
x_min(x.x_min), y_min(x.y_min), x_max(x.x_max),
y_max(x.y_max), dest(new string(*x.dest)) {}
XMLLink(double x_min, double y_min, double x_max,
double y_max, const char *dest) :
x_min((x_min < x_max) ? x_min : x_max),
y_min((y_min < y_max) ? y_min : y_max),
x_max((x_max > x_min) ? x_max : x_min),
y_max((y_max > y_min) ? y_max : y_min),
dest(new string(dest)) {}
~XMLLink() { delete this->dest; }
string* get_dest() { return this->dest; }
double get_x1() const {return x_min;}
double get_x2() const {return x_max;}
double get_y1() const {return y_min;}
double get_y2() const {return y_max;}
XMLLink& operator=(const XMLLink &x);
bool operator==(const XMLLink &x) const {
return (this->dest != NULL) && (x.dest != NULL) &&
this->dest->compare(*x.dest) == 0;
}
bool in_link(double xmin, double ymin, double xmax, double ymax) const;
string get_link_start();
};
// A vector of heap-allocated XMLLink objects. The destructor (defined in the
// implementation file) deletes every stored link, so pointers added to this
// container are owned by it.
class XMLLinks : public vector<XMLLink*> {
public:
~XMLLinks();
// Looks for the first link containing the given box; on success stores its
// index in `p` and returns true.
bool in_link(double xmin, double ymin, double xmax,
double ymax, XMLLinks::size_type &p) const;
};
}
#endif

View File

@ -0,0 +1,198 @@
#ifndef PDF2XML
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#endif
#include "reflow.h"
using namespace std;
using namespace calibre_reflow;
#ifndef PDF2XML
extern "C" {
/*
 * reflow(pdf_data): build a Reflow over the supplied bytes and render it.
 *
 * Any C++ exception is translated into a Python RuntimeError. Returns None
 * on success.
 */
static PyObject *
pdfreflow_reflow(PyObject *self, PyObject *args) {
    char *buffer;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "s#", &buffer, &length)) {
        return NULL;
    }

    try {
        Reflow reflow(buffer, static_cast<std::ifstream::pos_type>(length));
        reflow.render();
    }
    catch (std::exception &err) {
        PyErr_SetString(PyExc_RuntimeError, err.what());
        return NULL;
    }
    catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while rendering PDF");
        return NULL;
    }

    Py_RETURN_NONE;
}
/*
 * get_metadata(pdf_data, cover): return a dict with the PDF's info entries.
 *
 * When `cover` is true the dict also gets a "cover" key holding the rendered
 * first page as a byte string, or None if the document is locked.
 *
 * Reference counting fixes relative to the previous version:
 *  - the result dict is released on every error path;
 *  - PyDict_SetItem / PyDict_SetItemString do not steal references, so the
 *    cover string and every key/value object are released after insertion;
 *  - an error from PyObject_IsTrue() is propagated instead of being treated
 *    as "true".
 */
static PyObject *
pdfreflow_get_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    map<string,string> info;
    PyObject *cover;

    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &cover))
        return NULL;

    const int want_cover = PyObject_IsTrue(cover);
    if (want_cover == -1) return NULL;

    PyObject *ans = PyDict_New();
    if (!ans) return PyErr_NoMemory();

    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        info = reflow.get_info();
        if (want_cover) {
            if (!reflow.is_locked()) {
                size_t cover_size = 0;
                char *data = reflow.render_first_page(&cover_size);
                PyObject *d = PyString_FromStringAndSize(data, cover_size);
                delete[] data;
                if (d == NULL) { Py_DECREF(ans); return NULL; }
                const int rc = PyDict_SetItemString(ans, "cover", d);
                Py_DECREF(d);   // the dict holds its own reference now
                if (rc == -1) { Py_DECREF(ans); return NULL; }
            } else {
                if (PyDict_SetItemString(ans, "cover", Py_None) == -1) {
                    Py_DECREF(ans);
                    return NULL;
                }
            }
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        Py_DECREF(ans);
        return NULL;
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while getting metadata from PDF");
        Py_DECREF(ans);
        return NULL;
    }

    for (map<string,string>::const_iterator it = info.begin(); it != info.end(); ++it) {
        PyObject *key = PyUnicode_Decode(it->first.c_str(), it->first.size(), "UTF-8", "replace");
        if (!key) { Py_DECREF(ans); return NULL; }
        PyObject *val = PyUnicode_Decode(it->second.c_str(), it->second.size(), "UTF-8", "replace");
        if (!val) { Py_DECREF(key); Py_DECREF(ans); return NULL; }
        const int rc = PyDict_SetItem(ans, key, val);
        Py_DECREF(key);
        Py_DECREF(val);
        if (rc == -1) { Py_DECREF(ans); return NULL; }
    }
    return ans;
}
/*
 * set_metadata(pdf_data, info_dict): write Title/Author/Keywords from
 * info_dict into the PDF and return whatever Reflow::set_info() produces as
 * a byte string.
 *
 * Only unicode values are used; other values are silently skipped.
 *
 * Fixes relative to the previous version:
 *  - the UTF-8 byte strings whose buffers are handed to set_info() are kept
 *    alive until the call is over and then released (they used to leak);
 *  - a failed UTF-8 conversion no longer leaves a Python error pending while
 *    the function goes on to return a value;
 *  - the error messages now say "encrypted" and "setting metadata".
 */
static PyObject *
pdfreflow_set_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    PyObject *info;

    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info))
        return NULL;

    if (!PyDict_Check(info)) {
        PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary.");
        return NULL;
    }

    const int num_keys = 3;
    char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords";
    char *keys[num_keys] = { Title, Author, Keywords };
    // Owners of the buffers that pinfo points into; one slot per key.
    PyObject *encoded[num_keys] = { NULL, NULL, NULL };
    map<char *, char *> pinfo;

    for (int i = 0; i < num_keys; i++) {
        PyObject *val = PyDict_GetItemString(info, keys[i]);   // borrowed
        if (!val || !PyUnicode_Check(val)) continue;
        encoded[i] = PyUnicode_AsUTF8String(val);
        if (!encoded[i]) {
            PyErr_Clear();   // best effort: skip values that cannot be encoded
            continue;
        }
        pinfo[keys[i]] = PyString_AS_STRING(encoded[i]);
    }

    PyObject *ans = NULL;
    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        if (reflow.is_locked()) {
            PyErr_SetString(PyExc_ValueError,
                    "Setting metadata not possible in encrypted PDFs");
        } else {
            string result = reflow.set_info(pinfo);
            ans = PyString_FromStringAndSize(result.c_str(), result.size());
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while setting metadata in PDF");
    }

    // pinfo is no longer used past this point, so the buffers can go.
    for (int i = 0; i < num_keys; i++) {
        Py_XDECREF(encoded[i]);
    }
    return ans;
}
/*
 * Method table of the pdfreflow module. The docstring of set_metadata used
 * to advertise itself as "get_metadata(info_dict)"; it now shows the real
 * name and both arguments the function parses.
 */
static
PyMethodDef pdfreflow_methods[] = {
    {"reflow", pdfreflow_reflow, METH_VARARGS,
     "reflow(pdf_data)\n\n"
     "Reflow the specified PDF."
    },

    {"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
     "get_metadata(pdf_data, cover)\n\n"
     "Get metadata and (optionally) cover from the specified PDF."
    },

    {"set_metadata", pdfreflow_set_metadata, METH_VARARGS,
     "set_metadata(pdf_data, info_dict)\n\n"
     "Set metadata in the specified PDF. Currently broken."
    },

    {NULL, NULL, 0, NULL}   // sentinel
};
/*
 * Module initialisation entry point: registers the "pdfreflow" module with
 * its method table. Nothing else is added to the module, so the result of
 * Py_InitModule3 needs no further handling.
 */
PyMODINIT_FUNC
initpdfreflow(void)
{
    (void)Py_InitModule3("pdfreflow", pdfreflow_methods,
                         "Reflow a PDF file");
}
}
#else
int main(int argc, char **argv) {
char *memblock;
ifstream::pos_type size;
if (argc != 2) {
cerr << "Usage: " << argv[0] << " file.pdf" << endl;
return 1;
}
ifstream file (argv[1], ios::in|ios::binary|ios::ate);
if (file.is_open()) {
size = file.tellg();
memblock = new char[size];
file.seekg (0, ios::beg);
file.read (memblock, size);
file.close();
} else {
cerr << "Unable to read from: " << argv[1] << endl;
return 1;
}
try {
Reflow reflow(memblock, size);
reflow.render();
size_t sz = 0;
char *data = reflow.render_first_page(&sz);
ofstream file("cover.png", ios::binary);
file.write(data, sz);
file.close();
} catch(exception &e) {
cerr << e.what() << endl;
return 1;
}
return 0;
}
#endif

View File

@ -0,0 +1,974 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include <Object.h>
#include <Outline.h>
#include <PDFDocEncoding.h>
#include <goo/GooList.h>
#include <SplashOutputDev.h>
#include <splash/SplashBitmap.h>
#include <splash/SplashErrorCodes.h>
#include "reflow.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
// Keys looked up in the document info dictionary by Reflow::get_info();
// num_info_keys must match the number of entries in info_keys.
static const size_t num_info_keys = 8;
static const char* info_keys[num_info_keys] = {
"Title", "Subject", "Keywords", "Author", "Creator", "Producer",
"CreationDate", "ModDate"
};
//------------------------------------------------------------------------
// XMLString
//------------------------------------------------------------------------
/**
 * Start an empty string at the current text position of `state`.
 *
 * The vertical extent is derived from the current font's ascent/descent
 * (clamped to 1.05 and -0.4) scaled by current_font_size, and the font is
 * registered with `fonts` to obtain font_idx. Without a current font a fixed
 * 0.95/0.35 split is used instead.
 */
XMLString::XMLString(GfxState *state, GooString *s, double current_font_size,
                     Fonts *fonts) :
    text(new vector<Unicode>(0)), x_right(new vector<double>(0)),
    yx_next(NULL), xy_next(NULL), fonts(fonts), font_idx(0), xml_text(NULL),
    link(NULL), x_min(0), x_max(0), y_min(0), y_max(0), col(0), dir(text_dir_unknown)
{
    double dev_x = 0, dev_y = 0;
    state->transform(state->getCurX(), state->getCurY(), &dev_x, &dev_y);

    GfxFont *font = state->getFont();
    if (!font) {
        // this means that the PDF file draws text without a current font,
        // which should never happen
        this->y_min = dev_y - 0.95 * current_font_size;
        this->y_max = dev_y + 0.35 * current_font_size;
    } else {
        double ascent = font->getAscent();
        double descent = font->getDescent();
        // Clamp implausible font metrics.
        if (ascent > 1.05) ascent = 1.05;
        if (descent < -0.4) descent = -0.4;
        this->y_min = dev_y - ascent * current_font_size;
        this->y_max = dev_y - descent * current_font_size;

        GfxRGB rgb;
        state->getFillRGB(&rgb);
        GooString *name = state->getFont()->getName();
        string *font_name = NULL;
        if (name) font_name = new string(name->getCString());
        this->font_idx = this->fonts->add_font(font_name, current_font_size-1, rgb);
    }

    if (this->y_min == this->y_max) {
        // this is a sanity check for a case that shouldn't happen -- but
        // if it does happen, we want to avoid dividing by zero later
        this->y_min = dev_y;
        this->y_max = dev_y + 1;
    }
}
/**
 * Append one character that starts at x and advances by dx.
 *
 * The first character fixes x_min; every character moves x_max to its right
 * edge, which is also recorded in x_right. Text direction is currently always
 * set to left-to-right. y, dy and state are not used.
 */
void XMLString::add_char(GfxState *state, double x, double y,
                         double dx, double dy, Unicode u) {
    if (dir == text_dir_unknown) {
        //dir = UnicodeMap::getDirection(u);
        dir = text_dir_left_right;
    }

    // Grow both vectors in steps of 16 once the text vector is full.
    const vector<Unicode>::size_type used = this->text->size();
    if (used == this->text->capacity()) {
        this->text->reserve(used + 16);
        this->x_right->reserve(this->x_right->size() + 16);
    }

    this->text->push_back(u);
    const bool is_first = (this->length() == 1);
    if (is_first) this->x_min = x;

    this->x_max = x + dx;
    this->x_right->push_back(this->x_max);
    //printf("added char: %f %f xright = %f\n", x, dx, x+dx);
}
/**
 * Finish the string: right-to-left text of more than one character is
 * reversed in place; everything else is left as is.
 */
void XMLString::end_string()
{
    if (this->dir != text_dir_right_left) return;
    if (!(this->length() > 1)) return;

    //printf("will reverse!\n");
    reverse(this->text->begin(), this->text->end());
}
/**
 * Convert `num` Unicode code points to text in the globally configured text
 * encoding, escaping '&', '<' and '>' as XML entities.
 *
 * Code points the encoding cannot represent are dropped.
 *
 * @throws ReflowException if no text encoding map is available
 */
static string encode_unicode_chars(const Unicode *u, size_t num) {
    ostringstream oss;
    UnicodeMap *uMap;
    char buf[10];
    int n;

    if (!(uMap = globalParams->getTextEncoding())) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    for (size_t i = 0; i < num; i++) {
        switch (u[i]) {
            case '&': oss << "&amp;";  break;
            case '<': oss << "&lt;";  break;
            case '>': oss << "&gt;";  break;
            default:
                {
                    // convert unicode to string
                    // One byte of buf is kept back for the terminator; the
                    // full sizeof(buf) used to be offered to the map, so a
                    // completely filled buffer made buf[n] = 0 write one
                    // byte past the end.
                    if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf) - 1)) > 0) {
                        buf[n] = 0;
                        oss << buf;
                    }
                }
        }
    }

    uMap->decRefCnt();
    return oss.str();
}
void XMLString::encode() {
delete this->xml_text;
this->xml_text = new string(encode_unicode_chars(&((*this->text)[0]), this->text->size()));
}
/**
 * Serialise the string as a <text> element carrying the font index and the
 * bounding box (top/left/width/height with two decimals), followed by the
 * already encoded xml_text. xml_text must have been produced by encode().
 */
string XMLString::str() const {
    const double width = this->x_max - this->x_min;
    const double height = this->y_max - this->y_min;

    ostringstream xml;
    xml << "<text font=\"" << this->font_idx << "\" ";
    xml << setiosflags(ios::fixed) << setprecision(2);
    xml << "top=\"" << this->y_min << "\" left=\"" << this->x_min;
    xml << "\" width=\"" << width << "\" ";
    xml << "height=\"" << height << "\">";
    xml << *this->xml_text << "</text>";
    return xml.str();
}
/**
 * Release everything the string owns: the code point vector, the x_right
 * vector and the encoded text. xml_text is allocated by encode() and used to
 * be leaked here; it is NULL (a harmless delete) if encode() never ran.
 */
XMLString::~XMLString() {
    delete this->text;
    delete this->x_right;
    delete this->xml_text;
}
//------------------------------------------------------------------------
// XMLPage
//------------------------------------------------------------------------
/**
 * Begin a page: initialise the string lists and write the opening <page>
 * element (number, width, height) to the output stream.
 *
 * @throws ReflowException if the stream is in a failed state after writing
 */
XMLPage::XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts) :
    current_string(NULL), num(num), output(output), current_font_size(0.0),
    yx_strings(NULL), xy_strings(NULL), yx_cur1(NULL), yx_cur2(NULL),
    fonts(fonts), links(new XMLLinks())
{
    ofstream &out = *this->output;
    out << setiosflags(ios::fixed) << setprecision(2);
    out << "\t\t<page number=\"" << this->num;
    out << "\" width=\"" << state->getPageWidth();
    out << "\" height=\"" << state->getPageHeight();
    out << "\">" << endl;

    if (!out) throw ReflowException(strerror(errno));
}
/**
 * Close the <page> element and free the page's strings and links.
 *
 * Fixes relative to the previous version:
 *  - the list walk read tmp->yx_next after `delete tmp` (use after free);
 *    the successor is now fetched before the node is deleted;
 *  - resources are released before a write failure is reported, so nothing
 *    leaks when the exception is raised.
 *
 * @throws ReflowException if writing the closing tag failed (behaviour kept
 *         from the original; note that this is thrown from a destructor)
 */
XMLPage::~XMLPage() {
    (*this->output) << "\t\t</page>" << endl;
    const bool write_failed = !(*this->output);
    const int saved_errno = errno;   // the cleanup below may disturb errno

    XMLString *cur = this->yx_strings;
    while (cur) {
        XMLString *next = cur->yx_next;
        delete cur;
        cur = next;
    }
    this->yx_strings = NULL;

    delete this->links;
    this->links = NULL;

    if (write_failed) throw ReflowException(strerror(saved_errno));
}
/**
 * Refresh current_font_size from the graphics state.
 *
 * For Type 3 fonts the size is additionally scaled by a guess at the font's
 * coordinate system, see the comments below.
 */
void XMLPage::update_font(GfxState *state) {
    current_font_size = state->getTransformedFontSize();

    GfxFont *font = state->getFont();
    if (!font || font->getType() != fontType3) return;

    // This is a hack which makes it possible to deal with some Type 3
    // fonts.  The problem is that it's impossible to know what the
    // base coordinate system used in the font is without actually
    // rendering the font.  This code tries to guess by looking at the
    // width of the character 'm' (which breaks if the font is a
    // subset that doesn't contain 'm').
    Gfx8BitFont *font8 = (Gfx8BitFont *)font;
    int code = 0;
    while (code < 256) {
        char *glyph_name = font8->getCharName(code);
        if (glyph_name && glyph_name[0] == 'm' && glyph_name[1] == '\0') break;
        ++code;
    }

    if (code < 256) {
        const double m_width = font8->getWidth(code);
        if (m_width != 0) {
            // 600 is a generic average 'm' width -- yes, this is a hack
            current_font_size *= m_width / 0.6;
        }
    }

    double *matrix = font->getFontMatrix();
    if (matrix[0] != 0) {
        current_font_size *= fabs(matrix[3] / matrix[0]);
    }
}
/**
 * Add the Unicode characters of one drawn glyph to the current string.
 *
 * Hidden text (render mode with both low bits set) is ignored. If the glyph
 * does not belong to the current string, that string is finished and a new
 * one is started. The glyph advance, minus character spacing, is divided
 * evenly among the uLen code points.
 */
void XMLPage::draw_char(GfxState *state, double x, double y,
                        double dx, double dy,
                        double originX, double originY,
                        CharCode code, int nBytes, Unicode *u, int uLen) {
    const bool hidden = (state->getRender() & 3) == 3;
    if (hidden) return; //Hidden text

    double dev_x, dev_y;
    state->transform(x, y, &dev_x, &dev_y);

    // check that new character is in the same direction as current string
    // and is not too far away from it before adding
    if (this->current_string->character_does_not_belong_to_string(state, dev_x)) {
        this->end_string();
        this->begin_string(state, NULL);
    }

    // Remove the character spacing from the advance before transforming it.
    double space_dx, space_dy;
    state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
                              0, &space_dx, &space_dy);
    dx -= space_dx;
    dy -= space_dy;

    double step_w, step_h;
    state->transformDelta(dx, dy, &step_w, &step_h);
    if (uLen != 0) {
        step_w /= uLen;
        step_h /= uLen;
    }

    for (int k = 0; k < uLen; ++k) {
        this->current_string->add_char(state, dev_x + k*step_w, dev_y + k*step_h,
                                       step_w, step_h, u[k]);
    }
}
// Finish the string currently being built and link it into the page's
// y-major list (yx_strings, chained through yx_next). Empty strings are
// deleted instead. In every case current_string is NULL on return.
void XMLPage::end_string() {
XMLString *p1 = NULL, *p2 = NULL;
double h, y1, y2;
// throw away zero-length strings -- they don't have valid xMin/xMax
// values, and they're useless anyway
if (this->current_string->length() == 0) {
delete this->current_string;
this->current_string = NULL;
return;
}
this->current_string->end_string();
// insert string in y-major list
// y1 and y2 are reference heights at 50% and 80% of the string's height,
// used for the ordering comparisons below.
h = this->current_string->height();
y1 = this->current_string->y_min + 0.5 * h;
y2 = this->current_string->y_min + 0.8 * h;
// The gFalse branch is never taken; it is kept as the disabled
// "raw order" variant.
if (gFalse) { //rawOrder
p1 = this->yx_cur1;
p2 = NULL;
} else if (
// Fast path: the new string sorts after the previously inserted string
// (yx_cur1) and before that string's recorded successor (yx_cur2), so it
// can be linked in between them without searching.
(!this->yx_cur1 ||
(y1 >= this->yx_cur1->y_min &&
(y2 >= this->yx_cur1->y_max ||
this->current_string->x_max >= this->yx_cur1->x_min))) &&
(!this->yx_cur2 ||
(y1 < this->yx_cur2->y_min ||
(y2 < this->yx_cur2->y_max &&
this->current_string->x_max < this->yx_cur2->x_min)))
) {
p1 = this->yx_cur1;
p2 = this->yx_cur2;
} else {
// Slow path: scan from the head for the first string that must come after
// the new one; p1 trails one node behind p2.
for (p1 = NULL, p2 = this->yx_strings; p2; p1 = p2, p2 = p2->yx_next) {
if (y1 < p2->y_min || (y2 < p2->y_max && this->current_string->x_max < p2->x_min))
break;
}
this->yx_cur2 = p2;
}
// Remember the insertion point for the next call, then splice the new
// string between p1 and p2 (p1 == NULL means it becomes the new head).
this->yx_cur1 = this->current_string;
if (p1)
p1->yx_next = this->current_string;
else
this->yx_strings = this->current_string;
this->current_string->yx_next = p2;
this->current_string = NULL;
}
void XMLPage::end() {
XMLLinks::size_type link_index = 0;
Fonts::size_type pos = 0;
XMLFont* h;
for (XMLString *tmp = this->yx_strings; tmp; tmp = tmp->yx_next) {
pos = tmp->font_idx;
h = this->fonts->at(pos);
tmp->encode();
if (this->links->in_link(
tmp->x_min, tmp->y_min, tmp->x_max, tmp->y_max, link_index)) {
tmp->link = links->at(link_index);
}
}
this->coalesce();
for (XMLString *tmp = yx_strings; tmp; tmp=tmp->yx_next) {
if (tmp->xml_text && tmp->xml_text->size() > 0) {
(*this->output) << "\t\t\t" << tmp->str() << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
}
}
}
/**
 * Return a pointer to the LAST occurrence of `ss` in `s`, or NULL when `ss`
 * does not occur. Overlapping occurrences are considered, because the search
 * resumes one character after each hit.
 */
static const char *strrstr( const char *s, const char *ss )
{
    const char *last = NULL;
    const char *hit = strstr(s, ss);
    while (hit != NULL) {
        last = hit;
        hit = strstr(hit + 1, ss);
    }
    return last;
}
// Append the closing tags for whichever of <a>, <em> and <strong> are
// flagged as needing to be finished. When more than one is pending, the
// position of the last opening tag of each kind in xml_text is used to decide
// which one to close first. finish_a and finish_italic are cleared when their
// closing tag is emitted by one of the first two steps; finish_bold is never
// modified.
static void close_tags( string *xml_text, bool &finish_a, bool &finish_italic, bool &finish_bold )
{
// Each position is only looked up when that tag AND at least one other tag
// are pending; otherwise it stays NULL.
const char *last_italic = finish_italic && ( finish_bold || finish_a ) ? strrstr( xml_text->c_str(), "<em>" ) : NULL;
const char *last_bold = finish_bold && ( finish_italic || finish_a ) ? strrstr( xml_text->c_str(), "<strong>" ) : NULL;
const char *last_a = finish_a && ( finish_italic || finish_bold ) ? strrstr( xml_text->c_str(), "<a " ) : NULL;
// Close the link first if it was opened after both other tags.
// NOTE(review): these comparisons may involve NULL pointers (tag pending but
// not found, or not pending); relational comparison with NULL is not
// guaranteed by the language -- confirm this is acceptable.
if( finish_a && ( finish_italic || finish_bold ) && last_a > ( last_italic > last_bold ? last_italic : last_bold ) ) {
xml_text->append("</a>");
finish_a = false;
}
// Close italics before bold if italics were opened later.
if( finish_italic && finish_bold && last_italic > last_bold ){
xml_text->append("</em>");
finish_italic = false;
}
// Whatever is still pending is closed in the fixed order strong, em, a.
if( finish_bold )
xml_text->append("</strong>");
if( finish_italic )
xml_text->append("</em>");
if( finish_a )
xml_text->append("</a>");
}
/**
 * Post-process the page's string list (yx_strings):
 *
 *  1. drop strings that duplicate an earlier string at almost the same
 *     position (fake boldface, drop shadows);
 *  2. merge neighbouring strings that are on the same line, close enough
 *     horizontally, share a font up to inline attributes and have the same
 *     text direction, inserting <strong>/<em>/<a> markup where the inline
 *     style or link changes.
 *
 * Must run after encode() has been called on every string, because it edits
 * xml_text.
 *
 * Fix: the duplicate test used memcmp() on the `text` members, which are
 * pointers to vectors, so it compared the bytes of the two vector OBJECTS
 * (and read past them for long strings) instead of their elements. The
 * element sequences are now compared with vector's operator==.
 */
void XMLPage::coalesce() {
    XMLString *str1, *str2, *str3;
    XMLFont *hfont1, *hfont2;
    double space, hor_space, size, x_limit;
    bool add_space, found;
    int n, i;
    double cur_x, cur_y;

    str1 = this->yx_strings;

    if( !str1 ) return;

    //----- discard duplicated text (fake boldface, drop shadows)

    while (str1)
    {
        size = str1->y_max - str1->y_min;
        x_limit = str1->x_min + size * 0.2;
        found = false;
        // str2 trails one node behind the candidate str3.
        for (str2 = str1, str3 = str1->yx_next;
             str3 && str3->x_min < x_limit;
             str2 = str3, str3 = str2->yx_next)
        {
            if (str3->length() == str1->length() &&
                *str3->text == *str1->text &&
                fabs(str3->y_min - str1->y_min) < size * 0.2 &&
                fabs(str3->y_max - str1->y_max) < size * 0.2 &&
                fabs(str3->x_max - str1->x_max) < size * 0.2)
            {
                found = true;
                //printf("found duplicate!\n");
                break;
            }
        }
        if (found)
        {
            // Unlink and delete the duplicate, then rescan from str1.
            str2->xy_next = str3->xy_next;
            str2->yx_next = str3->yx_next;
            delete str3;
        }
        else
        {
            str1 = str1->yx_next;
        }
    }

    //----- merge neighbouring strings and add inline markup

    str1 = yx_strings;

    // Open the markup required by the very first string.
    hfont1 = this->fonts->at(str1->font_idx);
    if( hfont1->is_bold() )
        str1->xml_text->insert(0, "<strong>");
    if( hfont1->is_italic() )
        str1->xml_text->insert(0, "<em>");
    if (str1->get_link())
        str1->xml_text->insert(0, str1->get_link()->get_link_start());
    // cur_x/cur_y remember the origin of the merged run that str1 heads.
    cur_x = str1->x_min; cur_y = str1->y_min;

    while (str1 && (str2 = str1->yx_next)) {
        hfont2 = this->fonts->at(str2->font_idx);
        space = str1->y_max - str1->y_min;
        hor_space = str2->x_min - str1->x_max;

        if (
            (
             (
              (str2->y_min < str1->y_max)
              &&
              (hor_space > -0.5 * space && hor_space < space)
             )
            ) &&
            (hfont1->eq_upto_inline(*hfont2)) &&
            str1->dir == str2->dir // text direction the same
           )
        {
            // ----- merge str2 into str1
            n = str1->length() + str2->length();
            if ((add_space = hor_space > 0.1 * space)) {
                ++n;
            }

            str1->text->reserve((n + 15) & ~15);
            str1->x_right->reserve((n + 15) & ~15);

            if (add_space) {
                str1->text->push_back(0x20);
                str1->xml_text->push_back(' ');
                str1->x_right->push_back(str2->x_min);
            }

            for (i = 0; i < str2->length(); i++) {
                str1->text->push_back(str2->text->at(i));
                str1->x_right->push_back(str2->x_right->at(i));
            }

            /* fix <i>, <b> if str1 and str2 differ and handle switch of links */
            XMLLink *hlink1 = str1->get_link();
            XMLLink *hlink2 = str2->get_link();
            bool switch_links = !hlink1 || !hlink2 || !((*hlink1) == (*hlink2));
            bool finish_a = switch_links && hlink1 != NULL;
            bool finish_italic = hfont1->is_italic() && ( !hfont2->is_italic() || finish_a );
            bool finish_bold = hfont1->is_bold() &&
                ( !hfont2->is_bold() || finish_a || finish_italic );
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
            if( switch_links && hlink2 != NULL ) {
                string ls = hlink2->get_link_start();
                str1->xml_text->append(ls);
            }
            if( ( !hfont1->is_italic() || finish_italic ) && hfont2->is_italic() )
                str1->xml_text->append("<em>");
            if( ( !hfont1->is_bold() || finish_bold ) && hfont2->is_bold() )
                str1->xml_text->append("<strong>");

            str1->xml_text->append(*str2->xml_text);
            // str1 now contains href for link of str2 (if it is defined)
            str1->link = str2->link;
            hfont1 = hfont2;
            if (str2->x_max > str1->x_max) {
                str1->x_max = str2->x_max;
            }
            if (str2->y_max > str1->y_max) {
                str1->y_max = str2->y_max;
            }
            str1->yx_next = str2->yx_next;
            delete str2;
        } else { // keep strings separate
            bool finish_a = str1->get_link() != NULL;
            bool finish_bold = hfont1->is_bold();
            bool finish_italic = hfont1->is_italic();
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );

            str1->x_min = cur_x; str1->y_min = cur_y;
            str1 = str2;
            cur_x = str1->x_min; cur_y = str1->y_min;
            hfont1 = hfont2;
            if ( hfont1->is_bold() )
                str1->xml_text->insert(0, "<strong>");
            if( hfont1->is_italic() )
                str1->xml_text->insert(0, "<em>");
            if( str1->get_link() != NULL ) {
                str1->xml_text->insert(0, str1->get_link()->get_link_start());
            }
        }
    }

    // Close whatever is still open on the last run.
    str1->x_min = cur_x; str1->y_min = cur_y;

    bool finish_bold = hfont1->is_bold();
    bool finish_italic = hfont1->is_italic();
    bool finish_a = str1->get_link() != NULL;
    close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
}
//------------------------------------------------------------------------
// XMLOutputDev
//------------------------------------------------------------------------
/**
 * Create the output device: opens (truncating) index.xml in the current
 * directory and writes the opening <pdfreflow> and <pages> elements.
 *
 * @throws ReflowException if the file cannot be opened or written
 */
XMLOutputDev::XMLOutputDev(PDFDoc *doc) :
    current_page(NULL), output(new ofstream("index.xml", ios::trunc)),
    fonts(new Fonts()), catalog(NULL), images(new XMLImages()), doc(doc)
{
    ofstream &out = *this->output;
    if (!out) {
        throw ReflowException(strerror(errno));
    }

    out << "<pdfreflow>" << endl;
    out << "\t<pages>" << endl;
    if (!out) throw ReflowException(strerror(errno));
}
/**
 * Finish index.xml: closes <pages>, writes the <fonts> section (one entry
 * per registered font, numbered by position), closes <pdfreflow>, then
 * closes the file and releases the stream, the fonts and the images.
 *
 * @throws ReflowException after any write that leaves the stream failed
 */
XMLOutputDev::~XMLOutputDev() {
    ofstream &out = *this->output;

    out << "\t</pages>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "\t<fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    const Fonts::const_iterator first = this->fonts->begin();
    for (Fonts::const_iterator it = first; it != this->fonts->end(); ++it) {
        out << "\t\t" << (*it)->str(it - first) << endl;
        if (!out) throw ReflowException(strerror(errno));
    }

    out << "\t</fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "</pdfreflow>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out.close();
    delete this->output;
    delete this->fonts;
    delete this->images;
}
/**
 * Turn a link action into a destination string.
 *
 *  - GoTo:   "#<page>:l=<left>t=<top>" (two decimals), resolving named
 *            destinations and page references through `doc`
 *  - GoToR:  "<file>#<page>" (the page part only when both a file name and
 *            a destination are present)
 *  - URI:    the URI
 *  - Launch: the file name
 *  - other:  empty string
 *
 * Fixes relative to the previous version: the copied destination of a GoToR
 * action was leaked when the action had no file name, and URI / Launch
 * actions without a URI or file name dereferenced a NULL pointer.
 */
static string get_link_dest(LinkAction *link, PDFDoc *doc) {
    unsigned int page = 1;
    ostringstream oss;

    switch(link->getKind())
    {
        case actionGoTo:
            {
                LinkGoTo *ha = (LinkGoTo *)link;
                LinkDest *dest = NULL;
                if (ha->getDest() != NULL)
                    dest = ha->getDest()->copy();
                else if (ha->getNamedDest() != NULL) {
                    dest = doc->findDest(ha->getNamedDest());
                }
                if (dest) {
                    if (dest->isPageRef()) {
                        Ref pageref = dest->getPageRef();
                        page = doc->findPage(pageref.num, pageref.gen);
                    }
                    else {
                        page = dest->getPageNum();
                    }

                    oss << "#" << page
                        << setiosflags(ios::fixed) << setprecision(2)
                        << ":l=" << dest->getLeft()
                        << "t=" << dest->getTop();
                        //<< "r=" << dest->getRight()
                        //<< "b=" << dest->getBottom();
                    delete dest;
                }
                break;
            }

        case actionGoToR:
            {
                LinkGoToR *ha = (LinkGoToR *) link;
                LinkDest *dest = NULL;
                bool has_file = false;
                if (ha->getFileName()) {
                    oss << ha->getFileName()->getCString();
                    has_file = true;
                }
                if (ha->getDest() != NULL) dest = ha->getDest()->copy();
                if (dest) {
                    if (has_file) {
                        if (!(dest->isPageRef())) page = dest->getPageNum();
                        oss << '#' << page;
                    }
                    // The copy is ours whether or not it was used.
                    delete dest;
                }
                break;
            }

        case actionURI:
            {
                LinkURI *ha = (LinkURI *) link;
                if (ha->getURI() != NULL)
                    oss << ha->getURI()->getCString();
                break;
            }

        case actionLaunch:
            {
                LinkLaunch *ha = (LinkLaunch *) link;
                if (ha->getFileName() != NULL)
                    oss << ha->getFileName()->getCString();
                break;
            }

        // Actions without a usable destination string.
        case actionNamed: break;
        case actionMovie: break;
        case actionRendition: break;
        case actionSound: break;
        case actionJavaScript: break;
        case actionUnknown: break;
        default: break;
    }
    return oss.str();
}
/**
 * Register one PDF link with the current page.
 *
 * The link rectangle is converted from user to device coordinates; links
 * without an action or with an empty destination string are ignored.
 */
void XMLOutputDev::process_link(Link* link){
    double user_x1, user_y1, user_x2, user_y2;
    link->getRect(&user_x1, &user_y1, &user_x2, &user_y2);

    int dev_x1, dev_y1, dev_x2, dev_y2;
    cvtUserToDev(user_x1, user_y1, &dev_x1, &dev_y1);
    cvtUserToDev(user_x2, user_y2, &dev_x2, &dev_y2);

    LinkAction *action = link->getAction();
    if (action == NULL) return;

    const string dest = get_link_dest(action, this->doc);
    if (dest.length() == 0) return;

    // The y coordinates are passed swapped, exactly as before; XMLLink's
    // constructor normalises the corners anyway.
    XMLLink *xml_link = new XMLLink((double)dev_x1, (double)dev_y2,
                                    (double)dev_x2, (double)dev_y1,
                                    dest.c_str());
    this->current_page->add_link(xml_link);
}
void XMLOutputDev::endPage() {
Links *slinks = catalog->getPage(current_page->number())->getLinks(catalog);
for (int i = 0; i < slinks->getNumLinks(); i++)
{
this->process_link(slinks->getLink(i));
}
delete slinks;
this->current_page->end();
vector<string*> images = this->images->str();
for (vector<string*>::iterator it = images.begin(); it < images.end(); it++) {
(*this->output) << "\t\t\t" << *(*it) << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
delete *it;
}
this->images->clear();
delete this->current_page;
this->current_page = NULL;
}
/**
 * Image masks are not recorded yet: the call is forwarded to the base
 * OutputDev implementation and a notice is printed on stderr.
 */
void XMLOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
                                 int width, int height, GBool invert,
                                 GBool interpolate, GBool inlineImg) {
    OutputDev::drawImageMask(state, ref, str,
                             width, height,
                             invert, interpolate, inlineImg);
    //this->images->add_mask();
    cerr << "mask requested" << endl;
}
/**
 * Record a drawn image in the image collection; width and height are passed
 * on as unsigned values.
 */
void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
                             int width, int height, GfxImageColorMap *colorMap,
                             GBool interpolate, int *maskColors, GBool inlineImg) {
    const unsigned int w = static_cast<unsigned int>(width);
    const unsigned int h = static_cast<unsigned int>(height);
    this->images->add(state, ref, str, w, h,
                      colorMap, interpolate, maskColors, inlineImg);
}
// Placeholder file name reported for PDFs that are read from memory.
static char stream_pdf[15] = "stream.pdf";
// MemStream subclass used to feed an in-memory PDF to PDFDoc. The only
// addition over MemStream is getFileName(), which returns a newly allocated
// GooString holding the placeholder name on every call.
// NOTE(review): whoever calls getFileName() presumably has to free the
// returned GooString -- confirm against the callers inside poppler.
class MemInStream : public MemStream {
public:
MemInStream(char *buf, size_t st, size_t sz, Object *obj) :
MemStream(buf, st, sz, obj) {}
~MemInStream() {}
GooString *getFileName() { return new GooString(stream_pdf); }
};
/**
 * Open a PDF held in memory.
 *
 * Creates the poppler globalParams object on first use, wraps the buffer in
 * a MemInStream and builds a PDFDoc from it. The buffer is only referenced,
 * not copied.
 *
 * @param pdfdata start of the PDF bytes
 * @param sz      number of bytes
 * @throws ReflowException if the document cannot be opened; the message
 *         carries poppler's error code
 */
Reflow::Reflow(char *pdfdata, size_t sz) :
    pdfdata(pdfdata), current_font_size(-1), doc(NULL)
{
    Object obj;
    obj.initNull();

    if (globalParams == NULL) {
        globalParams = new GlobalParams();
        if (!globalParams)
            throw ReflowException("Failed to allocate Globalparams");
    }

    MemInStream *stream = new MemInStream(pdfdata, 0, sz, &obj);
    this->doc = new PDFDoc(stream, NULL, NULL);
    if (this->doc->isOk()) return;

    ostringstream message;
    message << "Failed to open PDF file";
    message << " with error code: " << doc->getErrorCode();
    delete this->doc;
    this->doc = NULL;
    throw ReflowException(message.str().c_str());
}
/**
 * Render every page of the document through an XMLOutputDev (at 96 DPI,
 * using the media box, cropped) and then write the outline.
 *
 * A set copy-protection flag only produces a warning on stdout.
 *
 * @throws ReflowException if the document is encrypted
 */
void
Reflow::render() {
    if (this->doc->isEncrypted()) {
        throw ReflowException("Document is encrypted.");
    }
    if (!this->doc->okToCopy()) {
        cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
    }

    char encoding[10] = "UTF-8";
    globalParams->setTextEncoding(encoding);

    const int first_page = 1;
    const int last_page = doc->getNumPages();
    const double dpi = 96;

    XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
    doc->displayPages(xml_out, first_page, last_page,
                      dpi,   //hDPI
                      dpi,   //vDPI
                      0,     //rotate
                      true,  //UseMediaBox
                      true,  //Crop
                      false  //Printing
                     );

    this->dump_outline();

    delete xml_out;
}
/**
 * Write the document outline to outline.xml in the current directory.
 * Nothing is written when the document has no outline items.
 *
 * The text is assembled in a stack-allocated stream: the previous
 * heap-allocated one was leaked whenever outline_level() threw. The stream
 * state is now checked after close(), so errors that only surface when the
 * file is flushed are reported too.
 *
 * @throws ReflowException if the file cannot be written
 */
void Reflow::dump_outline() {
    Outline *outline = this->doc->getOutline();
    if (!outline) return;
    GooList *items = outline->getItems();
    if ( !items || items->getLength() < 1 )
        return;

    ostringstream output;
    output << "<outline>" << endl;
    this->outline_level(&output, items);
    output << "</outline>" << endl;

    ofstream of("outline.xml", ios::trunc);
    of << output.str();
    of.close();
    if (!of) throw ReflowException("Error writing outline file");
}
/** Write `level` tab characters to *o (nothing for level <= 0). */
static inline void outline_tabs(ostringstream *o, int level) {
    int remaining = level;
    while (remaining > 0) {
        (*o) << "\t";
        --remaining;
    }
}
/**
 * Write one level of the outline, and recursively its children, as a
 * <links level="N"> element containing one <link> per item.
 *
 * Items whose title encodes to an empty string are skipped together with
 * their children.
 *
 * Fix: the destination was written into the dest="..." attribute verbatim,
 * so a URI containing '&' or a quote produced malformed XML. It now goes
 * through encode_for_xml(), the same helper XMLLink uses for href values.
 *
 * @param oss   stream the XML is appended to
 * @param items outline items of this level
 * @param level nesting depth, used for the level attribute and indentation
 */
void Reflow::outline_level(ostringstream *oss, GooList *items, int level)
{
    int num_of_items = items->getLength();
    if (num_of_items > 0) {
        outline_tabs(oss, level);
        (*oss) << "<links level=\"" << level << "\">" << endl;

        for (int i = 0; i < num_of_items; i++) {
            OutlineItem* item = (OutlineItem *)items->get(i);
            Unicode *u = item->getTitle();
            string title = encode_unicode_chars(u, item->getTitleLength());
            if (title.size() < 1) continue;

            outline_tabs(oss, level+1);
            (*oss) << "<link open=\"" << (item->isOpen()?"yes":"no") << "\"";
            LinkAction *a = item->getAction();
            if (a != NULL) {
                string dest = get_link_dest(a, this->doc);
                (*oss) << " dest=\"" << encode_for_xml(dest) << "\"";
            }
            (*oss) << ">" << title << "</link>" << endl;

            // The children are only available after the item was opened.
            item->open();
            GooList *children = item->getKids();
            if (children)
                outline_level(oss, children, level+1);
        }
    }
}
/** Destroy the PDFDoc; the PDF byte buffer itself is not freed here. */
Reflow::~Reflow() {
    delete doc;
}
/**
 * Read the document info dictionary.
 *
 * @return a map from each key in info_keys to its value converted to UTF-8;
 *         keys that are missing or empty are omitted
 *
 * Fix: the Object filled in by getDocInfo() is now released with free(),
 * as decode_info_string() already does for the objects it looks up.
 */
map<string, string> Reflow::get_info() {
    Object info;
    map<string, string> ans;
    string val;

    char encoding[10] = "UTF-8";
    globalParams->setTextEncoding(encoding);

    this->doc->getDocInfo(&info);

    if (info.isDict()) {
        for (size_t i = 0; i < num_info_keys; i++) {
            val = this->decode_info_string(info.getDict(), info_keys[i]);
            if (val.size() > 0) {
                ans[string(info_keys[i])] = val;
            }
        }
    }
    info.free();

    return ans;
}
/**
 * Look up `key` in an info dictionary and return its string value converted
 * to the globally configured text encoding.
 *
 * Values starting with the bytes FE FF are read as big-endian 16-bit code
 * units, everything else is mapped through pdfDocEncoding. A missing or
 * non-string entry yields an empty string.
 *
 * Fixes relative to the previous version:
 *  - the copy of `key` is held in a vector, so it is not leaked when the
 *    function throws;
 *  - the reference obtained from getTextEncoding() is released, matching
 *    encode_unicode_chars();
 *  - strings shorter than two bytes are no longer probed for a byte order
 *    mark, and a dangling odd byte at the end of a 16-bit string is ignored
 *    instead of reading one byte past the end;
 *  - one byte of buf is kept back for the terminator.
 *
 * @throws ReflowException if no text encoding map is available
 */
string Reflow::decode_info_string(Dict *info, const char *key) const {
    Object obj;
    char buf[8];
    ostringstream oss;

    UnicodeMap *umap = globalParams->getTextEncoding();
    if (!umap) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    // Dict::lookup() is given a writable copy of the key, as before.
    vector<char> tmp(key, key + strlen(key) + 1);

    if (info->lookup(&tmp[0], &obj)->isString()) {
        GooString *s1 = obj.getString();
        const int len = s1->getLength();

        const bool is_unicode = len >= 2 &&
            (s1->getChar(0) & 0xff) == 0xfe &&
            (s1->getChar(1) & 0xff) == 0xff;
        int i = is_unicode ? 2 : 0;

        while (i < len) {
            Unicode u;
            if (is_unicode) {
                if (i + 1 >= len) break;   // truncated final code unit
                u = ((s1->getChar(i) & 0xff) << 8) |
                    (s1->getChar(i+1) & 0xff);
                i += 2;
            } else {
                u = pdfDocEncoding[s1->getChar(i) & 0xff];
                ++i;
            }
            const int n = umap->mapUnicode(u, buf, sizeof(buf) - 1);
            if (n > 0) {
                buf[n] = 0;
                oss << buf;
            }
        }
    }
    obj.free();
    umap->decRefCnt();
    return oss.str();
}
/**
 * Render page 1 to a PNG image held in memory.
 *
 * @param data_size    receives the number of bytes in the returned buffer
 * @param use_crop_box use the crop box instead of the media box for sizing
 * @param x_res        horizontal resolution in DPI
 * @param y_res        vertical resolution in DPI
 * @return buffer allocated with new[]; the caller must delete[] it
 * @throws ReflowException if the document is locked or on I/O failure
 *
 * Fixes relative to the previous version:
 *  - the page height is scaled with y_res (it used x_res);
 *  - the SplashOutputDev and the temporary file are released on success and
 *    on failure, and the result buffer is released on failure;
 *  - a failing ftell() is reported instead of being used as a size.
 */
char* Reflow::render_first_page(size_t *data_size,
                                bool use_crop_box, double x_res,
                                double y_res) {
    if (this->is_locked()) throw ReflowException("Document is locked.");

    char encoding[10] = "UTF-8";
    char yes[10] = "yes";
    globalParams->setTextEncoding(encoding);
    globalParams->setEnableFreeType(yes);
    globalParams->setAntialias(yes);
    globalParams->setVectorAntialias(yes);

    SplashColor paper_color;
    paper_color[0] = 255;
    paper_color[1] = 255;
    paper_color[2] = 255;

    SplashOutputDev *out = new SplashOutputDev(splashModeRGB8, 4, false, paper_color);
    FILE *f = NULL;
    PNGWriter *writer = NULL;
    bool writer_ready = false;
    char *buffer = NULL;

    try {
        out->startDoc(doc->getXRef());

        double pg_w, pg_h;
        int pg = 1;
        if (use_crop_box) {
            pg_w = this->doc->getPageCropWidth(pg);
            pg_h = this->doc->getPageCropHeight(pg);
        } else {
            pg_w = this->doc->getPageMediaWidth(pg);
            pg_h = this->doc->getPageMediaHeight(pg);
        }
        // Page sizes are in points (1/72 inch).
        pg_w *= x_res/72.;
        pg_h *= y_res/72.;

        int x=0, y=0;
        this->doc->displayPageSlice(out, pg, x_res, y_res, 0,
                !use_crop_box, false, false, x, y, pg_w, pg_h);

        f = tmpfile();
        if (!f) throw ReflowException(strerror(errno));

        // The bitmap belongs to `out`, which therefore has to outlive the
        // PNG encoding below.
        SplashBitmap *bmp = out->getBitmap();
        writer = new PNGWriter();
        writer->init(f, bmp->getWidth(), bmp->getHeight());
        writer_ready = true;
        writer->write_splash_bitmap(bmp);
        writer->close();
        delete writer;
        writer = NULL;

        long size = ftell(f);
        if (size < 0) throw ReflowException(strerror(errno));
        rewind(f);
        buffer = new char[size];
        *data_size = fread(buffer, 1, size, f);
        if (*data_size != (size_t)size) {
            throw ReflowException("I/O error reading from tmpfile");
        }
    } catch (...) {
        delete[] buffer;
        // A writer whose init() did not complete is deliberately not
        // destroyed, since its libpng handles may not have been set up.
        if (writer_ready) delete writer;
        if (f) fclose(f);
        delete out;
        throw;
    }

    fclose(f);
    delete out;
    return buffer;
}
/**
 * OutStream implementation that collects everything written to it in an
 * in-memory string stream.
 */
class MemOutStream : public OutStream {
    private:
        ostringstream out;

    public:
        MemOutStream() :OutStream() {}
        ~MemOutStream() {}
        // Nothing to release.
        void close() {}
        // Current write position in the collected output.
        int getPos() { return out.tellp(); }
        void put(char c) { out.put(c); }

        // printf-style formatted output.
        //
        // The scratch buffer is now sized with a real element count: the
        // previous code only reserve()d capacity and then wrote through
        // &buf[0] of a still-empty vector, which is undefined. A negative
        // vsnprintf() result (formatting error) is handled explicitly
        // instead of being converted to a huge unsigned length.
        void printf (const char *format, ...) {
            vector<char> buf(strlen(format)*5 + 20);
            for (;;) {
                va_list ap;
                va_start(ap, format);
                const int written = vsnprintf(&buf[0], buf.size(), format, ap);
                va_end(ap);

                if (written < 0) return;   // nothing sensible to emit
                if (static_cast<size_t>(written) < buf.size()) {
                    out.write(&buf[0], written);
                    return;
                }
                // Too small: vsnprintf reported the length it needs.
                buf.resize(static_cast<size_t>(written) + 1);
            }
        }
};
/**
 * Replace entries of the document's Info dictionary with the key/value
 * pairs in sinfo and save the document.
 *
 * NOTE(review): this function is documented as broken in the header. The
 * output path is hard coded, the objects allocated here are never freed in
 * this function, and the returned string is always empty.
 *
 * @throws ReflowException if the XRef table, the trailer dictionary or a
 *         valid Info dictionary cannot be obtained
 */
string Reflow::set_info(map<char *, char *> sinfo) {
    typedef map<char *, char *>::iterator info_iter;

    XRef *xref = this->doc->getXRef();
    if (!xref) throw ReflowException("No XRef table");

    Object *trailer_dict = xref->getTrailerDict();
    if (!trailer_dict) throw ReflowException("No trailer dictionary");
    if (!trailer_dict->isDict()) throw ReflowException("No trailer dictionary");

    Object lookup_result;
    char INFO[5] = "Info";
    Object *info = trailer_dict->dictLookup(INFO, &lookup_result);
    if (!info) {
        // No Info entry yet: start from an empty dictionary
        info = new Object();
        info->initDict(xref);
    }
    if (!info->isDict()) throw ReflowException("Invalid info object");

    for (info_iter entry = sinfo.begin(); entry != sinfo.end(); ++entry) {
        Object *value = new Object();
        value->initString(new GooString(entry->second));
        info->dictSet(entry->first, value);
    }
    trailer_dict->dictSet(INFO, info);

    char out[20] = "/t/out.pdf";
    this->doc->saveAs(new GooString(out), writeForceRewrite);

    return string();
}

View File

@ -0,0 +1,241 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
* Based on pdftohtml from the poppler project.
*/
#ifndef CALIBRE_REFLOW
#define CALIBRE_REFLOW
#define UNICODE
#include <PDFDoc.h>
#include <GlobalParams.h>
#include <GfxState.h>
#include <GfxFont.h>
#include <OutputDev.h>
#include <Link.h>
#include <UnicodeMap.h>
#include <cmath>
#include <exception>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <map>
#include <errno.h>
#include "fonts.h"
#include "links.h"
#include "images.h"
using namespace std;
namespace calibre_reflow {
// Direction in which a run of text is laid out; text_dir_unknown is the
// first enumerator and therefore has the value 0.
enum UnicodeTextDirection {
text_dir_unknown,
text_dir_left_right,
text_dir_right_left,
text_dir_top_bottom
};
/*
 * High level interface to a single PDF document: conversion to XML,
 * access to the Info dictionary, the outline and a rendering of the
 * first page. All operations act on the wrapped PDFDoc.
 */
class Reflow {
private:
// NOTE(review): presumably the raw PDF bytes handed to the constructor;
// ownership is decided in the constructor/destructor (not visible here).
char *pdfdata;
double current_font_size;
// The wrapped document; is_locked() treats a NULL doc as locked.
PDFDoc *doc;
// Return the entry named key of the given Info dictionary as a string.
string decode_info_string(Dict *info, const char *key) const;
// Write the outline entries in items to oss; level defaults to 1.
void outline_level(ostringstream *oss, GooList *items,
int level=1);
public:
// Construct from a buffer xpdfdata that is sz bytes long.
Reflow (char *xpdfdata, size_t sz);
~Reflow();
/* Convert the PDF to XML. All files are output to the current directory */
void render();
/* Get the PDF Info Dictionary */
map<string, string> get_info();
/* True if the PDF is encrypted (or if no document is loaded) */
bool is_locked() const { return !this->doc || this->doc->isEncrypted(); }
/* Return the first page of the PDF, rendered as a PNG image.
 * data_size receives the size of the returned buffer; resolutions are in
 * DPI and default to 150. */
char* render_first_page(size_t *data_size,
bool use_crop_box=true, double x_res=150.0,
double y_res = 150.0);
/* Dump the PDF outline as the file outline.xml in the current directory */
void dump_outline();
/* Set the info dictionary. Currently broken. */
string set_info(map<char *, char *> info);
};
/*
 * A run of characters that share a font, together with its bounding box.
 * Strings are chained into two intrusive singly linked lists (y-major and
 * x-major order); XMLPage is a friend so that it can manage those links.
 */
class XMLString {
private:
vector<Unicode> *text; // the text
vector<double> *x_right; // right-hand x coord of each char
XMLString *yx_next; // next string in y-major order
XMLString *xy_next; // next string in x-major order
Fonts *fonts;
Fonts::size_type font_idx; // index of this string's font in fonts
string *xml_text;
XMLLink *link;
double x_min, x_max; // bounding box x coordinates
double y_min, y_max; // bounding box y coordinates
int col; // starting column
UnicodeTextDirection dir; // direction (left to right/right to left)
friend class XMLPage;
public:
XMLString(GfxState *state, GooString *s, double current_font_size, Fonts *fonts);
~XMLString();
// True when the string is non-empty and x1 lies more than 10% of the
// string's height away from the right edge of its last character, i.e.
// a character starting at x1 is too far away to continue this string.
bool character_does_not_belong_to_string(GfxState *state, double x1) {
return this->length() > 0 &&
fabs(x1 - x_right->at(this->length()-1)) > 0.1 * (y_max - y_min);
}
void add_char(GfxState *state, double x, double y,
double dx, double dy, Unicode u);
void end_string();
// Number of characters collected so far.
inline int length() const { return this->text->size(); }
// Height of the bounding box.
inline double height() const { return y_max - y_min; }
void encode();
XMLLink* get_link() { return this->link; }
string str() const;
};
/*
 * Collects the strings and links of one page while it is being drawn.
 * The output stream and the font table are passed in by the caller.
 */
class XMLPage {
private:
XMLString *current_string; // string currently being assembled
unsigned int num; // page number, see number()
ofstream *output;
double current_font_size;
XMLString *yx_strings; // strings in y-major order
XMLString *xy_strings; // strings in x-major order
XMLString *yx_cur1, *yx_cur2; // cursors for yxStrings list
Fonts *fonts;
XMLLinks *links;
void coalesce();
public:
XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts);
~XMLPage();
void update_font(GfxState *state);
// Start a new string using the current font size and the shared font table.
// NOTE(review): any previous current_string is overwritten here without
// being freed; presumably end_string() has already taken it over - confirm.
void begin_string(GfxState *state, GooString *s) {
this->current_string = new XMLString(state, s,
this->current_font_size, this->fonts);
}
void draw_char(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen);
void end_string();
void end();
// Record a link found on this page.
void add_link(XMLLink *t) { this->links->push_back(t); }
unsigned int number() const { return this->num; }
};
/*
 * OutputDev implementation that forwards the text drawing callbacks to the
 * XMLPage object of the page currently being processed. Image handling and
 * endPage() are implemented out of line.
 */
class XMLOutputDev : public OutputDev {
public:
XMLOutputDev(PDFDoc *doc);
virtual ~XMLOutputDev();
//---- get info about output device
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gTrue; }
// Does this device use drawChar() or drawString()?
virtual GBool useDrawChar() { return gTrue; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
virtual GBool needNonText() { return gTrue; }
//----- initialization and control
// Remember the catalog that is passed in and always allow the page slice
// to be rendered; all other arguments are ignored.
virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI,
int rotate, GBool useMediaBox, GBool crop,
int sliceX, int sliceY, int sliceW, int sliceH,
GBool printing, Catalog * catalogA,
GBool (* abortCheckCbk)(void *data) = NULL,
void * abortCheckCbkData = NULL)
{
this->catalog = catalogA;
return gTrue;
}
// Start a page.
// A new XMLPage is allocated for every page.
// NOTE(review): presumably endPage() deletes it - confirm in the .cpp file.
virtual void startPage(int page_num, GfxState *state) {
this->current_page = new XMLPage(page_num, state, this->output, this->fonts);
}
// End a page.
virtual void endPage();
//----- update text state
virtual void updateFont(GfxState *state) {current_page->update_font(state);}
//----- text drawing
virtual void beginString(GfxState *state, GooString *s) {
this->current_page->begin_string(state, s);
}
virtual void endString(GfxState *state) {
this->current_page->end_string();
}
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen) {
this->current_page->draw_char(state, x, y, dx, dy, originX,
originY, code, nBytes, u, uLen);
}
virtual void drawImageMask(GfxState *state, Object *ref,
Stream *str,
int width, int height, GBool invert,
GBool interpolate, GBool inlineImg);
virtual void drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
GBool interpolate, int *maskColors, GBool inlineImg);
//new feature
// Identifier of this device type; the constant 1234 is returned as is.
virtual int DevType() {return 1234;}
private:
XMLPage *current_page; // page currently being drawn, set by startPage()
ofstream *output; // xml file
Fonts *fonts;
Catalog *catalog; // set by checkPageSlice()
XMLImages *images;
PDFDoc *doc;
void process_link(Link* link);
};
}
#endif

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from copy import deepcopy
from lxml import etree
class Font(object):
    '''
    A font description read from a ``fontspec`` element: its id, its size
    (converted to a float), its color and its family.
    '''

    def __init__(self, spec):
        attr = spec.get
        self.id = attr('id')
        self.size = float(attr('size'))
        self.color = attr('color')
        self.family = attr('family')
class Text(object):
    '''
    One ``text`` element of the intermediate XML: its geometry, its font
    properties and a ``span`` element holding a copy of its content.
    '''

    # Pre-compiled query for all descendant anchors that carry an href
    A = etree.XPath('descendant::a[@href]')

    def __init__(self, text, font_map, classes, opts, log):
        self.opts = opts
        self.log = log
        self.font_map = font_map

        geometry = [float(text.get(name))
                    for name in ('top', 'left', 'width', 'height')]
        self.top, self.left, self.width, self.height = geometry

        font = self.font_map[text.get('font')]
        self.font = font
        self.font_size = font.size
        self.color = font.color
        self.font_family = font.family

        # Links into the generated index file become in-document page anchors
        for anchor in self.A(text):
            target = anchor.get('href')
            if not target.startswith('index.'):
                continue
            target = target.split('#')[-1]
            anchor.set('href', '#page'+target)

        span = etree.Element('span')
        self.text = span

        # Identical font size/color combinations share one CSS class, named
        # after the position of the combination in the shared classes list
        css = {'font_size':'%.1fpt'%self.font_size, 'color': self.color}
        try:
            idx = classes.index(css)
        except ValueError:
            classes.append(css)
            idx = len(classes) - 1
        span.set('class', 't%d'%idx)

        if text.text:
            span.text = text.text
        for child in text:
            span.append(deepcopy(child))
class Page(object):
    '''
    One ``page`` element of the intermediate XML: its number, its geometry
    and a :class:`Text` object for every descendant ``text`` element.
    '''

    def __init__(self, page, font_map, classes, opts, log):
        self.opts = opts
        self.log = log
        self.font_map = font_map
        self.number = int(page.get('number'))

        geometry = [float(page.get(name))
                    for name in ('top', 'left', 'width', 'height')]
        self.top, self.left, self.width, self.height = geometry

        self.id = 'page%d'%self.number
        self.texts = [
            Text(node, self.font_map, classes, self.opts, self.log)
            for node in page.xpath('descendant::text')
        ]
class PDFDocument(object):
    '''
    The parsed intermediate XML: all fonts (also indexed by id), all pages
    (also indexed by page id) and the list of CSS classes shared by the
    pages' text objects.
    '''

    def __init__(self, xml, opts, log):
        self.opts = opts
        self.log = log

        # recover=True lets lxml carry on past malformed markup
        self.root = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        self.fonts = []
        self.font_map = {}
        for spec in self.root.xpath('//fontspec'):
            font = Font(spec)
            self.fonts.append(font)
            self.font_map[font.id] = font

        self.pages = []
        self.page_map = {}
        self.classes = []
        for node in self.root.xpath('//page'):
            parsed = Page(node, self.font_map, self.classes, opts, log)
            self.page_map[parsed.id] = parsed
            self.pages.append(parsed)
def run(opts, pathtopdf, log):
    '''
    Run the pdfreflow plugin on the PDF file at `pathtopdf` and read back the
    ``index.xml`` file it is expected to leave in the current directory.

    :raises RuntimeError: if the pdfreflow plugin could not be loaded
    '''
    from calibre.constants import plugins
    pdfreflow, err = plugins['pdfreflow']
    if pdfreflow is None:
        raise RuntimeError('Failed to load PDF Reflow plugin: '+err)
    # Use context managers so that the file handles are closed promptly
    with open(pathtopdf, 'rb') as f:
        data = f.read()
    pdfreflow.reflow(data)
    index = os.path.join(os.getcwdu(), 'index.xml')
    with open(index, 'rb') as f:
        xml = f.read()
    # Building the document model from xml is not enabled yet:
    #pdfdoc = PDFDocument(xml, opts, log)
def option_parser():
    '''
    Build the command line parser. The only option is ``-v``/``--verbose``,
    which counts how often it is given and defaults to 0.
    '''
    import optparse
    parser = optparse.OptionParser()
    parser.add_option('-v', '--verbose', action='count', default=0)
    return parser
def main(args=sys.argv):
    '''
    Command line entry point. `args` includes the program name, so the PDF
    to process is ``args[1]``. Returns 0 on success and 1 (after printing
    the help text) when no input file was given.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    from calibre.utils.logging import default_log
    if len(args) >= 2:
        run(opts, args[1], default_log)
        return 0
    parser.print_help()
    default_log('No input PDF file specified', file=sys.stderr)
    return 1

View File

@ -0,0 +1,48 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef _CALIBRE_REFLOW_UTILS
#define _CALIBRE_REFLOW_UTILS
#include <string>
#include <sstream>
using namespace std;
namespace calibre_reflow {
/**
 * Exception type used throughout the reflow code.
 *
 * The message is copied into the exception object. Callers pass values such
 * as strerror(errno), whose storage may be overwritten by later calls, so
 * keeping only the pointer could leave what() returning stale or dangling
 * data. A NULL message is treated as an empty one.
 */
class ReflowException : public std::exception {
    std::string msg;
    public:
        ReflowException(const char *m) : msg(m ? m : "") {}
        // Needed because std::string's destructor carries no throw()
        // specification while std::exception's destructor does.
        virtual ~ReflowException() throw() {}
        virtual const char* what() const throw() { return msg.c_str(); }
};
/**
 * Return a copy of sSrc in which the characters &, <, > and " are replaced
 * by the corresponding XML entities. All other bytes are copied unchanged.
 */
inline std::string encode_for_xml(const std::string &sSrc )
{
    std::ostringstream escaped;
    for (std::string::size_type pos = 0; pos < sSrc.size(); ++pos) {
        const unsigned char ch = static_cast<unsigned char>(sSrc[pos]);
        if (ch == '&') {
            escaped << "&amp;";
        } else if (ch == '<') {
            escaped << "&lt;";
        } else if (ch == '>') {
            escaped << "&gt;";
        } else if (ch == '"') {
            escaped << "&quot;";
        } else {
            escaped << ch;
        }
    }
    return escaped.str();
}
}
#endif

View File

@ -40,12 +40,12 @@
<string>...</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<iconset resource="../../../work/calibre/resources/images.qrc">
<normaloff>:/images/document_open.svg</normaloff>:/images/document_open.svg</iconset>
</property>
</widget>
</item>
<item row="2" column="0">
<item row="3" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
@ -64,15 +64,25 @@
<string>...</string>
</property>
<property name="icon">
<iconset resource="../../../../resources/images.qrc">
<iconset resource="../../../work/calibre/resources/images.qrc">
<normaloff>:/images/clear_left.svg</normaloff>:/images/clear_left.svg</iconset>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>The debug process outputs the intermediate HTML generated at various stages of the conversion process. This HTML can sometimes serve as a good starting point for hand editing a conversion.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
<resources>
<include location="../../../../resources/images.qrc"/>
<include location="../../../work/calibre/resources/images.qrc"/>
</resources>
<connections/>
</ui>

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors
poppler, poppler_err = plugins['calibre_poppler']
class NotAvailable(Exception):
    '''Raised when the poppler plugin could not be loaded.'''
def get_metadata(stream, cover=True):
    '''
    Read the metadata of the PDF file in `stream` using the poppler plugin.

    The title falls back to the file name of the stream (when it has one)
    or to 'Unknown'; the authors fall back to ['Unknown'].

    :param cover: If True, try to render the first page as the cover. This
                  is best effort: a rendering failure is printed and the
                  metadata is returned without a cover.
    :raises NotAvailable: if the poppler plugin could not be loaded
    :return: A MetaInformation object
    '''
    if not poppler:
        raise NotAvailable('Failed to load poppler with error: '+poppler_err)
    raw = stream.read()
    doc = poppler.PDFDoc()
    doc.load(raw)
    del raw

    title = doc.title
    if not title or not title.strip():
        title = _('Unknown')
        if hasattr(stream, 'name'):
            title = os.path.splitext(os.path.basename(stream.name))[0]
    author = doc.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = doc.creator

    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator
    if doc.subject:
        mi.category = doc.subject
    if doc.keywords:
        # Skip empty fragments such as those produced by 'a,,b' or 'a, '
        tags = [x.strip() for x in doc.keywords.split(',')]
        mi.tags = [x for x in tags if x]

    if cover:
        from calibre.gui2 import is_ok_to_use_qt
        cdata = None
        if is_ok_to_use_qt():
            try:
                cdata = doc.render_page(0)
            except Exception:
                # Only swallow real errors, not KeyboardInterrupt/SystemExit
                import traceback
                traceback.print_exc()
        if cdata is not None:
            mi.cover_data = ('jpg', cdata)
    del doc
    return mi

View File

@ -1,329 +0,0 @@
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <poppler-qt4.h>
#include <QtCore/QBuffer>
#include <QtGui/QImage>
/* Instance layout of the calibre_poppler.PDFDoc Python type. */
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
/* The wrapped document; NULL until load() or open() has been called. */
Poppler::Document *doc;
} poppler_PDFDoc;
extern "C" {
/* tp_dealloc: destroy the wrapped document, then free the Python object. */
static void
poppler_PDFDoc_dealloc(poppler_PDFDoc* self)
{
    /* delete on a null pointer is a no-op, so no explicit guard is needed */
    delete self->doc;
    self->ob_type->tp_free((PyObject*)self);
}
/* tp_new: allocate a PDFDoc object that does not wrap a document yet. */
static PyObject *
poppler_PDFDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    poppler_PDFDoc *obj = (poppler_PDFDoc *)type->tp_alloc(type, 0);
    if (obj == NULL) return NULL;
    obj->doc = NULL;
    return (PyObject *)obj;
}
/*
 * load(data): load a PDF document from a byte string.
 *
 * The bytes are copied into the QByteArray. QByteArray::fromRawData() only
 * aliases the caller's memory, which belongs to a Python string that may be
 * freed as soon as this call returns, so it must not be used here.
 * A document loaded earlier is destroyed first.
 *
 * Raises ValueError if the data cannot be parsed as a PDF and OverflowError
 * if it is too large for a QByteArray.
 */
static PyObject *
poppler_PDFDoc_load(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    char *buffer; Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s#", &buffer, &size)) return NULL;

    /* QByteArray sizes are ints; reject data that does not fit */
    int isize = (int)size;
    if ((Py_ssize_t)isize != size) {
        PyErr_SetString(PyExc_OverflowError, "PDF data is too large.");
        return NULL;
    }

    QByteArray data(buffer, isize);  /* deep copy */
    delete self->doc;
    self->doc = Poppler::Document::loadFromData(data);
    if (self->doc == NULL) {PyErr_SetString(PyExc_ValueError, "Could not load PDF file from data."); return NULL;}
    Py_RETURN_NONE;
}
}
// Convert a Python unicode object to a QString by encoding it as UTF-8
// (unencodable characters are replaced) and decoding that in Qt.
// NOTE(review): py is not checked to be a unicode object before the
// PyUnicode_* macros are applied to it; callers must guarantee that.
static QString
poppler_convert_pystring(PyObject *py) {
QString ans;
Py_UNICODE* u = PyUnicode_AS_UNICODE(py);
PyObject *u8 = PyUnicode_EncodeUTF8(u, PyUnicode_GET_SIZE(py), "replace");
// On failure a MemoryError is set; the NULL is converted to the QString
// return type, so callers have to look at the Python error indicator.
if (u8 == NULL) { PyErr_NoMemory(); return NULL; }
ans = QString::fromUtf8(PyString_AS_STRING(u8));
Py_DECREF(u8);
return ans;
}
extern "C" {
/*
 * Convert a QString to a Python unicode object (new reference), or return
 * NULL with an exception set on failure.
 *
 * "strict" is used as the error handler: "error" is not a registered codec
 * error handler name and would itself raise a LookupError if a decoding
 * problem ever occurred.
 */
static PyObject *
poppler_convert_qstring(const QString &src) {
    QByteArray data = src.toUtf8();
    return PyUnicode_Decode(data.constData(), data.size(), "utf-8", "strict");
}
/*
 * open(path): load a PDF document from a file path given as a unicode
 * string. A document loaded earlier is destroyed first.
 *
 * Raises TypeError if path is not a unicode object and ValueError if the
 * file cannot be loaded.
 */
static PyObject *
poppler_PDFDoc_open(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    PyObject *fname; QString _fname;

    if (!PyArg_ParseTuple(args, "O", &fname)) return NULL;
    /* poppler_convert_pystring() applies the PyUnicode_* macros unchecked */
    if (!PyUnicode_Check(fname)) {
        PyErr_SetString(PyExc_TypeError, "The path must be a unicode string.");
        return NULL;
    }
    _fname = poppler_convert_pystring(fname);
    if (PyErr_Occurred()) return NULL;

    delete self->doc;
    self->doc = Poppler::Document::load(_fname);
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "Could not load PDF file from path.");
        return NULL;
    }
    Py_RETURN_NONE;
}
/*
 * Shared implementation of the string valued attribute getters.
 *
 * field selects the Info entry: 0 Title, 1 Author, 2 Subject, 3 Keywords,
 * 4 Creator, 5 Producer. Returns a new reference to a unicode object, or
 * NULL with an exception set for an unknown field, when no document has
 * been loaded, or when the conversion fails.
 */
static PyObject *
poppler_PDFDoc_getter(poppler_PDFDoc *self, int field)
{
    const char *s;
    switch (field) {
        case 0:
            s = "Title"; break;
        case 1:
            s = "Author"; break;
        case 2:
            s = "Subject"; break;
        case 3:
            s = "Keywords"; break;
        case 4:
            s = "Creator"; break;
        case 5:
            s = "Producer"; break;
        default:
            PyErr_SetString(PyExc_Exception, "Bad field");
            return NULL;
    }
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    /* poppler_convert_qstring() already returns a new reference, which is
     * exactly what a getter has to return; an additional Py_INCREF would
     * leak the object. */
    return poppler_convert_qstring(self->doc->info(QString(s)));
}
/*
 * Shared implementation of the attribute setters. Setting metadata is not
 * implemented, so this always fails. An exception is set before returning
 * -1, because an error return without an exception makes the interpreter
 * raise a SystemError instead.
 */
static int
poppler_PDFDoc_setter(poppler_PDFDoc *self, PyObject *val, int field) {
    PyErr_SetString(PyExc_NotImplementedError,
            "Setting PDF metadata is not supported.");
    return -1;
}
/*
 * Attribute getters for the PDFDoc type. Each one forwards to
 * poppler_PDFDoc_getter() with the index of its field
 * (0 title, 1 author, 2 subject, 3 keywords, 4 creator, 5 producer);
 * the closure argument is unused.
 */
static PyObject *
poppler_PDFDoc_title_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 0);
}
static PyObject *
poppler_PDFDoc_author_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 1);
}
static PyObject *
poppler_PDFDoc_subject_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 2);
}
static PyObject *
poppler_PDFDoc_keywords_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 3);
}
static PyObject *
poppler_PDFDoc_creator_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 4);
}
static PyObject *
poppler_PDFDoc_producer_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 5);
}
/*
 * Getter for the read only "version" attribute: the PDF version as a float
 * (new reference). Raises ValueError when no document has been loaded.
 */
static PyObject *
poppler_PDFDoc_version_getter(poppler_PDFDoc *self, void *closure) {
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    /* PyFloat_FromDouble() returns a new reference; no Py_INCREF needed */
    return PyFloat_FromDouble(self->doc->pdfVersion());
}
/*
 * Attribute setters for the PDFDoc type. Each one forwards to
 * poppler_PDFDoc_setter() with the index of its field
 * (0 title, 1 author, 2 subject, 3 keywords, 4 creator, 5 producer);
 * the closure argument is unused.
 */
static int
poppler_PDFDoc_title_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 0);
}
static int
poppler_PDFDoc_author_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 1);
}
static int
poppler_PDFDoc_subject_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 2);
}
static int
poppler_PDFDoc_keywords_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 3);
}
static int
poppler_PDFDoc_creator_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 4);
}
static int
poppler_PDFDoc_producer_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 5);
}
}
/*
 * render_page(page_num, xdpi=166, ydpi=166): render a page to a JPEG image
 * and return the image data as a byte string (new reference). Page numbers
 * start from zero.
 *
 * Raises ValueError if no document is loaded, the document is locked or the
 * page number is out of range, and Exception if rendering or encoding fails.
 */
static PyObject *
poppler_PDFDoc_render_page(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    QImage img;
    float xdpi = 166.0, ydpi = 166.0;
    QByteArray ba;
    QBuffer buffer(&ba);
    int num;

    if (!PyArg_ParseTuple(args, "i|ff", &num, &xdpi, &ydpi)) return NULL;

    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    if ( self->doc->isLocked()) {
        PyErr_SetString(PyExc_ValueError, "This document is copyrighted.");
        return NULL;
    }
    if ( num < 0 || num >= self->doc->numPages()) {
        PyErr_SetString(PyExc_ValueError, "Invalid page number");
        return NULL;
    }

    /* The caller owns the page object returned by Document::page() */
    Poppler::Page *page = self->doc->page(num);
    if (page == NULL) {
        PyErr_SetString(PyExc_Exception, "Failed to load page of PDF");
        return NULL;
    }
    img = page->renderToImage(xdpi, ydpi);
    delete page;

    if (img.isNull()) {
        PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF");
        return NULL;
    }
    buffer.open(QIODevice::WriteOnly);
    if (!img.save(&buffer, "JPEG")) {
        PyErr_SetString(PyExc_Exception, "Failed to save rendered page");
        return NULL;
    }
    /* PyString_FromStringAndSize() returns a new reference; an additional
     * Py_INCREF would leak the image data. */
    return PyString_FromStringAndSize(ba.data(), ba.size());
}
/* Method table of the PDFDoc type. All methods are registered as
 * METH_VARARGS, i.e. they are called with positional arguments only. */
static PyMethodDef poppler_PDFDoc_methods[] = {
{"load", (PyCFunction)poppler_PDFDoc_load, METH_VARARGS,
"Load a PDF document from a byte buffer (string)"
},
{"open", (PyCFunction)poppler_PDFDoc_open, METH_VARARGS,
"Load a PDF document from a file path (string)"
},
{"render_page", (PyCFunction)poppler_PDFDoc_render_page, METH_VARARGS,
"render_page(page_num, xdpi=166, ydpi=166) -> Render a page to a JPEG image. Page numbers start from zero."
},
{NULL} /* Sentinel */
};
/*
 * Getter for the read only "pages" attribute: the number of pages in the
 * document (new reference). Raises ValueError when no document has been
 * loaded.
 */
static PyObject *
poppler_PDFDoc_pages_getter(poppler_PDFDoc *self, void *closure) {
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    /* PyInt_FromLong() returns a new reference; no Py_INCREF needed */
    return PyInt_FromLong(static_cast<long>(self->doc->numPages()));
}
/* Attribute table of the PDFDoc type. Each entry is
 * {name, getter, setter, doc string, closure}; "pages" and "version" have
 * no setter and are therefore read only. */
static PyGetSetDef poppler_PDFDoc_getsetters[] = {
{(char *)"title",
(getter)poppler_PDFDoc_title_getter, (setter)poppler_PDFDoc_title_setter,
(char *)"Document title",
NULL},
{(char *)"author",
(getter)poppler_PDFDoc_author_getter, (setter)poppler_PDFDoc_author_setter,
(char *)"Document author",
NULL},
{(char *)"subject",
(getter)poppler_PDFDoc_subject_getter, (setter)poppler_PDFDoc_subject_setter,
(char *)"Document subject",
NULL},
{(char *)"keywords",
(getter)poppler_PDFDoc_keywords_getter, (setter)poppler_PDFDoc_keywords_setter,
(char *)"Document keywords",
NULL},
{(char *)"creator",
(getter)poppler_PDFDoc_creator_getter, (setter)poppler_PDFDoc_creator_setter,
(char *)"Document creator",
NULL},
{(char *)"producer",
(getter)poppler_PDFDoc_producer_getter, (setter)poppler_PDFDoc_producer_setter,
(char *)"Document producer",
NULL},
{(char *)"pages",
(getter)poppler_PDFDoc_pages_getter, NULL,
(char *)"Number of pages in document (read only)",
NULL},
{(char *)"version",
(getter)poppler_PDFDoc_version_getter, NULL,
(char *)"The PDF version (read only)",
NULL},
{NULL} /* Sentinel */
};
/* Type object of calibre_poppler.PDFDoc. Only the deallocator, the method
 * table, the attribute table and tp_new are provided; every other slot is
 * left at 0. */
static PyTypeObject poppler_PDFDocType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"calibre_poppler.PDFDoc", /*tp_name*/
sizeof(poppler_PDFDoc), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)poppler_PDFDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
"PDF Documents", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
poppler_PDFDoc_methods, /* tp_methods */
0, /* tp_members */
poppler_PDFDoc_getsetters, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
poppler_PDFDoc_new, /* tp_new */
};
/* Module level functions: none, the module only exports the PDFDoc type. */
static PyMethodDef poppler_methods[] = {
{NULL} /* Sentinel */
};
extern "C" {
/*
 * Module initialisation: finalise the PDFDoc type, create the
 * calibre_poppler module and export the type as calibre_poppler.PDFDoc.
 * On failure the function returns early with the Python error set.
 */
PyMODINIT_FUNC
initcalibre_poppler(void)
{
    PyObject* m;

    if (PyType_Ready(&poppler_PDFDocType) < 0)
        return;

    m = Py_InitModule3("calibre_poppler", poppler_methods,
            "Wrapper for the Poppler PDF library");
    /* Py_InitModule3() returns NULL on failure */
    if (m == NULL)
        return;

    /* PyModule_AddObject() steals a reference to the type object */
    Py_INCREF(&poppler_PDFDocType);
    PyModule_AddObject(m, "PDFDoc", (PyObject *)&poppler_PDFDocType);
}
}

View File

@ -57,7 +57,8 @@ recipe_modules = ['recipe_' + r for r in (
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga',
'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem',
'the_new_republic', 'philly', 'salon', 'tweakers',
'the_new_republic', 'philly', 'salon', 'tweakers', 'smashing',
'thestar',
)]

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.smashingmagazine.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SmashingMagazine(BasicNewsRecipe):
    # Recipe for www.smashingmagazine.com
    title                 = 'Smashing Magazine'
    __author__            = 'Darko Miletic'
    description           = 'We smash you with the information that will make your life easier, really'
    oldest_article        = 20
    language              = 'en'
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    publisher             = 'Smashing Magazine'
    category              = 'news, web, IT, css, javascript, html'
    encoding              = 'utf-8'

    conversion_options = {
        'comments'  : description,
        'tags'      : category,
        'publisher' : publisher,
    }

    keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})]
    remove_tags_after = dict(name='ul',attrs={'class':'social'})
    remove_tags = [
        dict(name=['link','object']),
        dict(name='h1',attrs={'class':'logo'}),
        dict(name='div',attrs={'id':'booklogosec'}),
        dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}),
    ]

    feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')]

    def preprocess_html(self, soup):
        '''
        Remove every ``leftframe`` div that contains no ``h1`` and turn
        links that directly wrap an image into plain divs.
        '''
        for frame in soup.findAll('div',attrs={'class':'leftframe'}):
            # find() returns None when there is no match; test identity
            # rather than relying on the tag's equality operator
            if frame.find('h1') is None:
                frame.extract()
        for item in soup.findAll('img'):
            parent = item.parent
            if parent.name == 'a':
                parent.name = 'div'
        return soup

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thestar.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheTorontoStar(BasicNewsRecipe):
    # Recipe for www.thestar.com
    title = 'The Toronto Star'
    __author__ = 'Darko Miletic'
    description = "Canada's largest daily newspaper"
    oldest_article = 2
    language = 'en_CA'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'The Toronto Star'
    category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson"
    encoding = 'utf-8'
    extra_css = ' .headlineArticle{font-size: x-large; font-weight: bold} .navbar{text-align:center} '

    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
    }

    # Only the article body is kept; inline styles are dropped
    keep_only_tags = [dict(name='div', attrs={'id':'AssetWebPart1'})]
    remove_attributes= ['style']

    # NOTE(review): 'News', 'Entertainment' and 'Living' all query
    # categories=296 - this looks like a copy and paste slip; confirm the
    # category ids against the site.
    feeds = [
        (u'News', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Opinions', u'http://www.thestar.com/rss/0?searchMode=Query&categories=311'),
        (u'Business', u'http://www.thestar.com/rss/0?searchMode=Query&categories=294'),
        (u'Sports', u'http://www.thestar.com/rss/0?searchMode=Query&categories=295'),
        (u'Entertainment', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Living', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Travel', u'http://www.thestar.com/rss/82858?searchMode=Lineup'),
        (u'Science', u'http://www.thestar.com/rss/82848?searchMode=Query&categories=300'),
    ]

    def print_version(self, url):
        '''Return the URL of the printer friendly version of an article.'''
        printable = url.replace('/article/','/printArticle/')
        return printable

View File

@ -13,18 +13,17 @@ class ZeitDe(BasicNewsRecipe):
title = 'Die Zeit Nachrichten'
description = 'Die Zeit - Online Nachrichten'
language = 'de'
lang = 'de_DE'
__author__ = 'Kovid Goyal and Martin Pitt'
__author__ = 'Martin Pitt and Suajta Raman'
use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 40
remove_empty_feeds = True
no_stylesheets = True
encoding = 'utf8'
encoding = 'utf-8'
remove_tags = [{'class': 'adwrap'}]
keep_only_tags = [{'name': 'div', 'class': 'content'}]
feeds = [ ('Kurznachrichten', 'http://newsfeed.zeit.de/index'),
feeds = [
('Politik', 'http://newsfeed.zeit.de/politik/index'),
('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'),
('Meinung', 'http://newsfeed.zeit.de/meinung/index'),
@ -33,6 +32,43 @@ class ZeitDe(BasicNewsRecipe):
('Wissen', 'http://newsfeed.zeit.de/wissen/index'),
]
def print_version(self,url):
    '''Map an article URL on www.zeit.de to its mobil.zeit.de counterpart.'''
    desktop_prefix = 'http://www.zeit.de/'
    mobile_prefix = 'http://mobil.zeit.de/'
    return url.replace(desktop_prefix, mobile_prefix)
extra_css = '''
.supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:large;}
.title{font-family:Arial,Helvetica,sans-serif;font-size:large}
.caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
.headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
'''
filter_regexps = [r'ad.de.doubleclick.net/']
keep_only_tags = [
dict(name='div', attrs={'class':["article"]}) ,
]
remove_tags = [
dict(name='link'), dict(name='iframe'),dict(name='style'),
dict(name='div', attrs={'class':["pagination block","pagenav","inline link"] }),
dict(name='div', attrs={'id':["place_5","place_4"]})
]
def get_article_url(self, article):
    '''
    Return the URL of `article` (its ``guid``), or None if the article has
    no guid or points to a video or a quiz, which cannot be converted.
    '''
    url = article.get('guid', None)
    # A missing guid must not reach the substring tests below: `in` on
    # None raises a TypeError
    if url is None or 'video' in url or 'quiz' in url:
        return None
    return url
def preprocess_html(self, soup):
    '''
    Declare the recipe's language on the ``html`` element and put a
    Content-Type meta tag announcing the recipe's encoding at the very
    start of ``head``.
    '''
    for attr in ('xml:lang', 'lang'):
        soup.html[attr] = self.lang
    meta = ''.join((
        '<meta http-equiv="Content-Type" content="text/html; charset=',
        self.encoding,
        '">',
    ))
    soup.head.insert(0, meta)
    return soup
#def print_version(self,url):
# return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '')