Sync to trunk.

This commit is contained in:
John Schember 2009-09-22 17:14:13 -04:00
commit a6886b0acd
32 changed files with 2725 additions and 522 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 843 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 629 B

View File

@ -6,7 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import os, socket, struct import os, socket, struct, subprocess
from distutils.spawn import find_executable from distutils.spawn import find_executable
from PyQt4 import pyqtconfig from PyQt4 import pyqtconfig
@ -42,6 +42,39 @@ elif find_executable('qmake'):
QMAKE = find_executable('qmake') QMAKE = find_executable('qmake')
QMAKE = os.environ.get('QMAKE', QMAKE) QMAKE = os.environ.get('QMAKE', QMAKE)
# Path of the pkg-config executable. The PKG_CONFIG environment variable
# takes precedence; otherwise use whatever find_executable() locates (which
# may be None when pkg-config is not installed).
PKGCONFIG = os.environ.get('PKG_CONFIG', find_executable('pkg-config'))
def run_pkgconfig(name, envvar, default, flag, prefix):
    '''
    Return a list of build settings (directories or library names) for the
    package `name`.

    :param name: pkg-config package name, e.g. ``'poppler'``.
    :param envvar: Name of an environment variable that overrides
        pkg-config. Its value (or `default` when unset) is split on
        ``os.pathsep``. Pass an empty string to skip this step.
    :param default: Value used when `envvar` is not set in the environment.
    :param flag: pkg-config option to query, e.g. ``'--libs-only-L'``.
    :param prefix: Prefix separating entries in pkg-config output
        (``'-I'``, ``'-L'`` or ``'-l'``).
    :return: List of strings. Empty if neither source produced anything.
    '''
    def usable(items):
        # Library names (-l) are accepted as-is; anything else is a
        # directory and must exist on disk to be of any use.
        return [x for x in items if x and (prefix == '-l' or os.path.exists(x))]

    ans = []
    if envvar:
        ans = os.environ.get(envvar, default)
        ans = usable(x.strip() for x in ans.split(os.pathsep))
    if not ans:
        try:
            proc = subprocess.Popen([PKGCONFIG, flag, name],
                    stdout=subprocess.PIPE)
            # communicate() drains the pipe *and* reaps the child process
            raw = proc.communicate()[0]
            if not isinstance(raw, str):
                # bytes on Python 3; a no-op on Python 2 where raw is str
                raw = raw.decode('utf-8', 'replace')
            ans = usable(x.strip() for x in raw.split(prefix))
        except Exception:
            # Best effort: a missing or broken pkg-config must not abort the
            # build setup, but Ctrl-C / SystemExit are no longer swallowed.
            print('Failed to run pkg-config: %s for: %s' % (PKGCONFIG, name))
    return ans
def pkgconfig_include_dirs(name, envvar, default):
    '''
    Include directories for package `name`, taken from the `envvar`
    environment variable (falling back to `default`) or, failing that, from
    ``pkg-config --cflags-only-I``.
    '''
    flag, prefix = '--cflags-only-I', '-I'
    return run_pkgconfig(name, envvar, default, flag, prefix)
def pkgconfig_lib_dirs(name, envvar, default):
    '''
    Library search directories for package `name`, taken from the `envvar`
    environment variable (falling back to `default`) or, failing that, from
    ``pkg-config --libs-only-L``.
    '''
    flag, prefix = '--libs-only-L', '-L'
    return run_pkgconfig(name, envvar, default, flag, prefix)
def pkgconfig_libs(name, envvar, default):
    '''
    Library names to link against for package `name`, taken from the
    `envvar` environment variable (falling back to `default`) or, failing
    that, from ``pkg-config --libs-only-l``.
    '''
    flag, prefix = '--libs-only-l', '-l'
    return run_pkgconfig(name, envvar, default, flag, prefix)
def consolidate(envvar, default):
    '''
    Return the existing paths listed in the environment variable `envvar`.

    The value of `envvar` (or `default` when it is unset) is split on
    ``os.pathsep``; entries are stripped of surrounding whitespace and
    dropped if they are empty or do not exist on disk.

    :return: List of existing paths, in the order given.
    '''
    val = os.environ.get(envvar, default)
    # os.pathsep is a string constant, not a function
    candidates = (x.strip() for x in val.split(os.pathsep))
    return [x for x in candidates if x and os.path.exists(x)]
pyqt = pyqtconfig.Configuration() pyqt = pyqtconfig.Configuration()
@ -50,28 +83,62 @@ qt_lib = pyqt.qt_lib_dir
fc_inc = '/usr/include/fontconfig' fc_inc = '/usr/include/fontconfig'
fc_lib = '/usr/lib' fc_lib = '/usr/lib'
poppler_inc = '/usr/include/poppler/qt4'
poppler_lib = '/usr/lib'
poppler_libs = []
podofo_inc = '/usr/include/podofo' podofo_inc = '/usr/include/podofo'
podofo_lib = '/usr/lib' podofo_lib = '/usr/lib'
if iswindows: if iswindows:
fc_inc = r'C:\cygwin\home\kovid\fontconfig\include\fontconfig' fc_inc = r'C:\cygwin\home\kovid\fontconfig\include\fontconfig'
fc_lib = r'C:\cygwin\home\kovid\fontconfig\lib' fc_lib = r'C:\cygwin\home\kovid\fontconfig\lib'
poppler_inc = r'C:\cygwin\home\kovid\poppler\include\poppler\qt4' poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
poppler_lib = r'C:\cygwin\home\kovid\poppler\lib' r'C:\cygwin\home\kovid\poppler\include\poppler')
poppler_libs = ['QtCore4', 'QtGui4'] popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+r'\qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
r'C:\cygwin\home\kovid\poppler\lib')
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = ['poppler']
popplerqt4_libs = poppler_libs + ['QtCore4', 'QtGui4']
podofo_inc = 'C:\\podofo\\include\\podofo' podofo_inc = 'C:\\podofo\\include\\podofo'
podofo_lib = r'C:\podofo' podofo_lib = r'C:\podofo'
elif isosx:
if isosx:
fc_inc = '/Users/kovid/fontconfig/include/fontconfig' fc_inc = '/Users/kovid/fontconfig/include/fontconfig'
fc_lib = '/Users/kovid/fontconfig/lib' fc_lib = '/Users/kovid/fontconfig/lib'
poppler_inc = '/Volumes/sw/build/poppler-0.10.7/qt4/src' poppler_inc_dirs = consolidate('POPPLER_INC_DIR',
poppler_lib = '/Users/kovid/poppler/lib' '/Volumes/sw/build/poppler-0.10.7/poppler')
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
poppler_lib_dirs = consolidate('POPPLER_LIB_DIR',
'/Users/kovid/poppler/lib')
popplerqt4_lib_dirs = poppler_lib_dirs
poppler_libs = popplerqt4_libs = ['poppler']
podofo_inc = '/usr/local/include/podofo' podofo_inc = '/usr/local/include/podofo'
podofo_lib = '/usr/local/lib' podofo_lib = '/usr/local/lib'
else:
# Include directories
poppler_inc_dirs = pkgconfig_include_dirs('poppler',
'POPPLER_INC_DIR', '/usr/include/poppler')
popplerqt4_inc_dirs = pkgconfig_include_dirs('poppler-qt4', '', '')
if not popplerqt4_inc_dirs:
popplerqt4_inc_dirs = poppler_inc_dirs + [poppler_inc_dirs[0]+'/qt4']
png_inc_dirs = pkgconfig_include_dirs('libpng', 'PNG_INC_DIR',
'/usr/include')
magick_inc_dirs = pkgconfig_include_dirs('MagickWand', 'MAGICK_INC', '/usr/include/ImageMagick')
# Library directories
poppler_lib_dirs = popplerqt4_lib_dirs = pkgconfig_lib_dirs('poppler', 'POPPLER_LIB_DIR',
'/usr/lib')
png_lib_dirs = pkgconfig_lib_dirs('libpng', 'PNG_LIB_DIR', '/usr/lib')
magick_lib_dirs = pkgconfig_lib_dirs('MagickWand', 'MAGICK_LIB', '/usr/lib')
# Libraries
poppler_libs = pkgconfig_libs('poppler', '', '')
if not poppler_libs:
poppler_libs = ['poppler']
popplerqt4_libs = pkgconfig_libs('poppler-qt4', '', '')
if not popplerqt4_libs:
popplerqt4_libs = ['poppler-qt4', 'poppler']
magick_libs = pkgconfig_libs('MagickWand', '', '')
if not magick_libs:
magick_libs = ['MagickWand', 'MagickCore']
png_libs = ['png']
fc_inc = os.environ.get('FC_INC_DIR', fc_inc) fc_inc = os.environ.get('FC_INC_DIR', fc_inc)
@ -82,14 +149,27 @@ fc_error = None if os.path.exists(os.path.join(fc_inc, 'fontconfig.h')) else \
'variables.') 'variables.')
poppler_inc = os.environ.get('POPPLER_INC_DIR', poppler_inc) poppler_error = None
poppler_lib = os.environ.get('POPPLER_LIB_DIR', poppler_lib) if not poppler_inc_dirs or not os.path.exists(
poppler_error = None if os.path.exists(os.path.join(poppler_inc, os.path.join(poppler_inc_dirs[0], 'OutputDev.h')):
'poppler-qt4.h')) else \ poppler_error = \
('Poppler not found on your system. Various PDF related', ('Poppler not found on your system. Various PDF related',
' functionality will not work. Use the POPPLER_INC_DIR and', ' functionality will not work. Use the POPPLER_INC_DIR and',
' POPPLER_LIB_DIR environment variables.') ' POPPLER_LIB_DIR environment variables.')
# None when poppler-qt4.h is present in the last Qt4 binding include
# directory, otherwise a message describing the problem.
popplerqt4_error = (
    None
    if popplerqt4_inc_dirs and os.path.exists(
        os.path.join(popplerqt4_inc_dirs[-1], 'poppler-qt4.h'))
    else 'Poppler Qt4 bindings not found on your system.')
# None when the first ImageMagick include directory contains the `wand`
# headers, otherwise a message telling the user how to point the build at
# their ImageMagick installation.
magick_error = None
if not magick_inc_dirs or not os.path.exists(os.path.join(magick_inc_dirs[0],
    'wand')):
    magick_error = ('ImageMagick not found on your system. '
            'Try setting the environment variables MAGICK_INC '
            'and MAGICK_LIB to help calibre locate the include and library '
            'files.')
podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib) podofo_lib = os.environ.get('PODOFO_LIB_DIR', podofo_lib)
podofo_inc = os.environ.get('PODOFO_INC_DIR', podofo_inc) podofo_inc = os.environ.get('PODOFO_INC_DIR', podofo_inc)
@ -116,3 +196,5 @@ except:
HOST='unknown' HOST='unknown'
PROJECT=os.path.basename(os.path.abspath('.')) PROJECT=os.path.basename(os.path.abspath('.'))

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
__all__ = [ __all__ = [
'pot', 'translations', 'get_translations', 'iso639', 'pot', 'translations', 'get_translations', 'iso639',
'build', 'build', 'build_pdf2xml',
'gui', 'gui',
'develop', 'install', 'develop', 'install',
'resources', 'resources',
@ -30,8 +30,9 @@ translations = Translations()
get_translations = GetTranslations() get_translations = GetTranslations()
iso639 = ISO639() iso639 = ISO639()
from setup.extensions import Build from setup.extensions import Build, BuildPDF2XML
build = Build() build = Build()
build_pdf2xml = BuildPDF2XML()
from setup.install import Develop, Install, Sdist from setup.install import Develop, Install, Sdist
develop = Develop() develop = Develop()

View File

@ -12,10 +12,12 @@ from distutils import sysconfig
from PyQt4.pyqtconfig import QtGuiModuleMakefile from PyQt4.pyqtconfig import QtGuiModuleMakefile
from setup import Command, islinux, isosx, SRC, iswindows from setup import Command, islinux, isosx, SRC, iswindows
from setup.build_environment import fc_inc, fc_lib, qt_inc, qt_lib, \ from setup.build_environment import fc_inc, fc_lib, \
fc_error, poppler_libs, poppler_lib, poppler_inc, podofo_inc, \ fc_error, poppler_libs, poppler_lib_dirs, poppler_inc_dirs, podofo_inc, \
podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \ podofo_lib, podofo_error, poppler_error, pyqt, OSX_SDK, NMAKE, \
leopard_build, QMAKE, msvc, MT, win_inc, win_lib leopard_build, QMAKE, msvc, MT, win_inc, win_lib, png_inc_dirs, \
magick_inc_dirs, magick_lib_dirs, png_lib_dirs, png_libs, \
magick_error, magick_libs
MT MT
isunix = islinux or isosx isunix = islinux or isosx
@ -43,6 +45,10 @@ class Extension(object):
self.ldflags = kwargs.get('ldflags', []) self.ldflags = kwargs.get('ldflags', [])
self.optional = kwargs.get('optional', False) self.optional = kwargs.get('optional', False)
# C++ sources and headers of the PDF reflow extension, collected from
# SRC/calibre/ebooks/pdf. A generator expression is used so that no loop
# variable leaks into the module namespace.
reflow_sources, reflow_headers = (
    glob.glob(os.path.join(SRC, 'calibre', 'ebooks', 'pdf', pattern))
    for pattern in ('*.cpp', '*.h'))
# The extension needs both poppler and ImageMagick; report whichever
# problem is found first.
reflow_error = poppler_error or magick_error
extensions = [ extensions = [
Extension('lzx', Extension('lzx',
['calibre/utils/lzx/lzxmodule.c', ['calibre/utils/lzx/lzxmodule.c',
@ -76,15 +82,6 @@ extensions = [
Extension('cPalmdoc', Extension('cPalmdoc',
['calibre/ebooks/compression/palmdoc.c']), ['calibre/ebooks/compression/palmdoc.c']),
Extension('calibre_poppler',
['calibre/utils/poppler/poppler.cpp'],
libraries=(['poppler', 'poppler-qt4']+poppler_libs),
lib_dirs=[os.environ.get('POPPLER_LIB_DIR',
poppler_lib), qt_lib],
inc_dirs=[poppler_inc, qt_inc],
error=poppler_error,
optional=True),
Extension('podofo', Extension('podofo',
['calibre/utils/podofo/podofo.cpp'], ['calibre/utils/podofo/podofo.cpp'],
libraries=['podofo'], libraries=['podofo'],
@ -97,10 +94,20 @@ extensions = [
inc_dirs = ['calibre/gui2/pictureflow'], inc_dirs = ['calibre/gui2/pictureflow'],
headers = ['calibre/gui2/pictureflow/pictureflow.h'], headers = ['calibre/gui2/pictureflow/pictureflow.h'],
sip_files = ['calibre/gui2/pictureflow/pictureflow.sip'] sip_files = ['calibre/gui2/pictureflow/pictureflow.sip']
) ),
Extension('pdfreflow',
reflow_sources,
headers=reflow_headers,
libraries=poppler_libs+magick_libs+png_libs,
lib_dirs=poppler_lib_dirs+magick_lib_dirs+png_lib_dirs,
inc_dirs=poppler_inc_dirs+magick_inc_dirs+png_inc_dirs,
error=reflow_error,
cflags=['-DPNG_SKIP_SETJMP_CHECK'] if islinux else []
)
] ]
if iswindows: if iswindows:
extensions.append(Extension('winutil', extensions.append(Extension('winutil',
['calibre/utils/windows/winutil.c'], ['calibre/utils/windows/winutil.c'],
@ -346,10 +353,36 @@ class Build(Command):
# Command that compiles the PDF reflow sources into a stand-alone
# ``pdf2xml`` binary (built with -DPDF2XML) and places it in ~/bin.
class BuildPDF2XML(Command):

    description = 'Build command line pdf2xml utility'

    def run(self, opts):
        '''
        Compile every reflow source except the Python binding
        (``python.cpp``) to an object file and link the objects into
        ``~/bin/pdf2xml``. Both steps are skipped when ``self.newer()``
        reports nothing to do.
        '''
        dest = os.path.expanduser('~/bin/pdf2xml')
        odest = self.j(self.d(self.SRC), 'build', 'objects', 'pdf2xml')
        if not os.path.exists(odest):
            os.makedirs(odest)

        compile_prefix = ['g++', '-pthread', '-pedantic', '-g', '-c', '-Wall',
                '-I/usr/include/poppler', '-I/usr/include/ImageMagick',
                '-DPDF2XML', '-o']
        link_libs = ['-lpoppler', '-lMagickWand', '-lpng', '-lpthread']

        # The Python module glue is not part of the command line tool
        sources = [s for s in reflow_sources if not s.endswith('python.cpp')]

        objects = []
        for src in sources:
            obj = self.j(odest, self.b(src+'.o'))
            if self.newer(obj, [src]+reflow_headers):
                cmd = compile_prefix + [obj, src]
                self.info(*cmd)
                subprocess.check_call(cmd)
            # Up-to-date objects still have to take part in the link step
            objects.append(obj)

        if self.newer(dest, objects):
            cmd = ['g++', '-g', '-o', dest] + objects + link_libs
            self.info(*cmd)
            subprocess.check_call(cmd)

        self.info('Binary installed as', dest)

View File

@ -192,6 +192,10 @@ class Install(Develop):
x = self.j(dest, x) x = self.j(dest, x)
if os.path.exists(dest): if os.path.exists(dest):
shutil.rmtree(x) shutil.rmtree(x)
for x in os.walk(dest):
for f in x[-1]:
if os.path.splitext(f)[1] in ('.c', '.cpp', '.h'):
os.remove(self.j(x[0], f))
dest = self.root + self.resources dest = self.root + self.resources
if os.path.exists(dest): if os.path.exists(dest):
shutil.rmtree(dest) shutil.rmtree(dest)
@ -241,4 +245,3 @@ class Sdist(Command):
os.remove(self.DEST) os.remove(self.DEST)

View File

@ -38,6 +38,7 @@ class LinuxFreeze(Command):
binary_includes = [ binary_includes = [
'/usr/bin/pdftohtml', '/usr/bin/pdftohtml',
'/usr/lib/libwmflite-0.2.so.7', '/usr/lib/libwmflite-0.2.so.7',
'/usr/lib/liblcms.so.1',
'/tmp/calibre-mount-helper', '/tmp/calibre-mount-helper',
'/usr/lib/libunrar.so', '/usr/lib/libunrar.so',
'/usr/lib/libsqlite3.so.0', '/usr/lib/libsqlite3.so.0',

View File

@ -55,7 +55,7 @@ if plugins is None:
sys.path.insert(0, plugin_path) sys.path.insert(0, plugin_path)
for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc', for plugin in ['pictureflow', 'lzx', 'msdes', 'podofo', 'cPalmdoc',
'fontconfig', 'calibre_poppler'] + \ 'fontconfig', 'pdfreflow'] + \
(['winutil'] if iswindows else []) + \ (['winutil'] if iswindows else []) + \
(['usbobserver'] if isosx else []): (['usbobserver'] if isosx else []):
try: try:

View File

@ -161,6 +161,7 @@ quick_metadata = QuickMetadata()
def get_file_type_metadata(stream, ftype): def get_file_type_metadata(stream, ftype):
mi = MetaInformation(None, None) mi = MetaInformation(None, None)
ftype = ftype.lower().strip() ftype = ftype.lower().strip()
if _metadata_readers.has_key(ftype): if _metadata_readers.has_key(ftype):
for plugin in _metadata_readers[ftype]: for plugin in _metadata_readers[ftype]:
@ -168,6 +169,8 @@ def get_file_type_metadata(stream, ftype):
with plugin: with plugin:
try: try:
plugin.quick = quick_metadata.quick plugin.quick = quick_metadata.quick
if hasattr(stream, 'seek'):
stream.seek(0)
mi = plugin.get_metadata(stream, ftype.lower().strip()) mi = plugin.get_metadata(stream, ftype.lower().strip())
break break
except: except:

View File

@ -10,6 +10,7 @@ import sys, os, re, shutil
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
from calibre.constants import iswindows, isosx from calibre.constants import iswindows, isosx
from calibre.libunzip import update from calibre.libunzip import update
from calibre import prints
def option_parser(): def option_parser():
parser = OptionParser(usage='''\ parser = OptionParser(usage='''\
@ -28,6 +29,8 @@ Run an embedded python interpreter.
help='Debug the specified device driver.') help='Debug the specified device driver.')
parser.add_option('-g', '--gui', default=False, action='store_true', parser.add_option('-g', '--gui', default=False, action='store_true',
help='Run the GUI',) help='Run the GUI',)
parser.add_option('--paths', default=False, action='store_true',
help='Output the paths necessary to setup the calibre environment')
parser.add_option('--migrate', action='store_true', default=False, parser.add_option('--migrate', action='store_true', default=False,
help='Migrate old database. Needs two arguments. Path ' help='Migrate old database. Needs two arguments. Path '
'to library1.db and path to new library folder.') 'to library1.db and path to new library folder.')
@ -35,6 +38,9 @@ Run an embedded python interpreter.
help='Add a simple plugin (i.e. a plugin that consists of only a ' help='Add a simple plugin (i.e. a plugin that consists of only a '
'.py file), by specifying the path to the py file containing the ' '.py file), by specifying the path to the py file containing the '
'plugin code.') 'plugin code.')
parser.add_option('--pdfreflow', default=None,
help='Path to PDF file to try and reflow. Output will be placed in '
'current directory. ')
return parser return parser
@ -203,6 +209,15 @@ def main(args=sys.argv):
migrate(args[1], args[2]) migrate(args[1], args[2])
elif opts.add_simple_plugin is not None: elif opts.add_simple_plugin is not None:
add_simple_plugin(opts.add_simple_plugin) add_simple_plugin(opts.add_simple_plugin)
elif opts.paths:
prints('CALIBRE_RESOURCES_LOCATION='+sys.resources_location)
prints('CALIBRE_EXTENSIONS_LOCATION='+sys.extensions_location)
prints('CALIBRE_PYTHON_PATH='+os.pathsep.join(sys.path))
elif opts.pdfreflow:
from calibre.ebooks.pdf.reflow import option_parser as px, run
from calibre.utils.logging import default_log
opts2, args = px().parse_args(['xxxx', '-vvvv', opts.pdfreflow])
run(opts2, opts.pdfreflow, default_log)
else: else:
from IPython.Shell import IPShellEmbed from IPython.Shell import IPShellEmbed
ipshell = IPShellEmbed() ipshell = IPShellEmbed()

View File

@ -45,7 +45,7 @@ class FB2Input(InputFormatPlugin):
log.debug('Parsing XML...') log.debug('Parsing XML...')
parser = etree.XMLParser(recover=True, no_network=True) parser = etree.XMLParser(recover=True, no_network=True)
doc = etree.parse(stream, parser) doc = etree.fromstring(stream.read())
self.extract_embedded_content(doc) self.extract_embedded_content(doc)
log.debug('Converting XML to HTML...') log.debug('Converting XML to HTML...')
ss = open(P('templates/fb2.xsl'), 'rb').read() ss = open(P('templates/fb2.xsl'), 'rb').read()

View File

@ -130,7 +130,7 @@ def metadata_from_filename(name, pat=None):
au = match.group('author') au = match.group('author')
aus = string_to_authors(au) aus = string_to_authors(au)
mi.authors = aus mi.authors = aus
except IndexError: except (IndexError, ValueError):
pass pass
try: try:
mi.series = match.group('series') mi.series = match.group('series')

View File

@ -666,7 +666,7 @@ class OPF(object):
for key in matches[0].attrib: for key in matches[0].attrib:
if key.endswith('file-as'): if key.endswith('file-as'):
matches[0].attrib.pop(key) matches[0].attrib.pop(key)
matches[0].set('file-as', unicode(val)) matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], unicode(val))
return property(fget=fget, fset=fset) return property(fget=fget, fset=fset)

View File

@ -3,59 +3,55 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''Read meta information from PDF files''' '''Read meta information from PDF files'''
import sys, os, cStringIO from functools import partial
from calibre import prints
from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
pdfreflow, pdfreflow_error = plugins['pdfreflow']
def get_metadata(stream, cover=True):
    '''
    Read metadata from the PDF in `stream` using the pdfreflow plugin.

    :param stream: File-like object; its remaining contents are read in full.
    :param cover: Passed on to the plugin. When true and the plugin returns
        a ``cover`` entry, it is stored as PNG cover data.
    :return: A MetaInformation object.
    :raises RuntimeError: If the pdfreflow plugin could not be loaded.
    '''
    if pdfreflow is None:
        raise RuntimeError(pdfreflow_error)
    info = pdfreflow.get_metadata(stream.read(), cover)

    title = info.get('Title', None)
    author = info.get('Author', None)
    authors = [_('Unknown')] if author is None else string_to_authors(author)
    mi = MetaInformation(title, authors)

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    # Tags are the comma separated keywords, with the subject (if any) first
    keywords = info.get('Keywords', None)
    mi.tags = [x.strip() for x in keywords.split(',')] if keywords else []
    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if cover and 'cover' in info:
        data = info['cover']
        if data is not None:
            mi.cover_data = ('png', data)
        else:
            # The plugin reports a cover of None for encrypted documents
            prints(title, 'is an encrypted document, cover extraction not allowed.')

    return mi
get_quick_metadata = partial(get_metadata, cover=False)
import cStringIO
from threading import Thread from threading import Thread
from calibre import StreamReadWrapper
from calibre.ptempfile import TemporaryDirectory
try:
from calibre.utils.PythonMagickWand import \
NewMagickWand, MagickReadImage, MagickSetImageFormat, \
MagickWriteImage, ImageMagick
_imagemagick_loaded = True
except:
_imagemagick_loaded = False
from calibre.ebooks.metadata import MetaInformation, string_to_authors, authors_to_string
from calibre.utils.pdftk import set_metadata as pdftk_set_metadata from calibre.utils.pdftk import set_metadata as pdftk_set_metadata
from calibre.utils.podofo import get_metadata as podofo_get_metadata, \ from calibre.utils.podofo import set_metadata as podofo_set_metadata, Unavailable
set_metadata as podofo_set_metadata, Unavailable, get_metadata_quick
from calibre.utils.poppler import get_metadata as get_metadata_poppler, NotAvailable
def get_quick_metadata(stream):
try:
return get_metadata_poppler(stream, False)
except NotAvailable:
pass
return get_metadata_pypdf(stream)
raw = stream.read()
mi = get_metadata_quick(raw)
if mi.title == '_':
mi.title = getattr(stream, 'name', _('Unknown'))
mi.title = mi.title.rpartition('.')[0]
return mi
def get_metadata(stream, extract_cover=True):
try:
return get_metadata_poppler(stream, extract_cover)
except NotAvailable:
pass
try:
with TemporaryDirectory('_pdfmeta') as tdir:
cpath = os.path.join(tdir, 'cover.pdf')
if not extract_cover:
cpath = None
mi = podofo_get_metadata(stream, cpath=cpath)
if mi.cover is not None:
cdata = get_cover(mi.cover)
mi.cover = None
if cdata is not None:
mi.cover_data = ('jpg', cdata)
except Unavailable:
mi = get_metadata_pypdf(stream)
return mi
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) stream.seek(0)
@ -70,25 +66,6 @@ def set_metadata(stream, mi):
set_metadata_pypdf(stream, mi) set_metadata_pypdf(stream, mi)
def get_metadata_pypdf(stream):
""" Return metadata as a L{MetaInfo} object """
from pyPdf import PdfFileReader
mi = MetaInformation(_('Unknown'), [_('Unknown')])
try:
with StreamReadWrapper(stream) as stream:
info = PdfFileReader(stream).getDocumentInfo()
if info.title:
mi.title = info.title
if info.author:
mi.author = info.author
mi.authors = string_to_authors(info.author)
if info.subject:
mi.category = info.subject
except Exception, err:
msg = u'Couldn\'t read metadata from pdf: %s with error %s'%(mi.title, unicode(err))
print >>sys.stderr, msg.encode('utf8')
return mi
class MetadataWriter(Thread): class MetadataWriter(Thread):
def __init__(self, out_pdf, buf): def __init__(self, out_pdf, buf):
@ -132,13 +109,4 @@ def set_metadata_pypdf(stream, mi):
stream.write(out_str.read()) stream.write(out_str.read())
stream.seek(0) stream.seek(0)
def get_cover(cover_path):
with ImageMagick():
wand = NewMagickWand()
MagickReadImage(wand, cover_path)
MagickSetImageFormat(wand, 'JPEG')
MagickWriteImage(wand, '%s.jpg' % cover_path)
return open('%s.jpg' % cover_path, 'rb').read()

View File

@ -0,0 +1,143 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include "fonts.h"
#include "utils.h"
using namespace calibre_reflow;
using namespace std;
// Build a colour from a GfxRGB value by rescaling each channel from the
// 0..65535 range to 0..255. If any channel ends up out of range the colour
// falls back to black.
XMLColor::XMLColor(GfxRGB rgb) {
    const double scale = 255.0;
    this->r = static_cast<int>(rgb.r/65535.0*scale);
    this->g = static_cast<int>(rgb.g/65535.0*scale);
    this->b = static_cast<int>(rgb.b/65535.0*scale);

    const bool in_range = this->ok(this->r) && this->ok(this->b) && this->ok(this->g);
    if (!in_range) {
        this->r = 0;
        this->g = 0;
        this->b = 0;
    }
}
// Format the colour as "rgb(R,G,B)" with decimal channel values.
string XMLColor::str() const {
    ostringstream out;
    out << "rgb(";
    out << this->r << "," << this->g << "," << this->b;
    out << ")";
    return out.str();
}
// Style suffixes that family_name() strips from a font name to obtain the
// family name. The table is NULL terminated and scanned in order, stopping
// at the first hit, so longer suffixes ("-bolditalic") must stay ahead of
// their prefixes ("-bold").
static const char *FONT_MODS[7] = {
"-bolditalic", "-boldoblique", "-bold", "-italic", "-oblique", "-roman",
NULL
};
// toupper() on a value first cast to unsigned char, so that chars with the
// high bit set are never passed to toupper() as negative ints.
#define ap_toupper(c) (toupper(((unsigned char)(c))))
// Case-insensitive substring search.
// Returns a pointer to the first occurrence of n inside h, or 0 when either
// argument is NULL or empty, or when n does not occur in h.
// NOTE(review): callers in this file pass const char* (from c_str()) while
// this overload takes char*; confirm which strcasestr is actually selected
// on each target platform.
static inline
char *strcasestr( char *h, char *n )
{ /* h="haystack", n="needle" */
// a walks the haystack from the current candidate start h; e walks the needle
char *a=h, *e=n;
if( !h || !*h || !n || !*n ) { return 0; }
while( *a && *e ) {
if( ap_toupper(*a)!=ap_toupper(*e) ) {
// mismatch: restart the comparison one character further into the haystack
++h; a=h; e=n;
}
else {
++a; ++e;
}
}
// the needle was fully consumed only if a complete match was found at h
return *e ? 0 : h;
}
// Derive a font family name from a full font name by removing the first
// style suffix from FONT_MODS that occurs in it (case-insensitively).
// Returns a newly allocated string owned by the caller, or NULL when
// font_name is NULL.
static string* family_name(const string *font_name) {
    if (!font_name) return NULL;
    string *family = new string(*font_name);
    for (const char **mod = FONT_MODS; *mod != NULL; ++mod) {
        const char *hit = strcasestr(family->c_str(), *mod);
        if (hit == NULL) continue;
        // Only the first matching suffix is removed
        size_t offset = hit - family->c_str();
        family->replace(offset, strlen(*mod), "");
        break;
    }
    return family;
}
// Build a font description from a font name, a size and a colour.
//   font_name - name of the font; the XMLFont takes ownership of the string.
//               May be NULL, in which case DEFAULT_FONT_FAMILY is used.
//   size      - font size; the stored size is size-1.
//   rgb       - text colour.
// The family name is the font name with its style suffix removed, and the
// bold/italic flags are inferred from the font name.
XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) :
    size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name),
    font_family(NULL), color(rgb) {
    if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY);
    this->font_family = family_name(this->font_name);
    // Use the member, not the parameter: the parameter is still NULL when the
    // default family was substituted above.
    const char *name = this->font_name->c_str();
    if (strcasestr(name, "bold")) this->bold = true;
    if (strcasestr(name, "italic") ||
        strcasestr(name, "oblique")) this->italic = true;
}
// Deep-copy assignment: scalar fields are copied and the owned name/family
// strings are replaced by fresh copies of x's strings.
XMLFont& XMLFont::operator=(const XMLFont& x){
    if (this != &x) {
        this->size = x.size;
        this->line_size = x.line_size;
        this->italic = x.italic;
        this->bold = x.bold;
        this->color = x.color;

        // delete on a NULL pointer is a no-op, so no explicit check is needed
        delete this->font_name;
        this->font_name = new string(*x.font_name);
        delete this->font_family;
        this->font_family = new string(*x.font_family);
    }
    return *this;
}
// Two fonts are equal when size and line size agree to within 0.1, the
// italic/bold flags and colour match, and the family names are identical.
bool XMLFont::operator==(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (this->italic != f.italic) return false;
    if (this->bold != f.bold) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Like operator== but ignoring the bold and italic flags: size and line
// size must agree to within 0.1, and colour and family must match.
bool XMLFont::eq_upto_inline(const XMLFont &f) const {
    if (!(fabs(this->size - f.size) < 0.1)) return false;
    if (!(fabs(this->line_size - f.line_size) < 0.1)) return false;
    if (!(this->color == f.color)) return false;
    return (*this->font_family) == (*f.font_family);
}
// Serialise the font as a <font .../> XML element carrying the given id,
// the XML-escaped family name, the colour and the size with two decimals.
string XMLFont::str(Fonts::size_type id) const {
    ostringstream xml;
    xml << "<font id=\"" << id << "\" ";
    xml << "family=\"" << encode_for_xml(*this->font_family) << "\" ";
    xml << "color=\"" << this->color.str() << "\" ";
    xml << setiosflags(ios::fixed) << setprecision(2);
    xml << "size=\"" << this->size << "\"";
    xml << "/>";
    return xml.str();
}
// Return the index of a stored font equal to *f. If there is none, append f
// and return its new index.
Fonts::size_type Fonts::add_font(XMLFont *f) {
    const size_type count = this->size();
    for (size_type idx = 0; idx < count; idx++) {
        if (*(*this)[idx] == *f) return idx;
    }
    this->push_back(f);
    return count;
}
// Convenience overload: construct an XMLFont from its parts and register it
// through add_font(XMLFont*), returning the resulting index.
Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) {
    return this->add_font(new XMLFont(font_name, size, rgb));
}
// The collection owns its fonts: delete every stored XMLFont, then empty
// the vector.
Fonts::~Fonts() {
    for (size_type idx = 0; idx < this->size(); idx++) {
        delete (*this)[idx];
    }
    this->clear();
}

View File

@ -0,0 +1,105 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef CALIBRE_REFLOW_FONTS
#define CALIBRE_REFLOW_FONTS
#include <vector>
#include <sstream>
#include <iomanip>
#include <ctype.h>
#include <math.h>
#include <GfxState.h>
using namespace std;
#define DEFAULT_FONT_FAMILY "Times New Roman"
namespace calibre_reflow {
// An RGB colour with 8-bit channels, printable as "rgb(R,G,B)".
class XMLColor {
    private:
        unsigned int r;
        unsigned int g;
        unsigned int b;
        // A channel is valid when it fits in 8 bits. The value is unsigned,
        // so only the upper bound can ever fail.
        inline bool ok(unsigned int xcol) const {
            return xcol <= 255;
        }
    public:
        // Black
        XMLColor() : r(0), g(0), b(0) {}
        XMLColor(GfxRGB rgb);
        XMLColor(const XMLColor& x) : r(x.r), g(x.g), b(x.b) {}
        XMLColor& operator=(const XMLColor &x) {
            r = x.r;
            g = x.g;
            b = x.b;
            return *this;
        }
        ~XMLColor() {}
        string str() const;
        // Channel-wise equality
        bool operator==(const XMLColor &col) const {
            return r == col.r && g == col.g && b == col.b;
        }
};
// Description of a font as used in the generated XML: size, line size,
// bold/italic flags, font name, family name and colour.
class XMLFont {
    private:
        double size;
        double line_size;
        bool italic;
        bool bold;
        string *font_name;   // owned, deleted in the destructor
        string *font_family; // owned, deleted in the destructor
        XMLColor color;

    public:
        // Font with the given family (also used as the font name), default
        // colour and no line size set (-1.0).
        XMLFont(const char *font_family=DEFAULT_FONT_FAMILY, double size=12.0) :
            size(size), line_size(-1.0), italic(false), bold(false),
            font_name(new string(font_family)), font_family(new string(font_family)),
            color() {}

        XMLFont(string* font_name, double size, GfxRGB rgb);

        // Deep copy. Both strings are duplicated: each XMLFont deletes its
        // own font_name and font_family, so sharing either pointer between
        // two instances would lead to a double delete.
        XMLFont(const XMLFont& other) :
            size(other.size), line_size(other.line_size), italic(other.italic),
            bold(other.bold), font_name(new string(*other.font_name)),
            font_family(new string(*other.font_family)), color(other.color) {}

        XMLColor get_color() { return this->color; }
        // The returned string remains owned by this object
        string* get_font_name() { return this->font_name; }
        double get_size() const { return this->size; }
        double get_line_size() { return this->line_size; }
        void set_line_size(double ls) { this->line_size = ls; }
        bool is_italic() const { return this->italic; }
        bool is_bold() const { return this->bold; }

        ~XMLFont() { delete this->font_name; delete this->font_family; }

        XMLFont& operator=(const XMLFont& other);
        bool operator==(const XMLFont &other) const;
        bool eq_upto_inline(const XMLFont &f) const;
        string str(vector<XMLFont*>::size_type id) const;
};
// Collection of XMLFont pointers. add_font() returns the index of the font
// within the collection.
class Fonts : public vector<XMLFont*> {
    public:
        size_type add_font(XMLFont *f);
        size_type add_font(string* font_name, double size, GfxRGB rgb);
        ~Fonts();
};
}
#endif

View File

@ -0,0 +1,289 @@
#include <stdio.h>
#include <errno.h>
#include <sstream>
#include <algorithm>
#include <iomanip>
#include <math.h>
#include <iostream>
#include <wand/MagickWand.h>
#include "images.h"
#include "utils.h"
#define xoutRound(x) ( static_cast<int>(round(x)) )
using namespace std;
using namespace calibre_reflow;
// Compute the placement of an image from the graphics state: rounded
// position (x0, y0), rounded size (w0, h0), whether it is rotated, the
// size after accounting for rotation (w1, h1) and the x/y flip flags.
// Statement order matters: xt and yt are reused as scratch values below.
calibre_reflow::ImageInfo::ImageInfo(GfxState *state) {
// get image position and size
state->transform(0, 0, &xt, &yt);
state->transformDelta(1, 1, &wt, &ht);
// A negative extent means the image grows towards smaller coordinates:
// move the origin and store the positive size.
if (wt > 0) {
x0 = xoutRound(xt);
w0 = xoutRound(wt);
} else {
x0 = xoutRound(xt + wt);
w0 = xoutRound(-wt);
}
if (ht > 0) {
y0 = xoutRound(yt);
h0 = xoutRound(ht);
} else {
y0 = xoutRound(yt + ht);
h0 = xoutRound(-ht);
}
// xt, yt now hold the transformed (1, 0) delta; the image counts as rotated
// when that delta has the larger magnitude in y.
state->transformDelta(1, 0, &xt, &yt);
rotate = fabs(xt) < fabs(yt);
if (rotate) {
// width and height swap roles for a rotated image
w1 = h0;
h1 = w0;
x_flip = ht < 0;
y_flip = wt > 0;
} else {
w1 = w0;
h1 = h0;
x_flip = wt < 0;
y_flip = ht > 0;
}
//cout << x_flip << "|" << y_flip << endl;
}
void XMLImages::clear() {
vector<XMLImage*>::iterator it;
for (it = this->masks.begin(); it < this->masks.end(); it++)
delete *it;
for (it = this->images.begin(); it < this->images.end(); it++)
delete *it;
this->masks.clear();
this->images.clear();
}
// Currently a no-op: the body is empty, so nothing is recorded or written
// for image masks and all arguments are ignored.
void XMLImages::add_mask(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, bool invert,
bool interpolate, bool inline_img) {
}
// Turn the pending error on `wand` into a ReflowException.
// The error description is copied into a message, then the description, the
// wand and the MagickWand environment are released before throwing. This
// function always throws and never returns normally.
// NOTE(review): the exception is constructed from the c_str() of a temporary
// string; confirm that ReflowException copies the message.
static void throw_magick_exception(MagickWand *wand) {
ExceptionType severity;
char *description = MagickGetException(wand, &severity);
ostringstream oss;
oss << description << endl;
description=(char *) MagickRelinquishMemory(description);
wand = DestroyMagickWand(wand);
MagickWandTerminus();
throw ReflowException(oss.str().c_str());
}
// Mirror the image stored in file_name in place: y_flip applies
// MagickFlipImage and x_flip applies MagickFlopImage. The result is written
// back with a NULL file name. Any MagickWand failure is raised through
// throw_magick_exception().
static void flip_image(string file_name, bool x_flip, bool y_flip) {
    MagickWandGenesis();
    MagickWand *wand = NewMagickWand();

    if (MagickReadImage(wand, file_name.c_str()) == MagickFalse)
        throw_magick_exception(wand);

    if (y_flip && MagickFlipImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    if (x_flip && MagickFlopImage(wand) == MagickFalse)
        throw_magick_exception(wand);

    if (MagickWriteImage(wand, NULL) == MagickFalse)
        throw_magick_exception(wand);

    wand = DestroyMagickWand(wand);
    MagickWandTerminus();
}
// Record a new image and write its pixel data to a file in the current
// directory (the name comes from file_name()).
//   state          - graphics state used to compute the image placement
//   str            - source stream; DCT streams are copied verbatim as JPEG,
//                    everything else is rendered to an RGB PNG
//   width, height  - image dimensions in pixels
//   colorMap       - colour map used to convert pixels to RGB (PNG path)
// ref, interpolate, maskColors and inline_img are not used here.
// Throws ReflowException if the output file cannot be opened or the row
// buffer cannot be allocated; exceptions raised while writing the PNG are
// propagated after the file, row buffer and image stream are released.
void XMLImages::add(GfxState *state, Object *ref, Stream *str,
        unsigned int width, unsigned int height, GfxImageColorMap *colorMap,
        bool interpolate, int *maskColors, bool inline_img) {
    XMLImage *img = new XMLImage(state);
    this->images.push_back(img);
    img->width = width; img->height = height;
    img->type = (str->getKind() == strDCT) ? jpeg : png;
    string file_name = this->file_name(img);

    FILE *of = fopen(file_name.c_str(), "wb");
    if (!of) throw ReflowException(strerror(errno));

    if (img->type == jpeg) {
        int c;
        str = ((DCTStream *)str)->getRawStream();
        str->reset();
        // copy the stream
        while ((c = str->getChar()) != EOF) fputc(c, of);
    } else { //Render as PNG
        png_byte *row = NULL;
        PNGWriter *writer = NULL;
        ImageStream *imgStr = NULL;
        try {
            row = (png_byte *) malloc(3 * width); // 3 bytes/pixel: RGB
            // malloc(0) may legitimately return NULL, so only treat NULL as
            // a failure when a non-empty row was requested
            if (row == NULL && width != 0)
                throw ReflowException("Out of memory allocating PNG row buffer");
            png_bytep *row_pointer = &row;

            writer = new PNGWriter();
            writer->init(of, width, height);

            // Initialize the image stream
            imgStr = new ImageStream(str, width,
                    colorMap->getNumPixelComps(), colorMap->getBits());
            imgStr->reset();

            Guchar *p;
            GfxRGB rgb;
            // For each line...
            for (unsigned int y = 0; y < height; y++) {
                // Convert into a PNG row
                p = imgStr->getLine();
                for (unsigned int x = 0; x < width; x++) {
                    colorMap->getRGB(p, &rgb);
                    // Write the RGB pixels into the row
                    row[3*x]= colToByte(rgb.r);
                    row[3*x+1]= colToByte(rgb.g);
                    row[3*x+2]= colToByte(rgb.b);
                    p += colorMap->getNumPixelComps();
                }
                writer->writeRow(row_pointer);
            }
            writer->close();
        } catch (...) {
            // Do not leak the output file, the row buffer or the image
            // stream when writing fails part way through.
            if (imgStr) { imgStr->close(); delete imgStr; }
            free(row);
            fclose(of);
            // NOTE(review): writer is deliberately not deleted here, as it
            // was not on this path before; whether ~PNGWriter is safe after
            // a failed init() cannot be confirmed from this file.
            throw;
        }
        delete writer;
        free(row);
        imgStr->close();
        delete imgStr;
    }
    fclose(of);
    img->written = true;
    if (img->info.x_flip || img->info.y_flip)
        flip_image(file_name, img->info.x_flip, img->info.y_flip);
}
// Build the output file name for img: "image-N.ext" when img is in the
// image list, otherwise "mask-N.ext" with N taken from its position in the
// mask list. N is the 1-based position and ext is "jpg" for JPEG images and
// "png" for everything else.
string XMLImages::file_name(const XMLImage *img) const {
    vector<XMLImage*>::const_iterator pos =
        find(this->images.begin(), this->images.end(), img);
    const bool mask = (pos == this->images.end());

    size_t idx = 0;
    if (mask) {
        idx = find(this->masks.begin(), this->masks.end(), img) - this->masks.begin();
    } else {
        idx = pos - this->images.begin();
    }

    const char *kind = (mask) ? "mask" : "image";
    const char *ext = (img->type == jpeg) ? "jpg" : "png";
    ostringstream name;
    name << kind << "-" << idx+1 << '.';
    name << ext;
    return name.str();
}
/**
 * Serialise every image that has been written to disk.
 *
 * Returns heap-allocated strings (masks first, then images); the caller
 * owns them and must delete each one.
 */
vector<string*> XMLImages::str() const {
    vector<string*> result;

    for (size_t i = 0; i < this->masks.size(); i++) {
        const XMLImage *mask = this->masks[i];
        if (!mask->written) continue;
        result.push_back(new string(mask->str(i, true, this->file_name(mask))));
    }

    for (size_t i = 0; i < this->images.size(); i++) {
        const XMLImage *image = this->images[i];
        if (!image->written) continue;
        result.push_back(new string(image->str(i, false, this->file_name(image))));
    }

    return result;
}
/**
 * Render this image as an <img .../> element.
 *
 * mask selects type="mask" versus type="image"; file_name goes into the src
 * attribute. num is not used.
 */
string XMLImage::str(size_t num, bool mask, string file_name) const {
    const char *kind = mask ? "mask" : "image";

    ostringstream out;
    out << "<img type=\"" << kind << "\" ";
    out << "src=\"" << file_name << "\" ";
    out << "iwidth=\"" << this->width << "\" iheight=\"" << this->height << "\" ";
    out << "rwidth=\"" << this->info.w1 << "\" rheight=\"" << this->info.h1 << "\" ";
    // fixed notation with two decimals applies to the position attributes
    out << setiosflags(ios::fixed) << setprecision(2);
    out << "top=\"" << this->info.y0 << "\" left=\"" << this->info.x0 << "\"/>";
    return out.str();
}
/** Release the libpng write and info structures held by this writer. */
PNGWriter::~PNGWriter()
{
    // hand both handles back to libpng in one call
    png_destroy_write_struct(&(this->png_ptr), &(this->info_ptr));
}
void PNGWriter::init(FILE *f, int width, int height)
{
/* initialize stuff */
png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr)
throw ReflowException("png_create_write_struct failed");
info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr)
throw ReflowException("png_create_info_struct failed");
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("png_jmpbuf failed");
/* write header */
png_init_io(png_ptr, f);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("Error during writing header");
// Set up the type of PNG image and the compression level
png_set_compression_level(png_ptr, Z_BEST_COMPRESSION);
png_byte bit_depth = 8;
png_byte color_type = PNG_COLOR_TYPE_RGB;
png_byte interlace_type = PNG_INTERLACE_NONE;
png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type, interlace_type, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
if (setjmp(png_jmpbuf(png_ptr)))
throw ReflowException("error during writing png info bytes");
}
/**
 * Write a complete image from an array of row pointers.
 *
 * Throws ReflowException if libpng reports an error while writing.
 */
void PNGWriter::writePointers(png_bytep *rowPointers)
{
    // libpng signals errors with longjmp(); the jump target has to be armed
    // before the call it guards, otherwise the error lands in a stale frame.
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("Error during writing bytes");
    /* write bytes */
    png_write_image(png_ptr, rowPointers);
}
/**
 * Write a single row of pixel data to the PNG file.
 *
 * Throws ReflowException if libpng reports an error while writing.
 */
void PNGWriter::writeRow(png_bytep *row)
{
    // Arm the libpng error handler before the call it guards; libpng
    // longjmp()s to the most recently registered jump buffer on failure.
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("error during png row write");
    // Write the row to the file
    png_write_rows(png_ptr, row, 1);
}
/**
 * Finish the PNG stream (writes the trailing chunks).
 *
 * Throws ReflowException if libpng reports an error while finishing.
 */
void PNGWriter::close()
{
    // The jump target must be registered before png_write_end() so that a
    // libpng error is reported from this (still live) stack frame.
    if (setjmp(png_jmpbuf(png_ptr)))
        throw ReflowException("Error during end of write");
    /* end write */
    png_write_end(png_ptr, info_ptr);
}
/**
 * Write an entire SplashBitmap by handing libpng one pointer per bitmap row.
 *
 * The row-pointer table lives in a std::vector so it is released even when
 * writePointers() throws. A bitmap with no rows is ignored.
 */
void PNGWriter::write_splash_bitmap(SplashBitmap *bitmap) {
    SplashColorPtr row = bitmap->getDataPtr();
    const int height = bitmap->getHeight();
    const int row_size = bitmap->getRowSize();

    if (height <= 0) return; // nothing to write, and &v[0] would be invalid

    std::vector<png_bytep> row_pointers(height);
    for (int y = 0; y < height; ++y) {
        row_pointers[y] = row;
        row += row_size;
    }

    this->writePointers(&row_pointers[0]);
}

View File

@ -0,0 +1,94 @@
#ifndef _CALIBRE_REFLOW_IMAGES
#define _CALIBRE_REFLOW_IMAGES
#include <vector>
#include <GfxState.h>
#include <splash/SplashBitmap.h>
#include <png.h>
using namespace std;
namespace calibre_reflow {
// On-disk format of an extracted image. XMLImages::add picks jpeg for DCT
// streams and png for everything else; the value also selects the file
// extension ("jpg" / "png") in XMLImages::file_name.
enum ImageType {
jpeg, png
};
class PNGWriter
{
public:
PNGWriter() {}
~PNGWriter();
void init(FILE *f, int width, int height);
void writePointers(png_bytep *rowPointers);
void writeRow(png_bytep *row);
void write_splash_bitmap(SplashBitmap *bitmap);
void close();
private:
png_structp png_ptr;
png_infop info_ptr;
};
// Placement information for one image, computed from the graphics state
// passed to the constructor (the constructor body is defined elsewhere).
// All fields are private; XMLImage and XMLImages read them as friends.
class ImageInfo {
public:
ImageInfo(GfxState *state);
private:
int x0, y0; // top left corner of image
// w1/h1 are emitted as the rwidth/rheight attributes by XMLImage::str;
// NOTE(review): w0/h0 presumably hold the untransformed size -- confirm.
int w0, h0, w1, h1; // size of image
// NOTE(review): meaning of xt/yt/wt/ht is not visible here -- confirm
// against the constructor.
double xt, yt, wt, ht;
// x_flip / y_flip decide whether XMLImages::add flips the written file.
bool rotate, x_flip, y_flip;
friend class XMLImage;
friend class XMLImages;
};
// One image extracted from a page. Instances are created and filled in by
// XMLImages (a friend), which also owns them.
class XMLImage {
private:
// x and y are initialised to 0 by the constructor.
double x, y;
// Pixel dimensions of the source image, set by XMLImages::add.
unsigned int width, height;
// Output format; defaults to jpeg until XMLImages::add decides.
ImageType type;
// Set to true by XMLImages::add once the image file has been written.
bool written;
// Placement data derived from the graphics state.
ImageInfo info;
friend class XMLImages;
public:
XMLImage(GfxState *state) :
x(0.), y(0.), width(0), height(0), type(jpeg), written(false), info(state)
{}
~XMLImage() {}
// Serialise as an <img .../> element; mask selects type="mask" and
// file_name becomes the src attribute.
string str(size_t num, bool mask, string file_name) const;
};
// Collection of the images (and image masks) found on the current page.
class XMLImages {
private:
// Regular images, in the order add() received them.
vector<XMLImage*> images;
// Image masks; file_name() falls back to this list when an image is
// not found in `images`.
vector<XMLImage*> masks;
public:
// The destructor delegates to clear().
~XMLImages() { this->clear(); }
void add_mask(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, bool invert,
bool interpolate, bool inline_img);
// Extract an image from str, write it to disk and record it.
void add(GfxState *state, Object *ref, Stream *str,
unsigned int width, unsigned int height, GfxImageColorMap *colorMap,
bool interpolate, int *maskColors, bool inline_img);
// File name used for img: "image-N.ext" or "mask-N.ext".
string file_name(const XMLImage *img) const;
// XML for every written image; the caller deletes the returned strings.
vector<string*> str() const;
void clear();
};
}
#endif

View File

@ -0,0 +1,56 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include "links.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
/**
 * Copy-assign: copies the rectangle and takes a deep copy of the
 * destination string.
 *
 * A source link without a destination (dest == NULL, as produced by the
 * default constructor) is handled, and the new string is allocated before
 * the old one is released so a failed allocation leaves *this untouched.
 */
XMLLink& XMLLink::operator=(const XMLLink &x) {
    if (this == &x) return *this;

    string *copy = (x.dest != NULL) ? new string(*x.dest) : NULL;
    delete this->dest;
    this->dest = copy;

    this->x_min = x.x_min;
    this->y_min = x.y_min;
    this->x_max = x.x_max;
    this->y_max = x.y_max;
    return *this;
}
/**
 * Test whether a box belongs to this link: the vertical midpoint of the box
 * must lie strictly above y_min and not above y_max, and the box must
 * overlap the link horizontally.
 */
bool XMLLink::in_link(double xmin,double ymin,double xmax,double ymax) const {
    const double mid_y = (ymin + ymax)/2;
    const bool vertical_hit = !(mid_y > this->y_max) && (mid_y > this->y_min);
    const bool horizontal_hit = (xmin < this->x_max) && (xmax > this->x_min);
    return vertical_hit && horizontal_hit;
}
/**
 * Return the opening <a href="..."> tag for this link. The destination is
 * passed through encode_for_xml; a link without a destination yields an
 * empty href.
 */
string XMLLink::get_link_start() {
    ostringstream tag;
    tag << "<a href=\"";
    if (this->dest != NULL) {
        tag << encode_for_xml(*this->dest);
    }
    tag << "\">";
    return tag.str();
}
/** Delete every link stored in the container, then empty it. */
XMLLinks::~XMLLinks() {
    const XMLLinks::size_type count = this->size();
    for (XMLLinks::size_type idx = 0; idx < count; ++idx) {
        delete (*this)[idx];
    }
    this->clear();
}
bool XMLLinks::in_link(double xmin, double ymin, double xmax,
double ymax, XMLLinks::size_type &p) const {
for(XMLLinks::const_iterator i = this->begin(); i != this->end(); i++) {
if ( (*i)->in_link(xmin, ymin, xmax, ymax) ) {
p = (i - this->begin());
return true;
}
}
return false;
}

View File

@ -0,0 +1,69 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef _CALIBRE_XML_LINKS
#define _CALIBRE_XML_LINKS
#include <vector>
#include <sstream>
using namespace std;
namespace calibre_reflow {
class XMLLink {
private:
double x_min;
double y_min;
double x_max;
double y_max;
string* dest;
public:
XMLLink() : dest(NULL) {}
XMLLink(const XMLLink& x) :
x_min(x.x_min), y_min(x.y_min), x_max(x.x_max),
y_max(x.y_max), dest(new string(*x.dest)) {}
XMLLink(double x_min, double y_min, double x_max,
double y_max, const char *dest) :
x_min((x_min < x_max) ? x_min : x_max),
y_min((y_min < y_max) ? y_min : y_max),
x_max((x_max > x_min) ? x_max : x_min),
y_max((y_max > y_min) ? y_max : y_min),
dest(new string(dest)) {}
~XMLLink() { delete this->dest; }
string* get_dest() { return this->dest; }
double get_x1() const {return x_min;}
double get_x2() const {return x_max;}
double get_y1() const {return y_min;}
double get_y2() const {return y_max;}
XMLLink& operator=(const XMLLink &x);
bool operator==(const XMLLink &x) const {
return (this->dest != NULL) && (x.dest != NULL) &&
this->dest->compare(*x.dest) == 0;
}
bool in_link(double xmin, double ymin, double xmax, double ymax) const;
string get_link_start();
};
// The links of one page, stored as owning pointers: the destructor deletes
// every element before clearing the vector.
class XMLLinks : public vector<XMLLink*> {
public:
~XMLLinks();
// Look up the first link containing the given box; on success its index
// is written to p and true is returned.
bool in_link(double xmin, double ymin, double xmax,
double ymax, XMLLinks::size_type &p) const;
};
}
#endif

View File

@ -0,0 +1,198 @@
#ifndef PDF2XML
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#endif
#include "reflow.h"
using namespace std;
using namespace calibre_reflow;
#ifndef PDF2XML
extern "C" {
/*
 * reflow(pdf_data): render the PDF held in the given byte string.
 * Returns None; any C++ exception is converted to a Python RuntimeError.
 */
static PyObject *
pdfreflow_reflow(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s#", &pdfdata, &size)) {
        return NULL;
    }

    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        reflow.render();
    } catch (std::exception &err) {
        PyErr_SetString(PyExc_RuntimeError, err.what());
        return NULL;
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while rendering PDF");
        return NULL;
    }

    Py_RETURN_NONE;
}
/*
 * get_metadata(pdf_data, cover): return a dict with the PDF's info entries
 * (unicode keys and values). When cover is true the dict also gets a
 * "cover" entry: the rendered first page as a byte string, or None for a
 * locked document.
 *
 * Returns a new reference, or NULL with a Python exception set. All
 * intermediate objects are released on every path (PyDict_SetItem and
 * PyDict_SetItemString do not steal references).
 */
static PyObject *
pdfreflow_get_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    map<string,string> info;
    PyObject *cover;

    // Parse first so that a bad call does not leak a freshly made dict.
    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &cover))
        return NULL;

    PyObject *ans = PyDict_New();
    if (!ans) return NULL;  // PyDict_New has already set the exception

    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        info = reflow.get_info();

        const int want_cover = PyObject_IsTrue(cover);
        if (want_cover == -1) { Py_DECREF(ans); return NULL; }

        if (want_cover) {
            if (!reflow.is_locked()) {
                size_t cover_size;
                char *data = reflow.render_first_page(&cover_size);
                PyObject *d = PyString_FromStringAndSize(data, cover_size);
                delete[] data;
                if (d == NULL) { Py_DECREF(ans); return NULL; }
                const int rc = PyDict_SetItemString(ans, "cover", d);
                Py_DECREF(d);  // the dict holds its own reference now
                if (rc == -1) { Py_DECREF(ans); return NULL; }
            } else {
                if (PyDict_SetItemString(ans, "cover", Py_None) == -1) {
                    Py_DECREF(ans);
                    return NULL;
                }
            }
        }
    } catch (std::exception &e) {
        Py_DECREF(ans);
        PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL;
    } catch (...) {
        Py_DECREF(ans);
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while getting metadata from PDF"); return NULL;
    }

    for (map<string,string>::const_iterator it = info.begin() ; it != info.end(); it++ ) {
        PyObject *key = PyUnicode_Decode((*it).first.c_str(), (*it).first.size(), "UTF-8", "replace");
        if (!key) { Py_DECREF(ans); return NULL; }
        PyObject *val = PyUnicode_Decode((*it).second.c_str(), (*it).second.size(), "UTF-8", "replace");
        if (!val) { Py_DECREF(key); Py_DECREF(ans); return NULL; }
        const int rc = PyDict_SetItem(ans, key, val);
        Py_DECREF(key);
        Py_DECREF(val);
        if (rc == -1) { Py_DECREF(ans); return NULL; }
    }
    return ans;
}
/*
 * set_metadata(pdf_data, info_dict): apply the unicode "Title", "Author"
 * and "Keywords" entries of info_dict to the PDF and return the result of
 * Reflow::set_info as a byte string.
 *
 * Entries that are missing, not unicode, or cannot be encoded as UTF-8 are
 * skipped. Raises ValueError for a non-dict argument or an encrypted PDF,
 * RuntimeError for any C++ exception.
 */
static PyObject *
pdfreflow_set_metadata(PyObject *self, PyObject *args) {
    char *pdfdata;
    Py_ssize_t size;
    PyObject *info;

    if (!PyArg_ParseTuple(args, "s#O", &pdfdata, &size, &info))
        return NULL;

    if (!PyDict_Check(info)) {
        PyErr_SetString(PyExc_ValueError, "Info object must be a dictionary.");
        return NULL;
    }

    char Title[10] = "Title", Author[10] = "Author", Keywords[10] = "Keywords";
    char *keys[3] = { Title, Author, Keywords };
    map<char *, char *> pinfo;

    // pinfo stores pointers into these UTF-8 byte strings, so they must
    // stay alive until set_info() has run; they are released below.
    PyObject *encoded[3] = { NULL, NULL, NULL };

    for (int i = 0; i < 3; i++) {
        PyObject *val = PyDict_GetItemString(info, keys[i]);  // borrowed
        if (!val || !PyUnicode_Check(val)) continue;
        encoded[i] = PyUnicode_AsUTF8String(val);
        if (!encoded[i]) {
            // Skipping the entry is deliberate (best effort), but the
            // pending exception must not leak into a successful return.
            PyErr_Clear();
            continue;
        }
        pinfo[keys[i]] = PyString_AS_STRING(encoded[i]);
    }

    PyObject *ans = NULL;
    try {
        Reflow reflow(pdfdata, static_cast<std::ifstream::pos_type>(size));
        if (reflow.is_locked()) {
            PyErr_SetString(PyExc_ValueError, "Setting metadata not possible in encrypted PDFs");
        } else {
            string result = reflow.set_info(pinfo);
            ans = PyString_FromStringAndSize(result.c_str(), result.size());
        }
    } catch (std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
    } catch (...) {
        PyErr_SetString(PyExc_RuntimeError,
                "Unknown exception raised while setting metadata in PDF");
    }

    for (int i = 0; i < 3; i++) Py_XDECREF(encoded[i]);

    return ans;  // NULL here always has a Python exception set
}
/* Method table of the pdfreflow extension module. */
static
PyMethodDef pdfreflow_methods[] = {
    {"reflow", pdfreflow_reflow, METH_VARARGS,
        "reflow(pdf_data)\n\n"
            "Reflow the specified PDF."
    },

    {"get_metadata", pdfreflow_get_metadata, METH_VARARGS,
        "get_metadata(pdf_data, cover)\n\n"
            "Get metadata and (optionally) cover from the specified PDF."
    },

    /* The signature line used to read "get_metadata(info_dict)"; the
       function is set_metadata and takes the PDF data plus the dict. */
    {"set_metadata", pdfreflow_set_metadata, METH_VARARGS,
        "set_metadata(pdf_data, info_dict)\n\n"
            "Set metadata in the specified PDF. Currently broken."
    },

    {NULL, NULL, 0, NULL}
};
/* Module entry point: registers the pdfreflow method table with Python. */
PyMODINIT_FUNC
initpdfreflow(void)
{
    PyObject *module = Py_InitModule3("pdfreflow", pdfreflow_methods,
            "Reflow a PDF file");
    if (module == NULL) {
        // initialisation failed; the exception is already set
        return;
    }
}
}
#else
int main(int argc, char **argv) {
char *memblock;
ifstream::pos_type size;
if (argc != 2) {
cerr << "Usage: " << argv[0] << " file.pdf" << endl;
return 1;
}
ifstream file (argv[1], ios::in|ios::binary|ios::ate);
if (file.is_open()) {
size = file.tellg();
memblock = new char[size];
file.seekg (0, ios::beg);
file.read (memblock, size);
file.close();
} else {
cerr << "Unable to read from: " << argv[1] << endl;
return 1;
}
try {
Reflow reflow(memblock, size);
reflow.render();
size_t sz = 0;
char *data = reflow.render_first_page(&sz);
ofstream file("cover.png", ios::binary);
file.write(data, sz);
file.close();
} catch(exception &e) {
cerr << e.what() << endl;
return 1;
}
return 0;
}
#endif

View File

@ -0,0 +1,974 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#include <Object.h>
#include <Outline.h>
#include <PDFDocEncoding.h>
#include <goo/GooList.h>
#include <SplashOutputDev.h>
#include <splash/SplashBitmap.h>
#include <splash/SplashErrorCodes.h>
#include "reflow.h"
#include "utils.h"
using namespace std;
using namespace calibre_reflow;
// Names of the document-information entries that Reflow::get_info looks up,
// and how many of them there are.
static const size_t num_info_keys = 8;
static const char* info_keys[num_info_keys] = {
"Title", "Subject", "Keywords", "Author", "Creator", "Producer",
"CreationDate", "ModDate"
};
//------------------------------------------------------------------------
// XMLString
//------------------------------------------------------------------------
/**
 * Start an empty text string at the current text position of state.
 *
 * The vertical extent (y_min/y_max) is derived from the current font's
 * ascent and descent, clamped to 1.05 and -0.4, scaled by
 * current_font_size. The font (name, size - 1, fill colour) is registered
 * with fonts and its index remembered. The parameter s is not used.
 */
XMLString::XMLString(GfxState *state, GooString *s, double current_font_size,
                Fonts *fonts) :
    text(new vector<Unicode>(0)), x_right(new vector<double>(0)),
    yx_next(NULL), xy_next(NULL), fonts(fonts), font_idx(0), xml_text(NULL),
    link(NULL), x_min(0), x_max(0), y_min(0), y_max(0), col(0), dir(text_dir_unknown)
{
    double dev_x = 0, dev_y = 0;
    state->transform(state->getCurX(), state->getCurY(), &dev_x, &dev_y);

    GfxFont *font = state->getFont();
    if (font == NULL) {
        // this means that the PDF file draws text without a current font,
        // which should never happen
        this->y_min = dev_y - 0.95 * current_font_size;
        this->y_max = dev_y + 0.35 * current_font_size;
    } else {
        double ascent = font->getAscent();
        double descent = font->getDescent();
        // clamp implausible font metrics
        if (ascent > 1.05) ascent = 1.05;
        if (descent < -0.4) descent = -0.4;
        this->y_min = dev_y - ascent * current_font_size;
        this->y_max = dev_y - descent * current_font_size;

        GfxRGB rgb;
        state->getFillRGB(&rgb);
        GooString *name = font->getName();
        string *font_name = NULL;
        if (name) font_name = new string(name->getCString());
        this->font_idx = this->fonts->add_font(font_name, current_font_size-1, rgb);
    }

    if (this->y_min == this->y_max) {
        // this is a sanity check for a case that shouldn't happen -- but
        // if it does happen, we want to avoid dividing by zero later
        this->y_min = dev_y;
        this->y_max = dev_y + 1;
    }
}
/**
 * Append one character to the string.
 *
 * x is the character's left edge and dx its advance; the right edge x + dx
 * becomes the new x_max and is recorded in x_right. The first character
 * also fixes x_min. state, y and dy are not used.
 */
void XMLString::add_char(GfxState *state, double x, double y,
                double dx, double dy, Unicode u) {
    // direction detection is disabled; everything is treated as left-to-right
    if (dir == text_dir_unknown) dir = text_dir_left_right;

    // grow both buffers in steps of 16 once the text buffer is full
    const bool buffer_full = (this->text->size() == this->text->capacity());
    if (buffer_full) {
        this->text->reserve(text->size()+16);
        this->x_right->reserve(x_right->size()+16);
    }

    this->text->push_back(u);
    if (this->length() == 1) this->x_min = x;

    this->x_max = x + dx;
    this->x_right->push_back(x_max);
}
/**
 * Finalise the string: right-to-left text of more than one character is
 * reversed in place; anything else is left as is.
 */
void XMLString::end_string()
{
    if (this->dir != text_dir_right_left) return;
    if (!(this->length() > 1)) return;
    reverse(this->text->begin(), this->text->end());
}
/**
 * Convert num Unicode code points to a string in the global text encoding,
 * escaping '&', '<' and '>' as XML entities. Code points the encoding
 * cannot represent are dropped.
 *
 * Throws ReflowException if no text encoding map is available.
 */
static string encode_unicode_chars(const Unicode *u, size_t num) {
    ostringstream oss;
    UnicodeMap *uMap;
    char buf[10];
    int n;

    if (!(uMap = globalParams->getTextEncoding())) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    for (size_t i = 0; i < num; i++) {
        switch (u[i]) {
            case '&': oss << "&amp;";  break;
            case '<': oss << "&lt;";  break;
            case '>': oss << "&gt;";  break;
            default:
                {
                    // convert unicode to string; one byte of buf is kept
                    // back so the terminator written below always fits
                    if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf) - 1)) > 0) {
                        buf[n] = 0;
                        oss << buf;
                    }
                }
        }
    }

    uMap->decRefCnt();
    return oss.str();
}
void XMLString::encode() {
delete this->xml_text;
this->xml_text = new string(encode_unicode_chars(&((*this->text)[0]), this->text->size()));
}
/**
 * Serialise the string as a <text> element carrying the font index, the
 * top-left corner, width and height (two decimals) and the encoded text.
 * encode() must have been called so that xml_text is set.
 */
string XMLString::str() const {
    ostringstream out;
    out << "<text font=\"" << this->font_idx << "\" ";
    out << setiosflags(ios::fixed) << setprecision(2);
    out << "top=\"" << this->y_min << "\" left=\"" << this->x_min;
    out << "\" width=\"" << this->x_max - this->x_min << "\" ";
    out << "height=\"" << this->y_max - this->y_min << "\">";
    out << *this->xml_text << "</text>";
    return out.str();
}
/**
 * Release the buffers owned by the string: the code points, the per
 * character right edges and the encoded XML text (allocated by encode();
 * deleting NULL is a no-op). The link pointer is not owned and is left
 * alone.
 */
XMLString::~XMLString() {
    delete this->text;
    delete this->x_right;
    delete this->xml_text;
}
//------------------------------------------------------------------------
// XMLPage
//------------------------------------------------------------------------
/**
 * Begin a new page: initialise the string lists and write the opening
 * <page> element (number, width, height) to output.
 *
 * Throws ReflowException if the write fails.
 */
XMLPage::XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts) :
    current_string(NULL), num(num), output(output), current_font_size(0.0),
    yx_strings(NULL), xy_strings(NULL), yx_cur1(NULL), yx_cur2(NULL),
    fonts(fonts), links(new XMLLinks())
{
    ofstream &out = *this->output;
    out << setiosflags(ios::fixed) << setprecision(2);
    out << "\t\t<page number=\"" << this->num;
    out << "\" width=\"" << state->getPageWidth();
    out << "\" height=\"" << state->getPageHeight();
    out << "\">" << endl;
    if (!out) throw ReflowException(strerror(errno));
}
/**
 * Close the <page> element and free every string on the page plus the
 * link list.
 *
 * Cleanup always runs; a failed write is reported afterwards by throwing
 * ReflowException, as before.
 */
XMLPage::~XMLPage() {
    (*this->output) << "\t\t</page>" << endl;
    const bool write_failed = !(*this->output);
    const int saved_errno = errno;  // cleanup below must not disturb it

    // Fetch the successor before deleting a node: reading yx_next from a
    // deleted XMLString is a use-after-free.
    XMLString *cur = this->yx_strings;
    while (cur) {
        XMLString *next = cur->yx_next;
        delete cur;
        cur = next;
    }
    this->yx_strings = NULL;

    delete this->links;
    this->links = NULL;

    if (write_failed) throw ReflowException(strerror(saved_errno));
}
/**
 * Refresh current_font_size from the graphics state.
 *
 * Type 3 fonts need extra care: their base coordinate system is unknown
 * without rendering, so the size is rescaled by the width of the glyph
 * named "m" (if the font has one) and by the font matrix.
 */
void XMLPage::update_font(GfxState *state) {
    current_font_size = state->getTransformedFontSize();

    GfxFont *font = state->getFont();
    if (!font || font->getType() != fontType3) return;

    // This is a hack which makes it possible to deal with some Type 3
    // fonts. It guesses the scale from the width of the character 'm'
    // (which breaks if the font is a subset that doesn't contain 'm').
    Gfx8BitFont *font8 = (Gfx8BitFont *)font;
    int code = 0;
    while (code < 256) {
        char *name = font8->getCharName(code);
        if (name && name[0] == 'm' && name[1] == '\0') break;
        ++code;
    }

    if (code < 256) {
        const double w = font8->getWidth(code);
        // 600 is a generic average 'm' width -- yes, this is a hack
        if (w != 0) current_font_size *= w / 0.6;
    }

    double *fm = font->getFontMatrix();
    if (fm[0] != 0) {
        current_font_size *= fabs(fm[3] / fm[0]);
    }
}
/**
 * Add the Unicode values of one drawn glyph to the current string.
 *
 * Hidden text (render mode with both low bits set) is ignored. When the
 * glyph does not belong to the current string, that string is finished and
 * a new one is begun. The glyph's advance, minus character spacing, is
 * divided evenly between its uLen Unicode values.
 */
void XMLPage::draw_char(GfxState *state, double x, double y,
        double dx, double dy,
        double originX, double originY,
        CharCode code, int nBytes, Unicode *u, int uLen) {
    if ((state->getRender() & 3) == 3) return; //Hidden text

    double dev_x, dev_y;
    state->transform(x, y, &dev_x, &dev_y);

    // check that new character is in the same direction as current string
    // and is not too far away from it before adding
    if (this->current_string->character_does_not_belong_to_string(state, dev_x)) {
        this->end_string();
        this->begin_string(state, NULL);
    }

    // remove the character spacing from the advance
    double spacing_dx, spacing_dy;
    state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
            0, &spacing_dx, &spacing_dy);
    dx -= spacing_dx;
    dy -= spacing_dy;

    double step_w, step_h;
    state->transformDelta(dx, dy, &step_w, &step_h);
    if (uLen != 0) {
        step_w /= uLen;
        step_h /= uLen;
    }

    for (int k = 0; k < uLen; ++k) {
        this->current_string->add_char(state, dev_x + k*step_w,
                dev_y + k*step_h, step_w, step_h, u[k]);
    }
}
// Finish the string currently being built and link it into the page's
// y-major (yx) singly linked list. Empty strings are discarded. In every
// case current_string is NULL afterwards.
void XMLPage::end_string() {
XMLString *p1 = NULL, *p2 = NULL;
double h, y1, y2;
// throw away zero-length strings -- they don't have valid xMin/xMax
// values, and they're useless anyway
if (this->current_string->length() == 0) {
delete this->current_string;
this->current_string = NULL;
return;
}
this->current_string->end_string();
// insert string in y-major list
// y1 and y2 are probe heights at 50% and 80% of the string's height,
// used for the ordering comparisons below.
h = this->current_string->height();
y1 = this->current_string->y_min + 0.5 * h;
y2 = this->current_string->y_min + 0.8 * h;
// The first branch is disabled (constant false); it is the raw-order path.
if (gFalse) { //rawOrder
p1 = this->yx_cur1;
p2 = NULL;
} else if (
// Fast path: the new string sorts after the last inserted string
// (yx_cur1) and before its cached successor (yx_cur2), so it can be
// inserted between them without scanning the list.
(!this->yx_cur1 ||
(y1 >= this->yx_cur1->y_min &&
(y2 >= this->yx_cur1->y_max ||
this->current_string->x_max >= this->yx_cur1->x_min))) &&
(!this->yx_cur2 ||
(y1 < this->yx_cur2->y_min ||
(y2 < this->yx_cur2->y_max &&
this->current_string->x_max < this->yx_cur2->x_min)))
) {
p1 = this->yx_cur1;
p2 = this->yx_cur2;
} else {
// Slow path: walk the list from the head until the first string that
// must come after the new one; p1 trails as its predecessor.
for (p1 = NULL, p2 = this->yx_strings; p2; p1 = p2, p2 = p2->yx_next) {
if (y1 < p2->y_min || (y2 < p2->y_max && this->current_string->x_max < p2->x_min))
break;
}
this->yx_cur2 = p2;
}
// Remember the insertion point for the next call, then splice the string
// in between p1 and p2 (p1 == NULL means it becomes the new head).
this->yx_cur1 = this->current_string;
if (p1)
p1->yx_next = this->current_string;
else
this->yx_strings = this->current_string;
this->current_string->yx_next = p2;
this->current_string = NULL;
}
void XMLPage::end() {
XMLLinks::size_type link_index = 0;
Fonts::size_type pos = 0;
XMLFont* h;
for (XMLString *tmp = this->yx_strings; tmp; tmp = tmp->yx_next) {
pos = tmp->font_idx;
h = this->fonts->at(pos);
tmp->encode();
if (this->links->in_link(
tmp->x_min, tmp->y_min, tmp->x_max, tmp->y_max, link_index)) {
tmp->link = links->at(link_index);
}
}
this->coalesce();
for (XMLString *tmp = yx_strings; tmp; tmp=tmp->yx_next) {
if (tmp->xml_text && tmp->xml_text->size() > 0) {
(*this->output) << "\t\t\t" << tmp->str() << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
}
}
}
/**
 * Return a pointer to the last occurrence of ss in s, or NULL if ss does
 * not occur.
 *
 * An empty ss matches at the end of s (mirroring how strstr treats an
 * empty needle) instead of walking past the terminator.
 */
static const char *strrstr( const char *s, const char *ss )
{
    if (*ss == '\0') return s + strlen(s);

    const char *last = NULL;
    for (const char *hit = strstr(s, ss); hit != NULL; hit = strstr(hit + 1, ss)) {
        last = hit;
    }
    return last;
}
// Append the closing tags (</a>, </em>, </strong>) that are still owed to
// xml_text. Each finish_* flag says whether that tag must be closed. When
// more than one is pending, the position of the last opening tag in the
// text decides which one is closed first. finish_a and finish_italic are
// cleared when their tag is emitted early; the flags are taken by
// reference, so the caller sees the change.
static void close_tags( string *xml_text, bool &finish_a, bool &finish_italic, bool &finish_bold )
{
// Locate the last opening tag of each kind, but only when at least two
// kinds are pending (otherwise ordering is irrelevant and NULL is used).
const char *last_italic = finish_italic && ( finish_bold || finish_a ) ? strrstr( xml_text->c_str(), "<em>" ) : NULL;
const char *last_bold = finish_bold && ( finish_italic || finish_a ) ? strrstr( xml_text->c_str(), "<strong>" ) : NULL;
const char *last_a = finish_a && ( finish_italic || finish_bold ) ? strrstr( xml_text->c_str(), "<a " ) : NULL;
// The link was opened after both other tags: close it first.
if( finish_a && ( finish_italic || finish_bold ) && last_a > ( last_italic > last_bold ? last_italic : last_bold ) ) {
xml_text->append("</a>");
finish_a = false;
}
// <em> was opened after <strong>: close it before </strong>.
if( finish_italic && finish_bold && last_italic > last_bold ){
xml_text->append("</em>");
finish_italic = false;
}
// Whatever is still pending is closed in the fixed order
// </strong>, </em>, </a>.
if( finish_bold )
xml_text->append("</strong>");
if( finish_italic )
xml_text->append("</em>");
if( finish_a )
xml_text->append("</a>");
}
/**
 * Post-process the page's strings (y-major list):
 *
 *  1. drop strings that duplicate a nearby string (fake boldface, drop
 *     shadows);
 *  2. merge neighbouring strings that are on the same line, close enough
 *     horizontally, use fonts that are equal up to inline attributes and
 *     run in the same direction;
 *  3. wrap the text in <a>, <em> and <strong> tags according to each
 *     string's link and font.
 *
 * Every string must already have been encoded (xml_text set).
 */
void XMLPage::coalesce() {
    XMLString *str1, *str2, *str3;
    XMLFont *hfont1, *hfont2;
    double space, hor_space, size, x_limit;
    bool add_space, found;
    int n, i;
    double cur_x, cur_y;

    str1 = this->yx_strings;

    if( !str1 ) return;

    //----- discard duplicated text (fake boldface, drop shadows)

    while (str1)
    {
        size = str1->y_max - str1->y_min;
        x_limit = str1->x_min + size * 0.2;
        found = false;
        for (str2 = str1, str3 = str1->yx_next;
                str3 && str3->x_min < x_limit;
                str2 = str3, str3 = str2->yx_next)
        {
            // Compare the characters themselves. text is a pointer to a
            // vector, so a memcmp on the pointers would compare the vector
            // objects, never match and read out of bounds.
            if (str3->length() == str1->length() &&
                    *str3->text == *str1->text &&
                    fabs(str3->y_min - str1->y_min) < size * 0.2 &&
                    fabs(str3->y_max - str1->y_max) < size * 0.2 &&
                    fabs(str3->x_max - str1->x_max) < size * 0.2)
            {
                found = true;
                break;
            }
        }
        if (found)
        {
            // unlink the duplicate and look for further duplicates of str1
            str2->xy_next = str3->xy_next;
            str2->yx_next = str3->yx_next;
            delete str3;
        }
        else
        {
            str1 = str1->yx_next;
        }
    }

    //----- open the tags of the first string

    str1 = yx_strings;

    hfont1 = this->fonts->at(str1->font_idx);
    if( hfont1->is_bold() )
        str1->xml_text->insert(0, "<strong>");
    if( hfont1->is_italic() )
        str1->xml_text->insert(0, "<em>");
    if (str1->get_link())
        str1->xml_text->insert(0, str1->get_link()->get_link_start());
    // remember the top-left corner of the run being built in str1
    cur_x = str1->x_min; cur_y = str1->y_min;

    //----- merge or separate each pair of neighbours

    while (str1 && (str2 = str1->yx_next)) {
        hfont2 = this->fonts->at(str2->font_idx);
        space = str1->y_max - str1->y_min;
        hor_space = str2->x_min - str1->x_max;

        if (
                (
                 (
                  (str2->y_min < str1->y_max)
                  &&
                  (hor_space > -0.5 * space && hor_space < space)
                 )
                ) &&
                (hfont1->eq_upto_inline(*hfont2)) &&
                str1->dir == str2->dir // text direction the same
           )
        {
            n = str1->length() + str2->length();
            // a visible gap becomes a space character
            if ((add_space = hor_space > 0.1 * space)) {
                ++n;
            }

            // round the capacity up to a multiple of 16
            str1->text->reserve((n + 15) & ~15);
            str1->x_right->reserve((n + 15) & ~15);

            if (add_space) {
                str1->text->push_back(0x20);
                str1->xml_text->push_back(' ');
                str1->x_right->push_back(str2->x_min);
            }

            for (i = 0; i < str2->length(); i++) {
                str1->text->push_back(str2->text->at(i));
                str1->x_right->push_back(str2->x_right->at(i));
            }

            /* fix <i>, <b> if str1 and str2 differ and handle switch of links */
            XMLLink *hlink1 = str1->get_link();
            XMLLink *hlink2 = str2->get_link();
            bool switch_links = !hlink1 || !hlink2 || !((*hlink1) == (*hlink2));
            bool finish_a = switch_links && hlink1 != NULL;
            bool finish_italic = hfont1->is_italic() && ( !hfont2->is_italic() || finish_a );
            bool finish_bold = hfont1->is_bold() &&
                ( !hfont2->is_bold() || finish_a || finish_italic );
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
            if( switch_links && hlink2 != NULL ) {
                string ls = hlink2->get_link_start();
                str1->xml_text->append(ls);
            }
            if( ( !hfont1->is_italic() || finish_italic ) && hfont2->is_italic() )
                str1->xml_text->append("<em>");
            if( ( !hfont1->is_bold() || finish_bold ) && hfont2->is_bold() )
                str1->xml_text->append("<strong>");

            str1->xml_text->append(*str2->xml_text);
            // str1 now contains href for link of str2 (if it is defined)
            str1->link = str2->link;
            hfont1 = hfont2;
            if (str2->x_max > str1->x_max) {
                str1->x_max = str2->x_max;
            }
            if (str2->y_max > str1->y_max) {
                str1->y_max = str2->y_max;
            }
            str1->yx_next = str2->yx_next;
            delete str2;
        } else { // keep strings separate
            bool finish_a = str1->get_link() != NULL;
            bool finish_bold = hfont1->is_bold();
            bool finish_italic = hfont1->is_italic();
            close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );

            str1->x_min = cur_x; str1->y_min = cur_y;
            str1 = str2;
            cur_x = str1->x_min; cur_y = str1->y_min;
            hfont1 = hfont2;
            if ( hfont1->is_bold() )
                str1->xml_text->insert(0, "<strong>");
            if( hfont1->is_italic() )
                str1->xml_text->insert(0, "<em>");
            if( str1->get_link() != NULL ) {
                str1->xml_text->insert(0, str1->get_link()->get_link_start());
            }
        }
    }

    //----- close whatever is still open on the last string

    str1->x_min = cur_x; str1->y_min = cur_y;

    bool finish_bold = hfont1->is_bold();
    bool finish_italic = hfont1->is_italic();
    bool finish_a = str1->get_link() != NULL;
    close_tags( str1->xml_text, finish_a, finish_italic, finish_bold );
}
//------------------------------------------------------------------------
// XMLOutputDev
//------------------------------------------------------------------------
/**
 * Open index.xml (truncating it) and write the opening <pdfreflow> and
 * <pages> elements.
 *
 * Throws ReflowException if the file cannot be opened or written.
 */
XMLOutputDev::XMLOutputDev(PDFDoc *doc) :
    current_page(NULL), output(new ofstream("index.xml", ios::trunc)),
    fonts(new Fonts()), catalog(NULL), images(new XMLImages()), doc(doc)
{
    ofstream &out = *this->output;
    if (!out) throw ReflowException(strerror(errno));

    out << "<pdfreflow>" << endl;
    out << "\t<pages>" << endl;
    if (!out) throw ReflowException(strerror(errno));
}
/**
 * Close the <pages> element, write the <fonts> table and the closing
 * </pdfreflow> tag, then close the file and release the owned objects.
 *
 * Throws ReflowException as soon as a write fails.
 */
XMLOutputDev::~XMLOutputDev() {
    ofstream &out = *this->output;

    out << "\t</pages>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "\t<fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    // one line per registered font, numbered by its position
    Fonts::const_iterator first = this->fonts->begin();
    for (Fonts::const_iterator f = first; f < this->fonts->end(); f++) {
        out << "\t\t" << (*f)->str(f - first) << endl;
        if (!out) throw ReflowException(strerror(errno));
    }

    out << "\t</fonts>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    out << "</pdfreflow>" << endl;
    if (!out) throw ReflowException(strerror(errno));

    this->output->close();
    delete this->output;
    delete this->fonts;
    delete this->images;
}
/**
 * Translate a PDF link action into a destination string.
 *
 *  - GoTo:   "#<page>:l=<left>t=<top>" for a direct or named destination
 *  - GoToR:  "<file>" optionally followed by "#<page>"
 *  - URI:    the URI itself
 *  - Launch: the file name
 *
 * Other action kinds, or actions lacking the needed data, yield an empty
 * string.
 */
static string get_link_dest(LinkAction *link, PDFDoc *doc) {
    unsigned int page = 1;
    ostringstream oss;

    switch(link->getKind())
    {
        case actionGoTo:
            {
                LinkGoTo *ha = (LinkGoTo *)link;
                LinkDest *dest = NULL;
                if (ha->getDest() != NULL)
                    dest = ha->getDest()->copy();
                else if (ha->getNamedDest() != NULL) {
                    dest = doc->findDest(ha->getNamedDest());
                }
                if (dest) {
                    if (dest->isPageRef()) {
                        Ref pageref = dest->getPageRef();
                        page = doc->findPage(pageref.num, pageref.gen);
                    }
                    else {
                        page = dest->getPageNum();
                    }

                    oss << "#" << page
                        << setiosflags(ios::fixed) << setprecision(2)
                        << ":l=" << dest->getLeft()
                        << "t=" << dest->getTop();
                    //<< "r=" << dest->getRight()
                    //<< "b=" << dest->getBottom();
                    delete dest;
                }
                break;
            }

        case actionGoToR:
            {
                LinkGoToR *ha = (LinkGoToR *) link;
                LinkDest *dest = NULL;
                bool has_file = false;
                if (ha->getFileName()) {
                    oss << ha->getFileName()->getCString();
                    has_file = true;
                }
                if (ha->getDest() != NULL) dest = ha->getDest()->copy();
                if (dest) {
                    if (has_file) {
                        if (!(dest->isPageRef())) page = dest->getPageNum();
                        oss << '#' << page;
                    }
                    // the copy is ours whether or not it was used
                    delete dest;
                }
                break;
            }

        case actionURI:
            {
                LinkURI *ha = (LinkURI *) link;
                // a malformed action may carry no URI at all
                if (ha->getURI())
                    oss << ha->getURI()->getCString();
                break;
            }

        case actionLaunch:
            {
                LinkLaunch *ha = (LinkLaunch *) link;
                // likewise, the file name is optional
                if (ha->getFileName())
                    oss << ha->getFileName()->getCString();
                break;
            }

        case actionNamed: break;
        case actionMovie: break;
        case actionRendition: break;
        case actionSound: break;
        case actionJavaScript: break;
        case actionUnknown: break;
    }
    return oss.str();
}
/**
 * Convert one PDF link annotation to device coordinates and, if its action
 * resolves to a non-empty destination, add it to the current page.
 */
void XMLOutputDev::process_link(Link* link){
    double user_x1, user_y1, user_x2, user_y2;
    link->getRect(&user_x1, &user_y1, &user_x2, &user_y2);

    int dev_x1, dev_y1, dev_x2, dev_y2;
    cvtUserToDev(user_x1, user_y1, &dev_x1, &dev_y1);
    cvtUserToDev(user_x2, user_y2, &dev_x2, &dev_y2);

    LinkAction *action = link->getAction();
    if (action == NULL) return;

    string dest = get_link_dest(action, this->doc);
    if (!(dest.length() > 0)) return;

    // the y values are passed swapped (y2 as minimum, y1 as maximum)
    XMLLink *xml_link = new XMLLink((double)dev_x1, (double)dev_y2,
            (double)dev_x2, (double)dev_y1, dest.c_str());
    this->current_page->add_link(xml_link);
}
void XMLOutputDev::endPage() {
Links *slinks = catalog->getPage(current_page->number())->getLinks(catalog);
for (int i = 0; i < slinks->getNumLinks(); i++)
{
this->process_link(slinks->getLink(i));
}
delete slinks;
this->current_page->end();
vector<string*> images = this->images->str();
for (vector<string*>::iterator it = images.begin(); it < images.end(); it++) {
(*this->output) << "\t\t\t" << *(*it) << endl;
if (!(*this->output)) throw ReflowException(strerror(errno));
delete *it;
}
this->images->clear();
delete this->current_page;
this->current_page = NULL;
}
/**
 * Image masks are not extracted yet: the call is forwarded to the base
 * class and a notice is printed to stderr.
 */
void XMLOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
        int width, int height, GBool invert,
        GBool interpolate, GBool inlineImg) {
    // let the base implementation handle the stream
    OutputDev::drawImageMask(state, ref, str, width, height,
            invert, interpolate, inlineImg);

    //this->images->add_mask();
    cerr << "mask requested" << endl;
}
/**
 * Hand an image over to the page's image collection, which writes it to
 * disk. Width and height are converted to unsigned values.
 */
void XMLOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
        int width, int height, GfxImageColorMap *colorMap,
        GBool interpolate, int *maskColors, GBool inlineImg) {
    const unsigned int image_width = static_cast<unsigned int>(width);
    const unsigned int image_height = static_cast<unsigned int>(height);
    this->images->add(state, ref, str, image_width, image_height,
            colorMap, interpolate, maskColors, inlineImg);
}
// Placeholder file name reported for the in-memory PDF.
static char stream_pdf[15] = "stream.pdf";
// A MemStream that also answers getFileName(), so an in-memory buffer can
// be handed to code that asks the stream for a name.
class MemInStream : public MemStream {
public:
// All arguments are forwarded unchanged to MemStream.
MemInStream(char *buf, size_t st, size_t sz, Object *obj) :
MemStream(buf, st, sz, obj) {}
~MemInStream() {}
// Returns a newly allocated GooString on every call.
// NOTE(review): ownership of the returned object is not visible here --
// confirm that callers delete it.
GooString *getFileName() { return new GooString(stream_pdf); }
};
/**
 * Open a PDF document held in memory.
 *
 * pdfdata/sz describe the buffer, which is wrapped in a stream and passed
 * to a new PDFDoc. The global parameters object is created on first use.
 *
 * Throws ReflowException if the document cannot be opened; the message
 * carries the error code reported by the document.
 */
Reflow::Reflow(char *pdfdata, size_t sz) :
    pdfdata(pdfdata), current_font_size(-1), doc(NULL)
{
    Object obj;
    obj.initNull();

    if (globalParams == NULL) {
        globalParams = new GlobalParams();
        if (!globalParams)
            throw ReflowException("Failed to allocate Globalparams");
    }

    MemInStream *stream = new MemInStream(pdfdata, 0, sz, &obj);
    this->doc = new PDFDoc(stream, NULL, NULL);
    if (this->doc->isOk()) return;

    // opening failed: build the message, drop the document, report
    ostringstream message;
    message << "Failed to open PDF file";
    message << " with error code: " << doc->getErrorCode();
    delete this->doc;
    this->doc = NULL;
    throw ReflowException(message.str().c_str());
}
/**
 * Render every page of the document through an XMLOutputDev (which writes
 * index.xml) at 96 DPI, then dump the outline.
 *
 * Throws ReflowException if the document is encrypted. A set copy
 * protection flag only produces a warning on stdout.
 */
void
Reflow::render() {
    if (this->doc->isEncrypted()) {
        throw ReflowException("Document is encrypted.");
    }
    if (!this->doc->okToCopy()) {
        cout << "Warning, this document has the copy protection flag set, ignoring." << endl;
    }

    char encoding[10] = "UTF-8";
    globalParams->setTextEncoding(encoding);

    const int first_page = 1;
    const int last_page = doc->getNumPages();
    const int dpi = 96;  // used for both the horizontal and vertical DPI

    XMLOutputDev *xml_out = new XMLOutputDev(this->doc);
    doc->displayPages(xml_out, first_page, last_page,
            dpi,   //hDPI
            dpi,   //vDPI
            0,     //rotate
            true,  //UseMediaBox
            true,  //Crop
            false  //Printing
            );

    this->dump_outline();

    delete xml_out;
}
/*
 * Write the document outline to the file outline.xml in the current
 * directory. Nothing is written when the document has no outline or the
 * outline has no items.
 *
 * Throws ReflowException when the file cannot be written.
 */
void Reflow::dump_outline() {
    Outline *outline = this->doc->getOutline();
    if (!outline) return;
    GooList *items = outline->getItems();
    if (!items || items->getLength() < 1)
        return;

    // Kept on the stack so it is released even when an exception is thrown.
    ostringstream output;
    output << "<outline>" << endl;
    this->outline_level(&output, items);
    output << "</outline>" << endl;

    ofstream of("outline.xml", ios::trunc);
    of << output.str();
    // Close before checking so that a failed flush is reported as well.
    of.close();
    if (!of) throw ReflowException("Error writing outline file");
}
/* Append `level` tab characters to *o (nothing when level <= 0). */
static inline void outline_tabs(ostringstream *o, int level) {
    int remaining = level;
    while (remaining > 0) {
        (*o) << "\t";
        --remaining;
    }
}
/*
 * Serialize one level of the outline into *oss.
 *
 * Emits a <links level="N"> element holding one <link> per item that has
 * a non-empty title, and recurses into the children of each such item
 * with level+1. Items with an empty title are skipped together with
 * their children. Nothing is written for an empty list.
 */
void Reflow::outline_level(ostringstream *oss, GooList *items, int level)
{
    const int num_of_items = items->getLength();
    if (num_of_items <= 0) return;

    outline_tabs(oss, level);
    (*oss) << "<links level=\"" << level << "\">" << endl;

    for (int i = 0; i < num_of_items; i++) {
        OutlineItem* item = (OutlineItem *)items->get(i);
        Unicode *u = item->getTitle();
        string title = encode_unicode_chars(u, item->getTitleLength());
        if (title.size() < 1) continue;

        outline_tabs(oss, level+1);
        (*oss) << "<link open=\"" << (item->isOpen()?"yes":"no") << "\"";
        LinkAction *a = item->getAction();
        if (a != NULL)
            (*oss) << " dest=\"" << get_link_dest(a, this->doc) << "\"";
        (*oss) << ">" << title << "</link>" << endl;

        // open() is called before getKids(), as in the original code.
        item->open();
        GooList *children = item->getKids();
        if (children)
            outline_level(oss, children, level+1);
    }

    // Close the element opened above so that the output is well-formed.
    outline_tabs(oss, level);
    (*oss) << "</links>" << endl;
}
/* Release the poppler document owned by this object. */
Reflow::~Reflow() {
    PDFDoc *owned = this->doc;
    delete owned;
}
/*
 * Return the entries of the PDF Info dictionary as UTF-8 strings.
 *
 * Only the keys listed in info_keys are looked up; keys whose decoded
 * value is empty are left out. The map is empty when the document has no
 * Info dictionary.
 */
map<string, string> Reflow::get_info() {
    Object info;
    map<string, string> ans;
    char encoding[10] = "UTF-8";
    globalParams->setTextEncoding(encoding);

    this->doc->getDocInfo(&info);
    try {
        if (info.isDict()) {
            for (size_t i = 0; i < num_info_keys; i++) {
                const string val = this->decode_info_string(info.getDict(), info_keys[i]);
                if (!val.empty())
                    ans[string(info_keys[i])] = val;
            }
        }
    } catch (...) {
        info.free();
        throw;
    }
    // Release the reference obtained from getDocInfo().
    info.free();
    return ans;
}
/*
 * Look up `key` in the Info dictionary and return its value converted
 * with the current text encoding. Returns an empty string when the key
 * is missing or its value is not a string.
 *
 * Values starting with the bytes 0xFE 0xFF are read as big-endian 16-bit
 * code units; all other values are mapped through pdfDocEncoding.
 *
 * Throws ReflowException when no text encoding map is available.
 */
string Reflow::decode_info_string(Dict *info, const char *key) const {
    Object obj;
    GooString *s1;
    bool is_unicode;
    Unicode u;
    char buf[8];
    int i, n, len;
    ostringstream oss;

    // NOTE(review): poppler's own tools release the map returned by
    // getTextEncoding() with decRefCnt(); confirm whether that is needed
    // for the poppler version this is built against.
    UnicodeMap *umap = globalParams->getTextEncoding();
    if (!umap) {
        throw ReflowException("Failed to allocate unicode map.");
    }

    // lookup() is given a writable copy of the key. The vector frees the
    // copy on every exit path.
    vector<char> tmp(key, key + strlen(key) + 1);

    if (info->lookup(&tmp[0], &obj)->isString()) {
        s1 = obj.getString();
        len = s1->getLength();
        // Only inspect the byte order mark when the string is long
        // enough to contain one.
        if (len >= 2 &&
                (s1->getChar(0) & 0xff) == 0xfe &&
                (s1->getChar(1) & 0xff) == 0xff) {
            is_unicode = true;
            i = 2;
        } else {
            is_unicode = false;
            i = 0;
        }
        while (i < len) {
            if (is_unicode) {
                if (i + 1 >= len) break; // dangling odd byte, nothing to decode
                u = ((s1->getChar(i) & 0xff) << 8) |
                    (s1->getChar(i+1) & 0xff);
                i += 2;
            } else {
                u = pdfDocEncoding[s1->getChar(i) & 0xff];
                ++i;
            }
            // Leave room for the terminating NUL written below.
            n = umap->mapUnicode(u, buf, sizeof(buf) - 1);
            buf[n] = 0;
            oss << buf;
        }
    }
    obj.free();
    return oss.str();
}
/*
 * Render page 1 of the document to a PNG image.
 *
 * data_size     receives the number of bytes in the returned buffer
 * use_crop_box  use the crop box for the page size, otherwise the media box
 * x_res, y_res  horizontal and vertical resolution in DPI
 *
 * Returns a buffer allocated with new[]; the caller must delete[] it.
 * Throws ReflowException when the document is locked or on I/O errors.
 * All intermediate resources are released on every exit path.
 */
char* Reflow::render_first_page(size_t *data_size,
        bool use_crop_box, double x_res,
        double y_res) {
    if (this->is_locked()) throw ReflowException("Document is locked.");

    char encoding[10] = "UTF-8";
    char yes[10] = "yes";
    globalParams->setTextEncoding(encoding);
    globalParams->setEnableFreeType(yes);
    globalParams->setAntialias(yes);
    globalParams->setVectorAntialias(yes);

    SplashColor paper_color;
    paper_color[0] = 255;
    paper_color[1] = 255;
    paper_color[2] = 255;

    // Plain new throws on failure, so no NULL check is needed.
    SplashOutputDev *out = new SplashOutputDev(splashModeRGB8, 4, false, paper_color);
    FILE *f = NULL;
    PNGWriter *writer = NULL;
    char *buffer = NULL;

    try {
        out->startDoc(doc->getXRef());

        double pg_w, pg_h;
        int pg = 1;
        if (use_crop_box) {
            pg_w = this->doc->getPageCropWidth(pg);
            pg_h = this->doc->getPageCropHeight(pg);
        } else {
            pg_w = this->doc->getPageMediaWidth(pg);
            pg_h = this->doc->getPageMediaHeight(pg);
        }
        // Page sizes are scaled by resolution/72; each axis uses its own
        // resolution.
        pg_w *= x_res/72.;
        pg_h *= y_res/72.;

        int x=0, y=0;
        this->doc->displayPageSlice(out, pg, x_res, y_res, 0,
                !use_crop_box, false, false, x, y, pg_w, pg_h);

        f = tmpfile();
        if (!f) throw ReflowException(strerror(errno));

        SplashBitmap *bmp = out->getBitmap();
        writer = new PNGWriter();
        writer->init(f, bmp->getWidth(), bmp->getHeight());
        writer->write_splash_bitmap(bmp);
        writer->close();
        delete writer;
        writer = NULL;

        long size = ftell(f);
        if (size < 0) throw ReflowException(strerror(errno));
        rewind(f);
        buffer = new char[size];
        *data_size = fread(buffer, 1, size, f);
        if (*data_size != (size_t)size) {
            throw ReflowException("I/O error reading from tmpfile");
        }
    } catch (...) {
        delete[] buffer;
        delete writer;
        if (f) fclose(f);
        delete out;
        throw;
    }

    fclose(f);   // also removes the temporary file
    delete out;
    return buffer;
}
/*
 * An OutStream that collects everything written to it in an in-memory
 * string stream.
 */
class MemOutStream : public OutStream {
    private:
        ostringstream out;

    public:
        MemOutStream() :OutStream() {}
        ~MemOutStream() {}

        void close() {}

        // Current write position of the in-memory stream.
        int getPos() { return out.tellp(); }

        void put(char c) { out.put(c); }

        /*
         * printf-style formatted write. The scratch buffer is grown until
         * the formatted text fits; the text is dropped if it would need
         * more than MAX_FORMATTED bytes or formatting keeps failing.
         */
        void printf (const char *format, ...) {
            static const size_t MAX_FORMATTED = 1 << 24;
            // resize (not reserve) so that &buf[0] refers to real storage.
            vector<char> buf(strlen(format)*5 + 20);
            va_list ap;
            int written;

            for (;;) {
                va_start(ap, format);
                written = vsnprintf(&buf[0], buf.size(), format, ap);
                va_end(ap);
                if (written >= 0 && static_cast<size_t>(written) < buf.size())
                    break;
                // A non-negative result is the size needed; a negative one
                // is treated as "buffer too small" and the size is doubled.
                size_t wanted = (written >= 0) ?
                    static_cast<size_t>(written) + 1 : buf.size() * 2;
                if (wanted > MAX_FORMATTED) return;
                buf.resize(wanted);
            }
            out.write(&buf[0], written);
        }
};
/*
 * Store the key/value pairs of sinfo as strings in the document's Info
 * dictionary and save the document.
 *
 * The header declares this function as "Currently broken": the document
 * is always saved to the fixed path "/t/out.pdf" and the returned string
 * is always empty.
 *
 * Throws ReflowException when the document has no XRef table, no trailer
 * dictionary, or an Info entry that is not a dictionary.
 */
string Reflow::set_info(map<char *, char *> sinfo) {
XRef *xref = this->doc->getXRef();
if (!xref) throw ReflowException("No XRef table");
Object *trailer_dict = xref->getTrailerDict();
if (!trailer_dict || !trailer_dict->isDict()) throw ReflowException("No trailer dictionary");
Object tmp;
char INFO[5] = "Info";
Object *info = trailer_dict->dictLookup(INFO, &tmp);
// NOTE(review): presumably dictLookup returns its second argument, in
// which case this branch never runs -- confirm against the poppler API.
if (!info) {
info = new Object();
info->initDict(xref);
}
if (!info->isDict()) throw ReflowException("Invalid info object");
for (map<char *, char *>::iterator it = sinfo.begin(); it != sinfo.end(); it++) {
// This inner tmp shadows the Object named tmp declared above.
// NOTE(review): nothing in this function deletes these Objects or the
// GooStrings; confirm whether dictSet takes ownership.
Object *tmp = new Object();
tmp->initString(new GooString((*it).second));
info->dictSet((*it).first, tmp);
}
trailer_dict->dictSet(INFO, info);
// Hard-coded output path.
char out[20] = "/t/out.pdf";
this->doc->saveAs(new GooString(out), writeForceRewrite);
// Always returns an empty string.
string ans;
return ans;
}

View File

@ -0,0 +1,241 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
* Based on pdftohtml from the poppler project.
*/
#ifndef CALIBRE_REFLOW
#define CALIBRE_REFLOW
#define UNICODE
#include <PDFDoc.h>
#include <GlobalParams.h>
#include <GfxState.h>
#include <GfxFont.h>
#include <OutputDev.h>
#include <Link.h>
#include <UnicodeMap.h>
#include <cmath>
#include <exception>
#include <string>
#include <sstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <map>
#include <errno.h>
#include "fonts.h"
#include "links.h"
#include "images.h"
using namespace std;
namespace calibre_reflow {
// Writing direction of a run of text.
enum UnicodeTextDirection {
text_dir_unknown,
text_dir_left_right,
text_dir_right_left,
text_dir_top_bottom
};
/*
 * Wraps a PDF document that is loaded from a memory buffer and offers
 * conversion to XML, metadata access and rendering of the first page.
 *
 * NOTE(review): the destructor is declared here but no copy constructor
 * or assignment operator is; if the destructor deletes doc, copying a
 * Reflow would be unsafe -- confirm that instances are never copied.
 */
class Reflow {
private:
char *pdfdata;
double current_font_size;
PDFDoc *doc;
/* Look up key in the info dictionary and return it as a string */
string decode_info_string(Dict *info, const char *key) const;
/* Write one level of outline items to oss; level defaults to 1 */
void outline_level(ostringstream *oss, GooList *items,
int level=1);
public:
/* Load the PDF contained in the buffer xpdfdata of sz bytes */
Reflow (char *xpdfdata, size_t sz);
~Reflow();
/* Convert the PDF to XML. All files are output to the current directory */
void render();
/* Get the PDF Info Dictionary */
map<string, string> get_info();
/* True if no document is loaded or the PDF is encrypted */
bool is_locked() const { return !this->doc || this->doc->isEncrypted(); }
/* Return the first page of the PDF, rendered as a PNG image. The number
 * of bytes in the returned buffer is stored in *data_size. */
char* render_first_page(size_t *data_size,
bool use_crop_box=true, double x_res=150.0,
double y_res = 150.0);
/* Dump the PDF outline as the file outline.xml in the current directory */
void dump_outline();
/* Set the info dictionary. Currently broken. */
string set_info(map<char *, char *> info);
};
/*
 * A run of characters on a page together with its bounding box, font and
 * optional link. Instances are chained through yx_next / xy_next; XMLPage
 * is a friend and accesses the private members directly.
 */
class XMLString {
private:
vector<Unicode> *text; // the text
vector<double> *x_right; // right-hand x coord of each char
XMLString *yx_next; // next string in y-major order
XMLString *xy_next; // next string in x-major order
Fonts *fonts;
Fonts::size_type font_idx;
string *xml_text;
XMLLink *link;
double x_min, x_max; // bounding box x coordinates
double y_min, y_max; // bounding box y coordinates
int col; // starting column
UnicodeTextDirection dir; // direction (left to right/right to left)
friend class XMLPage;
public:
XMLString(GfxState *state, GooString *s, double current_font_size, Fonts *fonts);
~XMLString();
// True when the string is non-empty and x1 is further than a tenth of
// the string height away from the right edge of the last character.
// The state argument is not used.
bool character_does_not_belong_to_string(GfxState *state, double x1) {
return this->length() > 0 &&
fabs(x1 - x_right->at(this->length()-1)) > 0.1 * (y_max - y_min);
}
void add_char(GfxState *state, double x, double y,
double dx, double dy, Unicode u);
void end_string();
// Number of characters stored in text.
inline int length() const { return this->text->size(); }
// Height of the bounding box.
inline double height() const { return y_max - y_min; }
void encode();
XMLLink* get_link() { return this->link; }
string str() const;
};
/*
 * Collects the strings and links of a single page. The output stream and
 * font table are passed in by the creator.
 */
class XMLPage {
private:
XMLString *current_string;
unsigned int num;
ofstream *output;
double current_font_size;
XMLString *yx_strings; // strings in y-major order
XMLString *xy_strings; // strings in x-major order
XMLString *yx_cur1, *yx_cur2; // cursors for yxStrings list
Fonts *fonts;
XMLLinks *links;
void coalesce();
public:
XMLPage(unsigned int num, GfxState *state, ofstream *output, Fonts* fonts);
~XMLPage();
void update_font(GfxState *state);
// Start a new string; it becomes current_string.
// NOTE(review): any previous current_string is overwritten here without
// being deleted -- presumably end_string() hands it off first; confirm.
void begin_string(GfxState *state, GooString *s) {
this->current_string = new XMLString(state, s,
this->current_font_size, this->fonts);
}
void draw_char(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen);
void end_string();
void end();
// Append a link to this page's link list.
void add_link(XMLLink *t) { this->links->push_back(t); }
// Page number given to the constructor.
unsigned int number() const { return this->num; }
};
/*
 * OutputDev implementation that forwards text drawing callbacks to the
 * XMLPage currently being built and image callbacks to the image
 * collection.
 */
class XMLOutputDev : public OutputDev {
public:
XMLOutputDev(PDFDoc *doc);
virtual ~XMLOutputDev();
//---- get info about output device
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gTrue; }
// Does this device use drawChar() or drawString()?
virtual GBool useDrawChar() { return gTrue; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
virtual GBool needNonText() { return gTrue; }
//----- initialization and control
// Remembers the catalog and always answers gTrue; all other arguments
// are ignored.
virtual GBool checkPageSlice(Page *page, double hDPI, double vDPI,
int rotate, GBool useMediaBox, GBool crop,
int sliceX, int sliceY, int sliceW, int sliceH,
GBool printing, Catalog * catalogA,
GBool (* abortCheckCbk)(void *data) = NULL,
void * abortCheckCbkData = NULL)
{
this->catalog = catalogA;
return gTrue;
}
// Start a page.
virtual void startPage(int page_num, GfxState *state) {
this->current_page = new XMLPage(page_num, state, this->output, this->fonts);
}
// End a page.
virtual void endPage();
//----- update text state
virtual void updateFont(GfxState *state) {current_page->update_font(state);}
//----- text drawing
virtual void beginString(GfxState *state, GooString *s) {
this->current_page->begin_string(state, s);
}
virtual void endString(GfxState *state) {
this->current_page->end_string();
}
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, int nBytes, Unicode *u, int uLen) {
this->current_page->draw_char(state, x, y, dx, dy, originX,
originY, code, nBytes, u, uLen);
}
//----- image drawing
virtual void drawImageMask(GfxState *state, Object *ref,
Stream *str,
int width, int height, GBool invert,
GBool interpolate, GBool inlineImg);
virtual void drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
GBool interpolate, int *maskColors, GBool inlineImg);
//new feature
// Always returns the constant 1234.
virtual int DevType() {return 1234;}
private:
XMLPage *current_page; // page being built, set by startPage()
ofstream *output; // xml file
Fonts *fonts;
Catalog *catalog; // set by checkPageSlice()
XMLImages *images;
PDFDoc *doc;
void process_link(Link* link);
};
}
#endif

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from copy import deepcopy
from lxml import etree
class Font(object):
    '''
    A font specification read from a ``fontspec`` element.

    ``spec`` only needs a ``get(name)`` method. The ``size`` attribute is
    converted to a float; ``id``, ``color`` and ``family`` are stored as
    returned by ``get``.
    '''

    def __init__(self, spec):
        read = spec.get
        self.id, self.color, self.family = read('id'), read('color'), read('family')
        self.size = float(read('size'))
class Text(object):
    '''
    A positioned run of text taken from a ``text`` element.

    Stores the element's geometry and font, rewrites links that point into
    ``index.*`` so they target ``#page<fragment>``, and builds a ``span``
    element (``self.text``) carrying a CSS class index. The CSS description
    is appended to ``classes`` when it is not already present.
    '''

    A = etree.XPath('descendant::a[@href]')

    def __init__(self, text, font_map, classes, opts, log):
        self.opts = opts
        self.log = log
        self.font_map = font_map

        geometry = [float(text.get(attr))
                    for attr in ('top', 'left', 'width', 'height')]
        self.top, self.left, self.width, self.height = geometry

        font = self.font_map[text.get('font')]
        self.font = font
        self.font_size = font.size
        self.color = font.color
        self.font_family = font.family

        # Links are rewritten on the source element before it is copied.
        for anchor in self.A(text):
            target = anchor.get('href')
            if target.startswith('index.'):
                target = target.split('#')[-1]
            anchor.set('href', '#page'+target)

        span = etree.Element('span')
        self.text = span
        css = {'font_size':'%.1fpt'%self.font_size, 'color': self.color}
        if css not in classes:
            classes.append(css)
        span.set('class', 't%d'%classes.index(css))

        if text.text:
            span.text = text.text
        for child in text:
            span.append(deepcopy(child))
class Page(object):
    '''
    One page of the reflowed document.

    Reads the page number and geometry from the ``page`` element and
    creates a ``Text`` object for every descendant ``text`` element.
    '''

    def __init__(self, page, font_map, classes, opts, log):
        self.opts = opts
        self.log = log
        self.font_map = font_map
        self.number = int(page.get('number'))

        geometry = [float(page.get(attr))
                    for attr in ('top', 'left', 'width', 'height')]
        self.top, self.left, self.width, self.height = geometry

        self.id = 'page%d'%self.number
        self.texts = []
        for elem in page.xpath('descendant::text'):
            text = Text(elem, self.font_map, classes, self.opts, self.log)
            self.texts.append(text)
class PDFDocument(object):
    '''
    In-memory model of the XML produced by the reflow plugin.

    Parses ``xml`` with a recovering parser, collects every ``fontspec``
    into ``fonts`` / ``font_map`` (keyed by font id) and every ``page``
    into ``pages`` / ``page_map`` (keyed by page id). ``classes`` is
    shared with all pages and filled while they are built.
    '''

    def __init__(self, xml, opts, log):
        self.opts = opts
        self.log = log

        self.root = etree.fromstring(xml, parser=etree.XMLParser(recover=True))

        self.fonts = []
        self.font_map = {}
        for fontspec in self.root.xpath('//fontspec'):
            font = Font(fontspec)
            self.fonts.append(font)
            self.font_map[font.id] = font

        self.pages = []
        self.page_map = {}
        self.classes = []
        for elem in self.root.xpath('//page'):
            built = Page(elem, self.font_map, self.classes, opts, log)
            self.page_map[built.id] = built
            self.pages.append(built)
def run(opts, pathtopdf, log):
    '''
    Reflow the PDF at ``pathtopdf`` with the pdfreflow plugin and read the
    ``index.xml`` file from the current directory afterwards.

    Raises RuntimeError when the plugin could not be loaded.
    '''
    from calibre.constants import plugins
    pdfreflow, err = plugins['pdfreflow']
    if pdfreflow is None:
        raise RuntimeError('Failed to load PDF Reflow plugin: '+err)
    # Use context managers so the file handles are closed promptly.
    with open(pathtopdf, 'rb') as f:
        data = f.read()
    pdfreflow.reflow(data)
    index = os.path.join(os.getcwdu(), 'index.xml')
    with open(index, 'rb') as f:
        xml = f.read()
    # The parsed document model is not built yet:
    #pdfdoc = PDFDocument(xml, opts, log)
def option_parser():
    '''Build the command line parser; ``-v``/``--verbose`` may be repeated.'''
    import optparse
    parser = optparse.OptionParser()
    parser.add_option('-v', '--verbose', action='count', default=0)
    return parser
def main(args=sys.argv):
    '''
    Command line entry point.

    Expects the path of the input PDF as the first positional argument.
    Returns 0 after running the conversion and 1 when no input file was
    given (after printing the help text).
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    from calibre.utils.logging import default_log
    if len(args) >= 2:
        run(opts, args[1], default_log)
        return 0
    parser.print_help()
    default_log('No input PDF file specified', file=sys.stderr)
    return 1

View File

@ -0,0 +1,48 @@
/**
* Copyright 2009 Kovid Goyal <kovid@kovidgoyal.net>
* License: GNU GPL v3
*/
#ifndef _CALIBRE_REFLOW_UTILS
#define _CALIBRE_REFLOW_UTILS
#include <string>
#include <sstream>
using namespace std;
namespace calibre_reflow {
/*
 * Exception type used throughout the reflow code.
 *
 * The message is copied on construction, so the exception stays valid
 * even when it was built from a temporary buffer such as
 * stream.str().c_str(). A NULL message is stored as an empty string.
 */
class ReflowException : public exception {
    string msg; // owned copy of the message text
    public:
        ReflowException(const char *m) : msg(m ? m : "") {}
        // Declared explicitly so the exception specification matches the
        // one on std::exception's destructor.
        virtual ~ReflowException() throw() {}
        virtual const char* what() const throw() { return msg.c_str(); }
};
/*
 * Return a copy of sSrc in which the characters &, <, > and " are
 * replaced by their XML entities. All other bytes are copied unchanged.
 */
inline string encode_for_xml(const string &sSrc )
{
    string escaped;
    escaped.reserve(sSrc.size());
    for (string::size_type pos = 0; pos < sSrc.size(); ++pos) {
        const char ch = sSrc[pos];
        if (ch == '&')
            escaped += "&amp;";
        else if (ch == '<')
            escaped += "&lt;";
        else if (ch == '>')
            escaped += "&gt;";
        else if (ch == '"')
            escaped += "&quot;";
        else
            escaped += ch;
    }
    return escaped;
}
}
#endif

View File

@ -40,12 +40,12 @@
<string>...</string> <string>...</string>
</property> </property>
<property name="icon"> <property name="icon">
<iconset resource="../../../../resources/images.qrc"> <iconset resource="../../../work/calibre/resources/images.qrc">
<normaloff>:/images/document_open.svg</normaloff>:/images/document_open.svg</iconset> <normaloff>:/images/document_open.svg</normaloff>:/images/document_open.svg</iconset>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0"> <item row="3" column="0">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -64,15 +64,25 @@
<string>...</string> <string>...</string>
</property> </property>
<property name="icon"> <property name="icon">
<iconset resource="../../../../resources/images.qrc"> <iconset resource="../../../work/calibre/resources/images.qrc">
<normaloff>:/images/clear_left.svg</normaloff>:/images/clear_left.svg</iconset> <normaloff>:/images/clear_left.svg</normaloff>:/images/clear_left.svg</iconset>
</property> </property>
</widget> </widget>
</item> </item>
<item row="2" column="0">
<widget class="QLabel" name="label_2">
<property name="text">
<string>The debug process outputs the intermediate HTML generated at various stages of the conversion process. This HTML can sometimes serve as a good starting point for hand editing a conversion.</string>
</property>
<property name="wordWrap">
<bool>true</bool>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<resources> <resources>
<include location="../../../../resources/images.qrc"/> <include location="../../../work/calibre/resources/images.qrc"/>
</resources> </resources>
<connections/> <connections/>
</ui> </ui>

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.constants import plugins
from calibre.ebooks.metadata import MetaInformation, string_to_authors
poppler, poppler_err = plugins['calibre_poppler']
class NotAvailable(Exception):
    '''Raised when the poppler plugin could not be loaded.'''
def get_metadata(stream, cover=True):
    '''
    Read metadata from the PDF in ``stream`` using the poppler plugin.

    When the document has no usable title, the file name of the stream
    (without extension) is used if the stream has a ``name``, otherwise a
    translated "Unknown". When ``cover`` is true and Qt may be used, the
    first page is rendered and stored as the cover.

    Raises NotAvailable when the poppler plugin failed to load.
    '''
    if not poppler:
        raise NotAvailable('Failed to load poppler with error: '+poppler_err)
    raw = stream.read()
    doc = poppler.PDFDoc()
    doc.load(raw)
    del raw
    title = doc.title
    if not title or not title.strip():
        title = _('Unknown')
        if hasattr(stream, 'name'):
            title = os.path.splitext(os.path.basename(stream.name))[0]
    author = doc.author
    authors = string_to_authors(author) if author else [_('Unknown')]
    creator = doc.creator
    mi = MetaInformation(title, authors)
    if creator:
        mi.book_producer = creator
    if doc.subject:
        mi.category = doc.subject
    if doc.keywords:
        # Drop the empty entries produced by stray or trailing commas.
        tags = [x.strip() for x in doc.keywords.split(',')]
        mi.tags = [x for x in tags if x]

    if cover:
        from calibre.gui2 import is_ok_to_use_qt
        cdata = None
        if is_ok_to_use_qt():
            try:
                cdata = doc.render_page(0)
            except Exception:
                # Cover rendering is best effort; report the failure and
                # carry on. KeyboardInterrupt and SystemExit propagate.
                import traceback
                traceback.print_exc()
        if cdata is not None:
            mi.cover_data = ('jpg', cdata)
    del doc
    return mi

View File

@ -1,329 +0,0 @@
#define UNICODE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <poppler-qt4.h>
#include <QtCore/QBuffer>
#include <QtGui/QImage>
/* Instance layout of the Python PDFDoc type. */
typedef struct {
PyObject_HEAD
/* Type-specific fields go here. */
/* The wrapped document; NULL until a document has been loaded. */
Poppler::Document *doc;
} poppler_PDFDoc;
extern "C" {

/* tp_dealloc: delete the wrapped document and free the Python object. */
static void
poppler_PDFDoc_dealloc(poppler_PDFDoc* self)
{
    delete self->doc;   // deleting NULL is a no-op
    self->doc = NULL;
    self->ob_type->tp_free((PyObject*)self);
}

/* tp_new: allocate the object with no document attached. */
static PyObject *
poppler_PDFDoc_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    poppler_PDFDoc *self = (poppler_PDFDoc *)type->tp_alloc(type, 0);
    if (self != NULL) {
        self->doc = NULL;
    }
    return (PyObject *)self;
}

/*
 * load(data): load a PDF document from a byte string.
 *
 * Any document loaded earlier is deleted first. Raises ValueError when
 * the data cannot be loaded.
 */
static PyObject *
poppler_PDFDoc_load(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    char *buffer; Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "s#", &buffer, &size)) return NULL;

    // Deep copy: the bytes belong to the Python string, which may be
    // freed by the caller while the document is still in use.
    QByteArray data(buffer, static_cast<int>(size));

    delete self->doc;
    self->doc = Poppler::Document::loadFromData(data);
    if (self->doc == NULL) {PyErr_SetString(PyExc_ValueError, "Could not load PDF file from data."); return NULL;}

    Py_RETURN_NONE;
}

}
/*
 * Convert a Python unicode object to a QString.
 *
 * py must be a unicode object. On failure a null QString is returned and
 * the Python exception raised by the encoder is left in place.
 */
static QString
poppler_convert_pystring(PyObject *py) {
    QString ans;
    Py_UNICODE* u = PyUnicode_AS_UNICODE(py);
    PyObject *u8 = PyUnicode_EncodeUTF8(u, PyUnicode_GET_SIZE(py), "replace");
    // Keep the exception set by the encoder instead of replacing it.
    if (u8 == NULL) return ans;
    // Pass the length explicitly so embedded NUL characters are kept.
    ans = QString::fromUtf8(PyString_AS_STRING(u8),
            static_cast<int>(PyString_GET_SIZE(u8)));
    Py_DECREF(u8);
    return ans;
}
extern "C" {
static PyObject *
poppler_convert_qstring(const QString &src) {
QByteArray data = src.toUtf8();
const char *cdata = data.constData();
int sz = data.size();
return PyUnicode_Decode(cdata, sz, "utf-8", "error");
}
static PyObject *
poppler_PDFDoc_open(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
PyObject *fname; QString _fname;
if (!PyArg_ParseTuple(args, "O", &fname)) return NULL;
_fname = poppler_convert_pystring(fname);
self->doc = Poppler::Document::load(_fname);
Py_RETURN_NONE;
}
static PyObject *
poppler_PDFDoc_getter(poppler_PDFDoc *self, int field)
{
PyObject *ans;
const char *s;
switch (field) {
case 0:
s = "Title"; break;
case 1:
s = "Author"; break;
case 2:
s = "Subject"; break;
case 3:
s = "Keywords"; break;
case 4:
s = "Creator"; break;
case 5:
s = "Producer"; break;
default:
PyErr_SetString(PyExc_Exception, "Bad field");
return NULL;
}
ans = poppler_convert_qstring(self->doc->info(QString(s)));
if (ans != NULL) Py_INCREF(ans);
return ans;
}
static int
poppler_PDFDoc_setter(poppler_PDFDoc *self, PyObject *val, int field) {
return -1;
}
static PyObject *
poppler_PDFDoc_title_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 0);
}
static PyObject *
poppler_PDFDoc_author_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 1);
}
static PyObject *
poppler_PDFDoc_subject_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 2);
}
static PyObject *
poppler_PDFDoc_keywords_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 3);
}
static PyObject *
poppler_PDFDoc_creator_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 4);
}
static PyObject *
poppler_PDFDoc_producer_getter(poppler_PDFDoc *self, void *closure) {
return poppler_PDFDoc_getter(self, 5);
}
static PyObject *
poppler_PDFDoc_version_getter(poppler_PDFDoc *self, void *closure) {
PyObject *ans = PyFloat_FromDouble(self->doc->pdfVersion());
if (ans != NULL) Py_INCREF(ans);
return ans;
}
static int
poppler_PDFDoc_title_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 0);
}
static int
poppler_PDFDoc_author_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 1);
}
static int
poppler_PDFDoc_subject_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 2);
}
static int
poppler_PDFDoc_keywords_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 3);
}
static int
poppler_PDFDoc_creator_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 4);
}
static int
poppler_PDFDoc_producer_setter(poppler_PDFDoc *self, PyObject *val, void *closure) {
return poppler_PDFDoc_setter(self, val, 5);
}
}
/*
 * render_page(page_num, xdpi=166, ydpi=166): render a page to a JPEG
 * image and return it as a byte string (new reference). Page numbers
 * start from zero.
 *
 * Raises ValueError when no document is loaded, the document is locked
 * or the page number is out of range, and Exception when rendering or
 * saving fails.
 */
static PyObject *
poppler_PDFDoc_render_page(poppler_PDFDoc *self, PyObject *args, PyObject *kwargs) {
    float xdpi = 166.0, ydpi = 166.0;
    int num;

    if (!PyArg_ParseTuple(args, "i|ff", &num, &xdpi, &ydpi)) return NULL;

    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    if ( self->doc->isLocked()) {
        PyErr_SetString(PyExc_ValueError, "This document is copyrighted.");
        return NULL;
    }
    if ( num < 0 || num >= self->doc->numPages()) {
        PyErr_SetString(PyExc_ValueError, "Invalid page number");
        return NULL;
    }

    Poppler::Page *page = self->doc->page(num);
    if (page == NULL) {
        PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF");
        return NULL;
    }
    QImage img = page->renderToImage(xdpi, ydpi);
    // The page object is owned by the caller and must be deleted.
    delete page;
    if (img.isNull()) {
        PyErr_SetString(PyExc_Exception, "Failed to render first page of PDF");
        return NULL;
    }

    QByteArray ba;
    QBuffer buffer(&ba);
    buffer.open(QIODevice::WriteOnly);
    if (!img.save(&buffer, "JPEG")) {
        PyErr_SetString(PyExc_Exception, "Failed to save rendered page");
        return NULL;
    }

    // Already a new reference; an extra Py_INCREF would leak it.
    return PyString_FromStringAndSize(ba.data(), ba.size());
}
/* Method table of the PDFDoc type; all methods take positional arguments. */
static PyMethodDef poppler_PDFDoc_methods[] = {
{"load", (PyCFunction)poppler_PDFDoc_load, METH_VARARGS,
"Load a PDF document from a byte buffer (string)"
},
{"open", (PyCFunction)poppler_PDFDoc_open, METH_VARARGS,
"Load a PDF document from a file path (string)"
},
{"render_page", (PyCFunction)poppler_PDFDoc_render_page, METH_VARARGS,
"render_page(page_num, xdpi=166, ydpi=166) -> Render a page to a JPEG image. Page numbers start from zero."
},
{NULL} /* Sentinel */
};
/*
 * Getter for the read-only "pages" attribute: the number of pages as a
 * Python int (new reference). Raises ValueError when no document is
 * loaded.
 */
static PyObject *
poppler_PDFDoc_pages_getter(poppler_PDFDoc *self, void *closure) {
    if (self->doc == NULL) {
        PyErr_SetString(PyExc_ValueError, "No PDF document has been loaded.");
        return NULL;
    }
    int pages = self->doc->numPages();
    // PyInt_FromLong returns a new reference; no extra Py_INCREF.
    return PyInt_FromLong(static_cast<long>(pages));
}
/*
 * Attribute table of the PDFDoc type. Each entry is
 * {name, getter, setter, doc string, closure}; "pages" and "version"
 * have no setter.
 */
static PyGetSetDef poppler_PDFDoc_getsetters[] = {
{(char *)"title",
(getter)poppler_PDFDoc_title_getter, (setter)poppler_PDFDoc_title_setter,
(char *)"Document title",
NULL},
{(char *)"author",
(getter)poppler_PDFDoc_author_getter, (setter)poppler_PDFDoc_author_setter,
(char *)"Document author",
NULL},
{(char *)"subject",
(getter)poppler_PDFDoc_subject_getter, (setter)poppler_PDFDoc_subject_setter,
(char *)"Document subject",
NULL},
{(char *)"keywords",
(getter)poppler_PDFDoc_keywords_getter, (setter)poppler_PDFDoc_keywords_setter,
(char *)"Document keywords",
NULL},
{(char *)"creator",
(getter)poppler_PDFDoc_creator_getter, (setter)poppler_PDFDoc_creator_setter,
(char *)"Document creator",
NULL},
{(char *)"producer",
(getter)poppler_PDFDoc_producer_getter, (setter)poppler_PDFDoc_producer_setter,
(char *)"Document producer",
NULL},
{(char *)"pages",
(getter)poppler_PDFDoc_pages_getter, NULL,
(char *)"Number of pages in document (read only)",
NULL},
{(char *)"version",
(getter)poppler_PDFDoc_version_getter, NULL,
(char *)"The PDF version (read only)",
NULL},
{NULL} /* Sentinel */
};
/*
 * Type object for calibre_poppler.PDFDoc. Only deallocation, the method
 * table, the attribute table and tp_new are filled in; every other slot
 * is left at 0.
 */
static PyTypeObject poppler_PDFDocType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"calibre_poppler.PDFDoc", /*tp_name*/
sizeof(poppler_PDFDoc), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)poppler_PDFDoc_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
"PDF Documents", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
poppler_PDFDoc_methods, /* tp_methods */
0, /* tp_members */
poppler_PDFDoc_getsetters, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
poppler_PDFDoc_new, /* tp_new */
};
/* Module-level function table; the module defines no functions. */
static PyMethodDef poppler_methods[] = {
{NULL} /* Sentinel */
};
extern "C" {

/*
 * Module initialisation: finalise the PDFDoc type, create the
 * calibre_poppler module and publish the type in it as "PDFDoc".
 * Returns early, leaving the Python exception set, when either step
 * fails.
 */
PyMODINIT_FUNC
initcalibre_poppler(void)
{
    PyObject* m;

    if (PyType_Ready(&poppler_PDFDocType) < 0)
        return;

    m = Py_InitModule3("calibre_poppler", poppler_methods,
            "Wrapper for the Poppler PDF library");
    // Module creation can fail; do not use a NULL module object.
    if (m == NULL)
        return;

    Py_INCREF(&poppler_PDFDocType);
    PyModule_AddObject(m, "PDFDoc", (PyObject *)&poppler_PDFDocType);
}

}

View File

@ -57,7 +57,8 @@ recipe_modules = ['recipe_' + r for r in (
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti', 'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga', 'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga',
'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem', 'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem',
'the_new_republic', 'philly', 'salon', 'tweakers', 'the_new_republic', 'philly', 'salon', 'tweakers', 'smashing',
'thestar',
)] )]

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.smashingmagazine.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SmashingMagazine(BasicNewsRecipe):
    '''Recipe for www.smashingmagazine.com.'''

    title = 'Smashing Magazine'
    __author__ = 'Darko Miletic'
    description = 'We smash you with the information that will make your life easier, really'
    oldest_article = 20
    language = 'en'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Smashing Magazine'
    category = 'news, web, IT, css, javascript, html'
    encoding = 'utf-8'

    conversion_options = {
        'comments' : description
        ,'tags' : category
        ,'publisher' : publisher
    }

    keep_only_tags = [dict(name='div', attrs={'id':'leftcolumn'})]
    remove_tags_after = dict(name='ul',attrs={'class':'social'})
    remove_tags = [
        dict(name=['link','object'])
        ,dict(name='h1',attrs={'class':'logo'})
        ,dict(name='div',attrs={'id':'booklogosec'})
        ,dict(attrs={'src':'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'})
    ]

    feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')]

    def preprocess_html(self, soup):
        '''
        Remove every ``leftframe`` div that contains no ``h1`` and turn
        each ``a`` element that directly wraps an image into a ``div``.
        Returns the modified soup.
        '''
        for frame in soup.findAll('div',attrs={'class':'leftframe'}):
            # Identity test for None; the loop variable no longer shadows
            # the builtin iter().
            if frame.find('h1') is None:
                frame.extract()
        for item in soup.findAll('img'):
            oldParent = item.parent
            if oldParent.name == 'a':
                oldParent.name = 'div'
        return soup

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.thestar.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheTorontoStar(BasicNewsRecipe):
    '''Recipe for www.thestar.com; articles are fetched in print layout.'''

    title = 'The Toronto Star'
    __author__ = 'Darko Miletic'
    description = "Canada's largest daily newspaper"
    publisher = 'The Toronto Star'
    category = "Toronto Star,Canada's largest daily newspaper,breaking news,classifieds,careers,GTA,Toronto Maple Leafs,sports,Toronto,news,editorial,The Star,Ontario,information,columnists,business,entertainment,births,deaths,automotive,rentals,weather,archives,Torstar,technology,Joseph Atkinson"
    language = 'en_CA'
    encoding = 'utf-8'

    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    extra_css = ' .headlineArticle{font-size: x-large; font-weight: bold} .navbar{text-align:center} '

    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
    }

    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'AssetWebPart1'}}]
    remove_attributes = ['style']

    # NOTE(review): 'News', 'Entertainment' and 'Living' all use the same
    # feed URL (categories=296) -- confirm the intended category ids.
    feeds = [
        (u'News', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Opinions', u'http://www.thestar.com/rss/0?searchMode=Query&categories=311'),
        (u'Business', u'http://www.thestar.com/rss/0?searchMode=Query&categories=294'),
        (u'Sports', u'http://www.thestar.com/rss/0?searchMode=Query&categories=295'),
        (u'Entertainment', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Living', u'http://www.thestar.com/rss/0?searchMode=Query&categories=296'),
        (u'Travel', u'http://www.thestar.com/rss/82858?searchMode=Lineup'),
        (u'Science', u'http://www.thestar.com/rss/82848?searchMode=Query&categories=300'),
    ]

    def print_version(self, url):
        '''Return the URL of the print layout for an article URL.'''
        printable = url.replace('/article/','/printArticle/')
        return printable

View File

@ -13,18 +13,17 @@ class ZeitDe(BasicNewsRecipe):
title = 'Die Zeit Nachrichten' title = 'Die Zeit Nachrichten'
description = 'Die Zeit - Online Nachrichten' description = 'Die Zeit - Online Nachrichten'
language = 'de' language = 'de'
lang = 'de_DE'
__author__ = 'Kovid Goyal and Martin Pitt' __author__ = 'Martin Pitt and Suajta Raman'
use_embedded_content = False use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 40 max_articles_per_feed = 40
remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
encoding = 'utf8' encoding = 'utf-8'
remove_tags = [{'class': 'adwrap'}]
keep_only_tags = [{'name': 'div', 'class': 'content'}]
feeds = [ ('Kurznachrichten', 'http://newsfeed.zeit.de/index'), feeds = [
('Politik', 'http://newsfeed.zeit.de/politik/index'), ('Politik', 'http://newsfeed.zeit.de/politik/index'),
('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'), ('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'),
('Meinung', 'http://newsfeed.zeit.de/meinung/index'), ('Meinung', 'http://newsfeed.zeit.de/meinung/index'),
@ -33,6 +32,43 @@ class ZeitDe(BasicNewsRecipe):
('Wissen', 'http://newsfeed.zeit.de/wissen/index'), ('Wissen', 'http://newsfeed.zeit.de/wissen/index'),
] ]
def print_version(self,url): extra_css = '''
return url.replace('http://www.zeit.de/', 'http://mobil.zeit.de/') .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:large;}
.title{font-family:Arial,Helvetica,sans-serif;font-size:large}
.caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
.headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
'''
filter_regexps = [r'ad.de.doubleclick.net/']
keep_only_tags = [
dict(name='div', attrs={'class':["article"]}) ,
]
remove_tags = [
dict(name='link'), dict(name='iframe'),dict(name='style'),
dict(name='div', attrs={'class':["pagination block","pagenav","inline link"] }),
dict(name='div', attrs={'id':["place_5","place_4"]})
]
def get_article_url(self, article):
url = article.get('guid', None)
if 'video' in url or 'quiz' in url :
url = None
return url
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
return soup
#def print_version(self,url):
# return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '')