mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-11-26 16:25:02 -05:00
248 lines
8.1 KiB
Python
248 lines
8.1 KiB
Python
from __future__ import with_statement
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import os, glob, re, textwrap
|
|
|
|
from lxml import etree
|
|
|
|
from calibre.customize.conversion import InputFormatPlugin
|
|
from calibre.ebooks.conversion.utils import PreProcessor
|
|
|
|
class InlineClass(etree.XSLTExtension):
|
|
|
|
FMTS = ('italics', 'bold', 'underlined', 'strike-through', 'small-caps')
|
|
|
|
def __init__(self, log):
|
|
etree.XSLTExtension.__init__(self)
|
|
self.log = log
|
|
self.font_sizes = []
|
|
self.colors = []
|
|
|
|
def execute(self, context, self_node, input_node, output_parent):
|
|
classes = ['none']
|
|
for x in self.FMTS:
|
|
if input_node.get(x, None) == 'true':
|
|
classes.append(x)
|
|
fs = input_node.get('font-size', False)
|
|
if fs:
|
|
if fs not in self.font_sizes:
|
|
self.font_sizes.append(fs)
|
|
classes.append('fs%d'%self.font_sizes.index(fs))
|
|
fc = input_node.get('font-color', False)
|
|
if fc:
|
|
if fc not in self.colors:
|
|
self.colors.append(fc)
|
|
classes.append('col%d'%self.colors.index(fc))
|
|
|
|
output_parent.text = ' '.join(classes)
|
|
|
|
|
|
class RTFInput(InputFormatPlugin):
|
|
|
|
name = 'RTF Input'
|
|
author = 'Kovid Goyal'
|
|
description = 'Convert RTF files to HTML'
|
|
file_types = set(['rtf'])
|
|
|
|
def generate_xml(self, stream):
|
|
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
|
|
ofile = 'out.xml'
|
|
parser = ParseRtf(
|
|
in_file = stream,
|
|
out_file = ofile,
|
|
# Convert symbol fonts to unicode equivalents. Default
|
|
# is 1
|
|
convert_symbol = 1,
|
|
|
|
# Convert Zapf fonts to unicode equivalents. Default
|
|
# is 1.
|
|
convert_zapf = 1,
|
|
|
|
# Convert Wingding fonts to unicode equivalents.
|
|
# Default is 1.
|
|
convert_wingdings = 1,
|
|
|
|
# Convert RTF caps to real caps.
|
|
# Default is 1.
|
|
convert_caps = 1,
|
|
|
|
# Indent resulting XML.
|
|
# Default is 0 (no indent).
|
|
indent = 1,
|
|
|
|
# Form lists from RTF. Default is 1.
|
|
form_lists = 1,
|
|
|
|
# Convert headings to sections. Default is 0.
|
|
headings_to_sections = 1,
|
|
|
|
# Group paragraphs with the same style name. Default is 1.
|
|
group_styles = 1,
|
|
|
|
# Group borders. Default is 1.
|
|
group_borders = 1,
|
|
|
|
# Write or do not write paragraphs. Default is 0.
|
|
empty_paragraphs = 0,
|
|
)
|
|
parser.parse_rtf()
|
|
ans = open('out.xml').read()
|
|
os.remove('out.xml')
|
|
return ans
|
|
|
|
def extract_images(self, picts):
|
|
self.log('Extracting images...')
|
|
|
|
count = 0
|
|
raw = open(picts, 'rb').read()
|
|
starts = []
|
|
for match in re.finditer(r'\{\\pict([^}]+)\}', raw):
|
|
starts.append(match.start(1))
|
|
|
|
imap = {}
|
|
|
|
for start in starts:
|
|
pos, bc = start, 1
|
|
while bc > 0:
|
|
if raw[pos] == '}': bc -= 1
|
|
elif raw[pos] == '{': bc += 1
|
|
pos += 1
|
|
pict = raw[start:pos+1]
|
|
enc = re.sub(r'[^a-zA-Z0-9]', '', pict)
|
|
if len(enc) % 2 == 1:
|
|
enc = enc[:-1]
|
|
data = enc.decode('hex')
|
|
count += 1
|
|
name = (('%4d'%count).replace(' ', '0'))+'.wmf'
|
|
open(name, 'wb').write(data)
|
|
imap[count] = name
|
|
#open(name+'.hex', 'wb').write(enc)
|
|
return self.convert_images(imap)
|
|
|
|
def convert_images(self, imap):
|
|
for count, val in imap.items():
|
|
try:
|
|
imap[count] = self.convert_image(val)
|
|
except:
|
|
self.log.exception('Failed to convert', val)
|
|
return imap
|
|
|
|
def convert_image(self, name):
|
|
from calibre.utils.magick import Image
|
|
img = Image()
|
|
img.open(name)
|
|
name = name.replace('.wmf', '.jpg')
|
|
img.save(name)
|
|
return name
|
|
|
|
|
|
def write_inline_css(self, ic):
|
|
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
|
|
enumerate(ic.font_sizes)]
|
|
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
|
|
enumerate(ic.colors)]
|
|
css = textwrap.dedent('''
|
|
span.none {
|
|
text-decoration: none; font-weight: normal;
|
|
font-style: normal; font-variant: normal
|
|
}
|
|
|
|
span.italics { font-style: italic }
|
|
|
|
span.bold { font-weight: bold }
|
|
|
|
span.small-caps { font-variant: small-caps }
|
|
|
|
span.underlined { text-decoration: underline }
|
|
|
|
span.strike-through { text-decoration: line-through }
|
|
|
|
''')
|
|
css += '\n'+'\n'.join(font_size_classes)
|
|
css += '\n' +'\n'.join(color_classes)
|
|
with open('styles.css', 'ab') as f:
|
|
f.write(css)
|
|
|
|
def preprocess(self, fname):
|
|
self.log('\tPreprocessing to convert unicode characters')
|
|
try:
|
|
data = open(fname, 'rb').read()
|
|
from calibre.ebooks.rtf.preprocess import RtfTokenizer, RtfTokenParser
|
|
tokenizer = RtfTokenizer(data)
|
|
tokens = RtfTokenParser(tokenizer.tokens)
|
|
data = tokens.toRTF()
|
|
fname = 'preprocessed.rtf'
|
|
with open(fname, 'wb') as f:
|
|
f.write(data)
|
|
except:
|
|
self.log.exception(
|
|
'Failed to preprocess RTF to convert unicode sequences, ignoring...')
|
|
return fname
|
|
|
|
def convert(self, stream, options, file_ext, log,
|
|
accelerators):
|
|
from calibre.ebooks.metadata.meta import get_metadata
|
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
|
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
|
self.options = options
|
|
self.log = log
|
|
self.log('Converting RTF to XML...')
|
|
#Name of the preprocesssed RTF file
|
|
fname = self.preprocess(stream.name)
|
|
try:
|
|
xml = self.generate_xml(fname)
|
|
except RtfInvalidCodeException, e:
|
|
raise ValueError(_('This RTF file has a feature calibre does not '
|
|
'support. Convert it to HTML first and then try it.\n%s')%e)
|
|
|
|
'''dataxml = open('dataxml.xml', 'w')
|
|
dataxml.write(xml)
|
|
dataxml.close'''
|
|
|
|
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
|
if d:
|
|
imap = {}
|
|
try:
|
|
imap = self.extract_images(d[0])
|
|
except:
|
|
self.log.exception('Failed to extract images...')
|
|
|
|
self.log('Parsing XML...')
|
|
parser = etree.XMLParser(recover=True, no_network=True)
|
|
doc = etree.fromstring(xml, parser=parser)
|
|
for pict in doc.xpath('//rtf:pict[@num]',
|
|
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
|
|
num = int(pict.get('num'))
|
|
name = imap.get(num, None)
|
|
if name is not None:
|
|
pict.set('num', name)
|
|
|
|
self.log('Converting XML to HTML...')
|
|
inline_class = InlineClass(self.log)
|
|
styledoc = etree.fromstring(P('templates/rtf.xsl', data=True))
|
|
extensions = { ('calibre', 'inline-class') : inline_class }
|
|
transform = etree.XSLT(styledoc, extensions=extensions)
|
|
result = transform(doc)
|
|
html = 'index.xhtml'
|
|
with open(html, 'wb') as f:
|
|
res = transform.tostring(result)
|
|
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
|
if self.options.preprocess_html:
|
|
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
|
|
res = preprocessor(res)
|
|
f.write(res)
|
|
self.write_inline_css(inline_class)
|
|
stream.seek(0)
|
|
mi = get_metadata(stream, 'rtf')
|
|
if not mi.title:
|
|
mi.title = _('Unknown')
|
|
if not mi.authors:
|
|
mi.authors = [_('Unknown')]
|
|
opf = OPFCreator(os.getcwd(), mi)
|
|
opf.create_manifest([('index.xhtml', None)])
|
|
opf.create_spine(['index.xhtml'])
|
|
opf.render(open('metadata.opf', 'wb'))
|
|
return os.path.abspath('metadata.opf')
|
|
|