From b92c2dc002927626a01cfd53066e3e8b4dd469be Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 25 May 2009 21:31:51 -0400 Subject: [PATCH] Basic RTF output. --- src/calibre/customize/builtins.py | 2 + src/calibre/ebooks/pml/output.py | 3 +- src/calibre/ebooks/rtf/output.py | 36 +++++++ src/calibre/ebooks/rtf/rtfml.py | 171 ++++++++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 1 - 5 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/rtf/output.py create mode 100644 src/calibre/ebooks/rtf/rtfml.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ab9460d3be..d107413e38 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -337,6 +337,7 @@ from calibre.ebooks.pdb.output import PDBOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.output import PMLOutput from calibre.ebooks.rb.output import RBOutput +from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.txt.output import TXTOutput from calibre.customize.profiles import input_profiles, output_profiles @@ -382,6 +383,7 @@ plugins += [ PDFOutput, PMLOutput, RBOutput, + RTFOutput, TXTOutput, ] plugins += [ diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 9d07718654..8be8cc18ee 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -6,7 +6,8 @@ __docformat__ = 'restructuredtext en' import os -import Image, cStringIO +import Image +import cStringIO from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import TemporaryDirectory diff --git a/src/calibre/ebooks/rtf/output.py b/src/calibre/ebooks/rtf/output.py new file mode 100644 index 0000000000..fab7ecad5d --- /dev/null +++ b/src/calibre/ebooks/rtf/output.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + 
+import os + +from calibre.ebooks.rtf.rtfml import RTFMLizer +from calibre.customize.conversion import OutputFormatPlugin + +class RTFOutput(OutputFormatPlugin): + + name = 'RTF Output' + author = 'John Schember' + file_type = 'rtf' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + rtfmlitzer = RTFMLizer(ignore_tables=opts.linearize_tables) + content = rtfmlitzer.extract_content(oeb_book, opts) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + out_stream.write(content.encode('cp1252', 'replace')) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py new file mode 100644 index 0000000000..ade9291558 --- /dev/null +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into RTF markup +''' + +import os +import re + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +TAGS = { + 'b': '\\b', + 'del': '\\deleted', + 'h1': '\\b \\par \\pard \\hyphpar \\keep', + 'h2': '\\b \\par \\pard \\hyphpar \\keep', + 'h3': '\\b \\par \\pard \\hyphpar \\keep', + 'h4': '\\b \\par \\pard \\hyphpar \\keep', + 'h5': '\\b \\par \\pard \\hyphpar \\keep', + 'h6': '\\b \\par \\pard \\hyphpar \\keep', + 'li': '\\par \\pard \\hyphpar \\keep \t', + 'p': '\\par \\pard \\hyphpar \\keep \t', + #'ol': '\\pn \\pnrestart \\pnlvlblt', + 'sub': '\\sub', + 'sup': '\\super', + 'u': '\\ul', + #'ul': '\\pn \\pnrestart \\pndec', +} + +SINGLE_TAGS = { + 'br': '{\\line }', + 'div': '{\\line }', +} + +STYLES = [ + 
('display', {'block': '\\par \\pard \\hyphpar \\keep'}), + ('font-weight', {'bold': '\\b', 'bolder': '\\b'}), + ('font-style', {'italic': '\\i'}), +# ('page-break-before', {'always': '\\pagebb '}), + ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr', 'justify': '\\qj'}), + ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}), +] + +BLOCK_TAGS = [ + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', +] + +BLOCK_STYLES = [ + 'block' +] + +''' +TODO: + * Tables + * Images + * Fonts +''' +class RTFMLizer(object): + + def __init__(self, ignore_tables=False): + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to RTF markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + def mlize_spine(self): + output = self.header() + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.footer() + output = self.clean_text(output) + + return output + + def header(self): + return u'{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033' + + def footer(self): + return ' }' + + def clean_text(self, text): + # Remove excess spaces at beginning and end of lines + text = re.sub('(?m)^[ ]+', '', text) + text = re.sub('(?m)[ ]+$', '', text) + + # Remove excessive newlines + #text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + + # Remove excessive spaces + text = re.sub('[ ]{2,}', ' ', text) + + text = re.sub(r'(\{\\line \}){3,}', r'{\\line }{\\line }', text) + text = re.sub(r'(\{\\line \})+\{\\par', r'{\\par', text) + + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + 
+ style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = barename(elem.tag) + tag_count = 0 + + # Are we in a paragraph block? + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: + if 'block' not in tag_stack: + tag_count += 1 + tag_stack.append('block') + + single_tag = SINGLE_TAGS.get(tag, None) + if single_tag: + text += single_tag + + rtf_tag = TAGS.get(tag, None) + if rtf_tag and rtf_tag not in tag_stack: + tag_count += 1 + text += '{%s\n' % rtf_tag + tag_stack.append(rtf_tag) + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag and style_tag not in tag_stack: + tag_count += 1 + text += '{%s\n' % style_tag + tag_stack.append(style_tag) + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += '%s' % elem.text + + for item in elem: + text += self.dump_text(item, stylizer, tag_stack) + + for i in range(0, tag_count): + end_tag = tag_stack.pop() + if end_tag != 'block': + text += u'}' + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if 'block' in tag_stack: + text += '%s ' % elem.tail + else: + text += '{\\par \\pard \\hyphpar \\keep %s}' % elem.tail + + return text diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index adf357181c..6afc5452b2 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -9,7 +9,6 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines -from calibre.ebooks.metadata import authors_to_string class TXTOutput(OutputFormatPlugin):