From f526555572a2cca0bfc08f498664a546a6f4c4a4 Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 25 May 2009 10:43:29 -0400 Subject: [PATCH 01/10] Tweak line length factor for pdf line wrapping. --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 76fc36708e..2dc404e586 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -176,7 +176,7 @@ class HTMLPreProcessor(object): elif self.is_pdftohtml(html): line_length_rules = [ # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .3), re.UNICODE), wrap_lines), ] rules = self.PDFTOHTML + line_length_rules From b92c2dc002927626a01cfd53066e3e8b4dd469be Mon Sep 17 00:00:00 2001 From: John Schember Date: Mon, 25 May 2009 21:31:51 -0400 Subject: [PATCH 02/10] Basic RTF output. 
--- src/calibre/customize/builtins.py | 2 + src/calibre/ebooks/pml/output.py | 3 +- src/calibre/ebooks/rtf/output.py | 36 +++++++ src/calibre/ebooks/rtf/rtfml.py | 171 ++++++++++++++++++++++++++++++ src/calibre/ebooks/txt/output.py | 1 - 5 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 src/calibre/ebooks/rtf/output.py create mode 100644 src/calibre/ebooks/rtf/rtfml.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ab9460d3be..d107413e38 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -337,6 +337,7 @@ from calibre.ebooks.pdb.output import PDBOutput from calibre.ebooks.pdf.output import PDFOutput from calibre.ebooks.pml.output import PMLOutput from calibre.ebooks.rb.output import RBOutput +from calibre.ebooks.rtf.output import RTFOutput from calibre.ebooks.txt.output import TXTOutput from calibre.customize.profiles import input_profiles, output_profiles @@ -382,6 +383,7 @@ plugins += [ PDFOutput, PMLOutput, RBOutput, + RTFOutput, TXTOutput, ] plugins += [ diff --git a/src/calibre/ebooks/pml/output.py b/src/calibre/ebooks/pml/output.py index 9d07718654..8be8cc18ee 100644 --- a/src/calibre/ebooks/pml/output.py +++ b/src/calibre/ebooks/pml/output.py @@ -6,7 +6,8 @@ __docformat__ = 'restructuredtext en' import os -import Image, cStringIO +import Image +import cStringIO from calibre.customize.conversion import OutputFormatPlugin from calibre.ptempfile import TemporaryDirectory diff --git a/src/calibre/ebooks/rtf/output.py b/src/calibre/ebooks/rtf/output.py new file mode 100644 index 0000000000..fab7ecad5d --- /dev/null +++ b/src/calibre/ebooks/rtf/output.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.ebooks.rtf.rtfml import RTFMLizer +from calibre.customize.conversion import OutputFormatPlugin + +class RTFOutput(OutputFormatPlugin): + + 
name = 'RTF Output' + author = 'John Schember' + file_type = 'rtf' + + def convert(self, oeb_book, output_path, input_plugin, opts, log): + rtfmlitzer = RTFMLizer(ignore_tables=opts.linearize_tables) + content = rtfmlitzer.extract_content(oeb_book, opts) + + close = False + if not hasattr(output_path, 'write'): + close = True + if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '': + os.makedirs(os.path.dirname(output_path)) + out_stream = open(output_path, 'wb') + else: + out_stream = output_path + + out_stream.seek(0) + out_stream.truncate() + out_stream.write(content.encode('cp1252', 'replace')) + + if close: + out_stream.close() diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py new file mode 100644 index 0000000000..ade9291558 --- /dev/null +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into RTF markup +''' + +import os +import re + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +TAGS = { + 'b': '\\b', + 'del': '\\deleted', + 'h1': '\\b \\par \\pard \\hyphpar \\keep', + 'h2': '\\b \\par \\pard \\hyphpar \\keep', + 'h3': '\\b \\par \\pard \\hyphpar \\keep', + 'h4': '\\b \\par \\pard \\hyphpar \\keep', + 'h5': '\\b \\par \\pard \\hyphpar \\keep', + 'h6': '\\b \\par \\pard \\hyphpar \\keep', + 'li': '\\par \\pard \\hyphpar \\keep \t', + 'p': '\\par \\pard \\hyphpar \\keep \t', + #'ol': '\\pn \\pnrestart \\pnlvlblt', + 'sub': '\\sub', + 'sup': '\\super', + 'u': '\\ul', + #'ul': '\\pn \\pnrestart \\pndec', +} + +SINGLE_TAGS = { + 'br': '{\\line }', + 'div': '{\\line }', +} + +STYLES = [ + ('display', {'block': '\\par \\pard \\hyphpar \\keep'}), + ('font-weight', {'bold': '\\b', 'bolder': '\\b'}), + ('font-style', {'italic': '\\i'}), +# ('page-break-before', 
{'always': '\\pagebb '}), + ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr', 'justify': '\\qj'}), + ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}), +] + +BLOCK_TAGS = [ + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', +] + +BLOCK_STYLES = [ + 'block' +] + +''' +TODO: + * Tables + * Images + * Fonts +''' +class RTFMLizer(object): + + def __init__(self, ignore_tables=False): + self.ignore_tables = ignore_tables + + def extract_content(self, oeb_book, opts): + oeb_book.logger.info('Converting XHTML to RTF markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + def mlize_spine(self): + output = self.header() + for item in self.oeb_book.spine: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += self.footer() + output = self.clean_text(output) + + return output + + def header(self): + return u'{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033' + + def footer(self): + return ' }' + + def clean_text(self, text): + # Remove excess spaces at beginning and end of lines + text = re.sub('(?m)^[ ]+', '', text) + text = re.sub('(?m)[ ]+$', '', text) + + # Remove excessive newlines + #text = re.sub('%s{1,1}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + text = re.sub('%s{3,}' % os.linesep, '%s%s' % (os.linesep, os.linesep), text) + + # Remove excessive spaces + text = re.sub('[ ]{2,}', ' ', text) + + text = re.sub(r'(\{\\line \}){3,}', r'{\\line }{\\line }', text) + text = re.sub(r'(\{\\line \})+\{\\par', r'{\\par', text) + + return text + + def dump_text(self, elem, stylizer, tag_stack=[]): + if not isinstance(elem.tag, basestring) \ + or namespace(elem.tag) != XHTML_NS: + return u'' + + text = u'' + style = stylizer.style(elem) + + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return u'' + + tag = 
barename(elem.tag) + tag_count = 0 + + # Are we in a paragraph block? + if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: + if 'block' not in tag_stack: + tag_count += 1 + tag_stack.append('block') + + single_tag = SINGLE_TAGS.get(tag, None) + if single_tag: + text += single_tag + + rtf_tag = TAGS.get(tag, None) + if rtf_tag and rtf_tag not in tag_stack: + tag_count += 1 + text += '{%s\n' % rtf_tag + tag_stack.append(rtf_tag) + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag and style_tag not in tag_stack: + tag_count += 1 + text += '{%s\n' % style_tag + tag_stack.append(style_tag) + + # Proccess tags that contain text. + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': + text += '%s' % elem.text + + for item in elem: + text += self.dump_text(item, stylizer, tag_stack) + + for i in range(0, tag_count): + end_tag = tag_stack.pop() + if end_tag != 'block': + text += u'}' + + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': + if 'block' in tag_stack: + text += '%s ' % elem.tail + else: + text += '{\\par \\pard \\hyphpar \\keep %s}' % elem.tail + + return text diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index adf357181c..6afc5452b2 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -9,7 +9,6 @@ import os from calibre.customize.conversion import OutputFormatPlugin, \ OptionRecommendation from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines -from calibre.ebooks.metadata import authors_to_string class TXTOutput(OutputFormatPlugin): From 1fd7c704d22efe67d50b5288131af74666df638b Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 26 May 2009 17:41:46 -0400 Subject: [PATCH 03/10] Fix bug in ml classes where some tags were missed. 
--- src/calibre/ebooks/fb2/fb2ml.py | 30 ++++++++-------- src/calibre/ebooks/pml/pmlml.py | 62 ++++++++++++++++----------------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index f10cf95e87..81600b9624 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -98,25 +98,27 @@ class FB2MLizer(object): return u'' tag = barename(elem.tag) + tag_count = 0 + if tag == 'img': fb2_text += '' % os.path.basename(elem.attrib['src']) - tag_count = 0 - if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - fb2_tag = TAG_MAP.get(tag, 'p') - if fb2_tag and fb2_tag not in tag_stack: + + fb2_tag = TAG_MAP.get(tag, 'p') + if fb2_tag and fb2_tag not in tag_stack: + tag_count += 1 + fb2_text += '<%s>' % fb2_tag + tag_stack.append(fb2_tag) + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag: tag_count += 1 - fb2_text += '<%s>' % fb2_tag - tag_stack.append(fb2_tag) - - # Processes style information - for s in STYLES: - style_tag = s[1].get(style[s[0]], None) - if style_tag: - tag_count += 1 - fb2_text += '<%s>' % style_tag - tag_stack.append(style_tag) + fb2_text += '<%s>' % style_tag + tag_stack.append(style_tag) + if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': fb2_text += elem.text for item in elem: diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 01f777caae..2f2feeb981 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -153,39 +153,39 @@ class PMLMLizer(object): #if style['page-break-before'] == 'always': # text += '\\p' + pml_tag = TAG_MAP.get(tag, None) + if pml_tag and pml_tag not in tag_stack: + tag_count += 1 + text += '\\%s' % pml_tag + tag_stack.append(pml_tag) + + # Special processing of tags that require an argument. 
+ # Anchors links + if tag in LINK_TAGS and 'q' not in tag_stack: + href = elem.get('href') + if href and '://' not in href: + if '#' in href: + href = href.partition('#')[2] + href = os.path.splitext(os.path.basename(href))[0] + tag_count += 1 + text += '\\q="#%s"' % href + tag_stack.append('q') + # Anchor ids + id_name = elem.get('id') + if id_name: + text += '\\Q="%s"' % os.path.splitext(id_name)[0] + + # Processes style information + for s in STYLES: + style_tag = s[1].get(style[s[0]], None) + if style_tag and style_tag not in tag_stack: + tag_count += 1 + text += '\\%s' % style_tag + tag_stack.append(style_tag) + # margin + # Proccess tags that contain text. if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': - pml_tag = TAG_MAP.get(tag, None) - if pml_tag and pml_tag not in tag_stack: - tag_count += 1 - text += '\\%s' % pml_tag - tag_stack.append(pml_tag) - - # Special processing of tags that require an argument. - # Anchors links - if tag in LINK_TAGS and 'q' not in tag_stack: - href = elem.get('href') - if href and '://' not in href: - if '#' in href: - href = href.partition('#')[2] - href = os.path.splitext(os.path.basename(href))[0] - tag_count += 1 - text += '\\q="#%s"' % href - tag_stack.append('q') - # Anchor ids - id_name = elem.get('id') - if id_name: - text += '\\Q="%s"' % os.path.splitext(id_name)[0] - - # Processes style information - for s in STYLES: - style_tag = s[1].get(style[s[0]], None) - if style_tag and style_tag not in tag_stack: - tag_count += 1 - text += '\\%s' % style_tag - tag_stack.append(style_tag) - # margin - text += self.elem_text(elem, tag_stack) for item in elem: From 1a42ba554d1d515deaa6931fd12ce8271c53d6e7 Mon Sep 17 00:00:00 2001 From: John Schember Date: Tue, 26 May 2009 20:15:20 -0400 Subject: [PATCH 04/10] RTF Output: Image support. 
--- src/calibre/ebooks/rtf/rtfml.py | 52 ++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index ade9291558..3a14c44f8d 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -11,7 +11,11 @@ Transform OEB content into RTF markup import os import re -from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +import Image +import cStringIO + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, \ + OEB_IMAGES from calibre.ebooks.oeb.stylizer import Stylizer TAGS = { @@ -64,7 +68,6 @@ BLOCK_STYLES = [ ''' TODO: * Tables - * Images * Fonts ''' class RTFMLizer(object): @@ -84,6 +87,7 @@ class RTFMLizer(object): stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) output += self.footer() + output = self.insert_images(output) output = self.clean_text(output) return output @@ -93,7 +97,36 @@ class RTFMLizer(object): def footer(self): return ' }' - + + def insert_images(self, text): + for item in self.oeb_book.manifest: + if item.media_type in OEB_IMAGES: + src = os.path.basename(item.href) + data, width, height = self.image_to_hexstring(item.data) + text = text.replace('SPECIAL_IMAGE-%s-REPLACE_ME' % src, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s}}\n\n' % (width, height, data)) + return text + + def image_to_hexstring(self, data): + im = Image.open(cStringIO.StringIO(data)) + data = cStringIO.StringIO() + im.save(data, 'JPEG') + data = data.getvalue() + + raw_hex = '' + for char in data: + raw_hex += hex(ord(char)).replace('0x', '').rjust(2, '0') + + hex_string = '' + col = 1 + for char in raw_hex: + if col == 129: + hex_string += '\n' + col = 1 + col += 1 + hex_string += char + + return (hex_string, im.size[0], im.size[1]) + def clean_text(self, text): # Remove excess spaces at beginning 
and end of lines text = re.sub('(?m)^[ ]+', '', text) @@ -125,13 +158,24 @@ class RTFMLizer(object): tag = barename(elem.tag) tag_count = 0 - + # Are we in a paragraph block? if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if 'block' not in tag_stack: tag_count += 1 tag_stack.append('block') + # Process tags that need special processing and that do not have inner + # text. Usually these require an argument + if tag == 'img': + src = os.path.basename(elem.get('src')) + block_start = '' + block_end = '' + if 'block' not in tag_stack: + block_start = '{\\par \\pard \\hyphpar \\keep ' + block_end = '}' + text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) + single_tag = SINGLE_TAGS.get(tag, None) if single_tag: text += single_tag From 458f02e6fd2392e8cbb7303e0f7c628b46d770b6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 27 May 2009 18:17:09 -0400 Subject: [PATCH 05/10] Comments. --- src/calibre/ebooks/rtf/rtfml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 3a14c44f8d..be05938f82 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -116,6 +116,8 @@ class RTFMLizer(object): for char in data: raw_hex += hex(ord(char)).replace('0x', '').rjust(2, '0') + # Images must be broken up so that they are no longer than 129 chars + # per line hex_string = '' col = 1 for char in raw_hex: From 9f5e16cc128b53fc682d1d8e688412b19dccd94b Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 27 May 2009 21:24:45 -0400 Subject: [PATCH 06/10] RTF Output: Rendering tweaks. 
--- src/calibre/ebooks/rtf/rtfml.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index be05938f82..89ec4ea980 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -37,16 +37,19 @@ TAGS = { } SINGLE_TAGS = { - 'br': '{\\line }', - 'div': '{\\line }', + 'br': '\n{\\line }\n', + 'div': '\n{\\line }\n', +} + +SINGLE_TAGS_END = { + 'div': '\n{\\line }\n', } STYLES = [ ('display', {'block': '\\par \\pard \\hyphpar \\keep'}), ('font-weight', {'bold': '\\b', 'bolder': '\\b'}), ('font-style', {'italic': '\\i'}), -# ('page-break-before', {'always': '\\pagebb '}), - ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr', 'justify': '\\qj'}), + ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}), ('text-decoration', {'line-through': '\\strike', 'underline': '\\ul'}), ] @@ -141,8 +144,11 @@ class RTFMLizer(object): # Remove excessive spaces text = re.sub('[ ]{2,}', ' ', text) - text = re.sub(r'(\{\\line \}){3,}', r'{\\line }{\\line }', text) - text = re.sub(r'(\{\\line \})+\{\\par', r'{\\par', text) + text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text) + text = re.sub(r'(\{\\line \}\s*)+\{\\par', r'{\\par', text) + + # Remove non-breaking spaces + text = text.replace(u'\xa0', ' ') return text @@ -208,6 +214,10 @@ class RTFMLizer(object): if end_tag != 'block': text += u'}' + single_tag_end = SINGLE_TAGS_END.get(tag, None) + if single_tag_end: + text += single_tag_end + if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': if 'block' in tag_stack: text += '%s ' % elem.tail From 3d4ae1920acb29a2cf4768182c9eebd56dd4b76e Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 28 May 2009 07:12:45 -0400 Subject: [PATCH 07/10] RTF Output: Metadata and more render tweaks. 
--- src/calibre/ebooks/rtf/rtfml.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 89ec4ea980..78bd96d2d9 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -17,23 +17,22 @@ import cStringIO from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, \ OEB_IMAGES from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks.metadata import authors_to_string TAGS = { 'b': '\\b', 'del': '\\deleted', - 'h1': '\\b \\par \\pard \\hyphpar \\keep', - 'h2': '\\b \\par \\pard \\hyphpar \\keep', - 'h3': '\\b \\par \\pard \\hyphpar \\keep', - 'h4': '\\b \\par \\pard \\hyphpar \\keep', - 'h5': '\\b \\par \\pard \\hyphpar \\keep', - 'h6': '\\b \\par \\pard \\hyphpar \\keep', - 'li': '\\par \\pard \\hyphpar \\keep \t', - 'p': '\\par \\pard \\hyphpar \\keep \t', - #'ol': '\\pn \\pnrestart \\pnlvlblt', + 'h1': '\\b \\par \\pard \\hyphpar', + 'h2': '\\b \\par \\pard \\hyphpar', + 'h3': '\\b \\par \\pard \\hyphpar', + 'h4': '\\b \\par \\pard \\hyphpar', + 'h5': '\\b \\par \\pard \\hyphpar', + 'h6': '\\b \\par \\pard \\hyphpar', + 'li': '\\par \\pard \\hyphpar \t', + 'p': '\\par \\pard \\hyphpar \t', 'sub': '\\sub', 'sup': '\\super', 'u': '\\ul', - #'ul': '\\pn \\pnrestart \\pndec', } SINGLE_TAGS = { @@ -46,7 +45,7 @@ SINGLE_TAGS_END = { } STYLES = [ - ('display', {'block': '\\par \\pard \\hyphpar \\keep'}), + ('display', {'block': '\\par \\pard \\hyphpar'}), ('font-weight', {'bold': '\\b', 'bolder': '\\b'}), ('font-style', {'italic': '\\i'}), ('text-align', {'center': '\\qc', 'left': '\\ql', 'right': '\\qr'}), @@ -96,7 +95,7 @@ class RTFMLizer(object): return output def header(self): - return u'{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033' + return u'{\\rtf1{\\info{\\title %s}{\\author %s}}\\ansi\\ansicpg1252\\deff0\\deflang1033' % (self.oeb_book.metadata.title[0].value, authors_to_string([x.value for x 
in self.oeb_book.metadata.creator])) def footer(self): return ' }' @@ -145,7 +144,7 @@ class RTFMLizer(object): text = re.sub('[ ]{2,}', ' ', text) text = re.sub(r'(\{\\line \}\s*){3,}', r'{\\line }{\\line }', text) - text = re.sub(r'(\{\\line \}\s*)+\{\\par', r'{\\par', text) + #text = re.compile(r'(\{\\line \}\s*)+(?P}*)\s*\{\\par').sub(lambda mo: r'%s{\\par' % mo.group('brackets'), text) # Remove non-breaking spaces text = text.replace(u'\xa0', ' ') @@ -180,7 +179,7 @@ class RTFMLizer(object): block_start = '' block_end = '' if 'block' not in tag_stack: - block_start = '{\\par \\pard \\hyphpar \\keep ' + block_start = '{\\par \\pard \\hyphpar ' block_end = '}' text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end) @@ -222,6 +221,6 @@ class RTFMLizer(object): if 'block' in tag_stack: text += '%s ' % elem.tail else: - text += '{\\par \\pard \\hyphpar \\keep %s}' % elem.tail + text += '{\\par \\pard \\hyphpar %s}' % elem.tail return text From 1c2f1b0f006a14695a3cf277f609a2f44c7c1a67 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 28 May 2009 08:12:50 -0400 Subject: [PATCH 08/10] RTF Input: Fix bug preventing it from running. RTF Output: produce files that can be read by RTF input. --- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/rtf/output.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index fce73668a2..22bb5263d5 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -70,7 +70,7 @@ class RTFInput(InputFormatPlugin): self.log = log self.log('Converting RTF to XML...') try: - xml = self.generate_xml(stream) + xml = self.generate_xml(stream.name) except RtfInvalidCodeException: raise ValueError(_('This RTF file has a feature calibre does not ' 'support. 
Convert it to HTML first and then try it.')) diff --git a/src/calibre/ebooks/rtf/output.py b/src/calibre/ebooks/rtf/output.py index fab7ecad5d..4ef7706762 100644 --- a/src/calibre/ebooks/rtf/output.py +++ b/src/calibre/ebooks/rtf/output.py @@ -30,7 +30,7 @@ class RTFOutput(OutputFormatPlugin): out_stream.seek(0) out_stream.truncate() - out_stream.write(content.encode('cp1252', 'replace')) + out_stream.write(content.encode('ascii', 'replace')) if close: out_stream.close() From da140445a0a8f61e752065897b8e18a37a52db30 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 28 May 2009 08:52:53 -0400 Subject: [PATCH 09/10] ml's include cover page if present. --- src/calibre/ebooks/fb2/fb2ml.py | 6 ++++++ src/calibre/ebooks/pml/pmlml.py | 6 ++++++ src/calibre/ebooks/rb/rbml.py | 6 ++++++ src/calibre/ebooks/rtf/rtfml.py | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/src/calibre/ebooks/fb2/fb2ml.py b/src/calibre/ebooks/fb2/fb2ml.py index 81600b9624..3a5806b143 100644 --- a/src/calibre/ebooks/fb2/fb2ml.py +++ b/src/calibre/ebooks/fb2/fb2ml.py @@ -44,6 +44,12 @@ class FB2MLizer(object): def fb2mlize_spine(self): output = self.fb2_header() + if 'titlepage' in self.oeb_book.guide: + href = self.oeb_book.guide['titlepage'].href + item = self.oeb_book.manifest.hrefs[href] + if item.spine_position is None: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) for item in self.oeb_book.spine: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) diff --git a/src/calibre/ebooks/pml/pmlml.py b/src/calibre/ebooks/pml/pmlml.py index 2f2feeb981..ef735a56b1 100644 --- a/src/calibre/ebooks/pml/pmlml.py +++ b/src/calibre/ebooks/pml/pmlml.py @@ -78,6 +78,12 @@ class PMLMLizer(object): def pmlmlize_spine(self): output = u'' + if 'titlepage' in self.oeb_book.guide: + href = 
self.oeb_book.guide['titlepage'].href + item = self.oeb_book.manifest.hrefs[href] + if item.spine_position is None: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) for item in self.oeb_book.spine: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.add_page_anchor(item.href) diff --git a/src/calibre/ebooks/rb/rbml.py b/src/calibre/ebooks/rb/rbml.py index f18803e8d0..3563ba2538 100644 --- a/src/calibre/ebooks/rb/rbml.py +++ b/src/calibre/ebooks/rb/rbml.py @@ -65,6 +65,12 @@ class RBMLizer(object): def mlize_spine(self): output = u'' + if 'titlepage' in self.oeb_book.guide: + href = self.oeb_book.guide['titlepage'].href + item = self.oeb_book.manifest.hrefs[href] + if item.spine_position is None: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) for item in self.oeb_book.spine: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.add_page_anchor(item.href) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 78bd96d2d9..3ed855adb8 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -85,6 +85,13 @@ class RTFMLizer(object): def mlize_spine(self): output = self.header() + if 'titlepage' in self.oeb_book.guide: + href = self.oeb_book.guide['titlepage'].href + item = self.oeb_book.manifest.hrefs[href] + if item.spine_position is None: + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output += '{\\page } ' for item in self.oeb_book.spine: stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) output += self.dump_text(item.data.find(XHTML('body')), stylizer) From 
08af3996bf071701c96ae6427b14b2d7b6ca33b6 Mon Sep 17 00:00:00 2001 From: John Schember Date: Thu, 28 May 2009 08:57:29 -0400 Subject: [PATCH 10/10] RTF Output: ensure proper line breaks. --- src/calibre/ebooks/rtf/rtfml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/rtf/rtfml.py b/src/calibre/ebooks/rtf/rtfml.py index 3ed855adb8..cb8e9af883 100644 --- a/src/calibre/ebooks/rtf/rtfml.py +++ b/src/calibre/ebooks/rtf/rtfml.py @@ -112,7 +112,7 @@ class RTFMLizer(object): if item.media_type in OEB_IMAGES: src = os.path.basename(item.href) data, width, height = self.image_to_hexstring(item.data) - text = text.replace('SPECIAL_IMAGE-%s-REPLACE_ME' % src, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s}}\n\n' % (width, height, data)) + text = text.replace('SPECIAL_IMAGE-%s-REPLACE_ME' % src, '\n\n{\\*\\shppict{\\pict\\picw%i\\pich%i\\jpegblip \n%s\n}}\n\n' % (width, height, data)) return text def image_to_hexstring(self, data): @@ -155,6 +155,7 @@ class RTFMLizer(object): # Remove non-breaking spaces text = text.replace(u'\xa0', ' ') + text = text.replace('\n\r', '\n') return text