From 03f70c156c7d557d61db1e348f11bb2a997d90e1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 10 Jan 2011 10:44:09 -0700 Subject: [PATCH] RTF Input: Fix regression that broke the Preprocess HTML option --- src/calibre/ebooks/conversion/utils.py | 6 +++--- src/calibre/ebooks/rtf/input.py | 2 +- src/calibre/ebooks/txt/input.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 52d1bcc619..dac93fa2e2 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -191,15 +191,15 @@ class PreProcessor(object): blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" - + unwrap_regex = lookahead+line_ending+blanklines+line_opening if format == 'txt': unwrap_regex = lookahead+txt_line_wrap - + unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) content = unwrap.sub(' ', content) return content - + def __call__(self, html): self.log("********* Preprocessing HTML *********") diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 8c7561f68c..5154373eda 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin): u'

\u00a0

\n'.encode('utf-8'), res) if self.opts.preprocess_html: preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None)) - res = preprocessor(res) + res = preprocessor(res.decode('utf-8')).encode('utf-8') f.write(res) self.write_inline_css(inline_class, border_styles) stream.seek(0) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 3957391494..aaff8b55c0 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -53,7 +53,7 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): log.debug('Reading text from file...') - + txt = stream.read() # Get the encoding of the document. if options.input_encoding: @@ -80,7 +80,7 @@ class TXTInput(InputFormatPlugin): # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) - + if options.formatting_type == 'auto': options.formatting_type = detect_formatting_type(txt) @@ -122,7 +122,7 @@ class TXTInput(InputFormatPlugin): txt = preprocessor.punctuation_unwrap(length, txt, 'txt') flow_size = getattr(options, 'flow_size', 0) - + if options.formatting_type == 'heuristic': html = convert_heuristic(txt, epub_split_size_kb=flow_size) else: