RTF Input: Fix regression that broke the Preprocess HTML option

2025-07-09 03:04:10 -04:00 · 2011-01-10 10:44:09 -07:00 · 2011-01-10 10:44:09 -07:00 · 03f70c156c
commit 03f70c156c
parent ef15ee03a3
3 changed files with 7 additions and 7 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -191,15 +191,15 @@ class PreProcessor(object):
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
-        
+
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
-        
+
        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        content = unwrap.sub(' ', content)
        return content
-       
+
    def __call__(self, html):
        self.log("*********  Preprocessing HTML  *********")
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
                        u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            if self.opts.preprocess_html:
                preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-                res = preprocessor(res)
+                res = preprocessor(res.decode('utf-8')).encode('utf-8')
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -53,7 +53,7 @@ class TXTInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log,
                accelerators):
        log.debug('Reading text from file...')
-        
+
        txt = stream.read()
        # Get the encoding of the document.
        if options.input_encoding:
@@ -80,7 +80,7 @@ class TXTInput(InputFormatPlugin):
        # Get length for hyphen removal and punctuation unwrap
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
-            
+
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
@@ -122,7 +122,7 @@ class TXTInput(InputFormatPlugin):
                txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            flow_size = getattr(options, 'flow_size', 0)
-            
+
            if options.formatting_type == 'heuristic':
                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
            else: