RTF Input: Fix regression that broke the Preprocess HTML option

This commit is contained in:
Kovid Goyal 2011-01-10 10:44:09 -07:00
parent ef15ee03a3
commit 03f70c156c
3 changed files with 7 additions and 7 deletions

View File

@ -191,15 +191,15 @@ class PreProcessor(object):
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*" blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
unwrap_regex = lookahead+line_ending+blanklines+line_opening unwrap_regex = lookahead+line_ending+blanklines+line_opening
if format == 'txt': if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap unwrap_regex = lookahead+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content) content = unwrap.sub(' ', content)
return content return content
def __call__(self, html): def __call__(self, html):
self.log("********* Preprocessing HTML *********") self.log("********* Preprocessing HTML *********")

View File

@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
u'<p>\u00a0</p>\n'.encode('utf-8'), res) u'<p>\u00a0</p>\n'.encode('utf-8'), res)
if self.opts.preprocess_html: if self.opts.preprocess_html:
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
res = preprocessor(res) res = preprocessor(res.decode('utf-8')).encode('utf-8')
f.write(res) f.write(res)
self.write_inline_css(inline_class, border_styles) self.write_inline_css(inline_class, border_styles)
stream.seek(0) stream.seek(0)

View File

@ -53,7 +53,7 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read() txt = stream.read()
# Get the encoding of the document. # Get the encoding of the document.
if options.input_encoding: if options.input_encoding:
@ -80,7 +80,7 @@ class TXTInput(InputFormatPlugin):
# Get length for hyphen removal and punctuation unwrap # Get length for hyphen removal and punctuation unwrap
docanalysis = DocAnalysis('txt', txt) docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5) length = docanalysis.line_length(.5)
if options.formatting_type == 'auto': if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt) options.formatting_type = detect_formatting_type(txt)
@ -122,7 +122,7 @@ class TXTInput(InputFormatPlugin):
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
if options.formatting_type == 'heuristic': if options.formatting_type == 'heuristic':
html = convert_heuristic(txt, epub_split_size_kb=flow_size) html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else: else: