mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
RTF Input: Fix regression that broke the Preprocess HTML option
This commit is contained in:
parent
ef15ee03a3
commit
03f70c156c
@ -191,15 +191,15 @@ class PreProcessor(object):
|
|||||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||||
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
|
line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
|
||||||
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
|
||||||
|
|
||||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||||
if format == 'txt':
|
if format == 'txt':
|
||||||
unwrap_regex = lookahead+txt_line_wrap
|
unwrap_regex = lookahead+txt_line_wrap
|
||||||
|
|
||||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||||
content = unwrap.sub(' ', content)
|
content = unwrap.sub(' ', content)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
self.log("********* Preprocessing HTML *********")
|
self.log("********* Preprocessing HTML *********")
|
||||||
|
@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
||||||
if self.opts.preprocess_html:
|
if self.opts.preprocess_html:
|
||||||
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
|
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
|
||||||
res = preprocessor(res)
|
res = preprocessor(res.decode('utf-8')).encode('utf-8')
|
||||||
f.write(res)
|
f.write(res)
|
||||||
self.write_inline_css(inline_class, border_styles)
|
self.write_inline_css(inline_class, border_styles)
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
|
@ -53,7 +53,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
|
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
# Get the encoding of the document.
|
# Get the encoding of the document.
|
||||||
if options.input_encoding:
|
if options.input_encoding:
|
||||||
@ -80,7 +80,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
# Get length for hyphen removal and punctuation unwrap
|
# Get length for hyphen removal and punctuation unwrap
|
||||||
docanalysis = DocAnalysis('txt', txt)
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
length = docanalysis.line_length(.5)
|
length = docanalysis.line_length(.5)
|
||||||
|
|
||||||
if options.formatting_type == 'auto':
|
if options.formatting_type == 'auto':
|
||||||
options.formatting_type = detect_formatting_type(txt)
|
options.formatting_type = detect_formatting_type(txt)
|
||||||
|
|
||||||
@ -122,7 +122,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
|
||||||
flow_size = getattr(options, 'flow_size', 0)
|
flow_size = getattr(options, 'flow_size', 0)
|
||||||
|
|
||||||
if options.formatting_type == 'heuristic':
|
if options.formatting_type == 'heuristic':
|
||||||
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user