diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index dc624519bb..e240205222 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents + normalize_line_endings, convert_textile, remove_indents, block_to_single_line from calibre import _ent_pat, xml_entity_to_unicode class TXTInput(InputFormatPlugin): @@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin): setattr(options, 'enable_heuristics', True) setattr(options, 'unwrap_lines', False) - if options.txt_in_remove_indents: - txt = remove_indents(txt) - - # Preserve spaces will replace multiple spaces to a space - # followed by the entity. - if options.preserve_spaces: - txt = preserve_spaces(txt) - # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. @@ -114,6 +106,7 @@ class TXTInput(InputFormatPlugin): txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) + txt = block_to_single_line(txt) elif options.paragraph_type == 'unformatted': from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation @@ -122,6 +115,8 @@ class TXTInput(InputFormatPlugin): preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = separate_paragraphs_single_line(txt) + else: + txt = block_to_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) @@ -130,6 +125,15 @@ class TXTInput(InputFormatPlugin): dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) + # User requested transformation on the text. + if options.txt_in_remove_indents: + txt = remove_indents(txt) + + # Preserve spaces will replace multiple spaces to a space + # followed by the entity. + if options.preserve_spaces: + txt = preserve_spaces(txt) + # Process the text using the appropriate text processor. html = '' if options.formatting_type == 'markdown': diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index 987d7cdc73..f7b6cce234 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars HTML_TEMPLATE = u'
tags. It condense and retains blank lines when necessary.
+
+ Requires paragraphs to be in single line format.
+ '''
txt = clean_txt(txt)
txt = split_txt(txt, epub_split_size_kb)
@@ -99,15 +118,25 @@ def separate_paragraphs_single_line(txt):
return txt
def separate_paragraphs_print_formatted(txt):
- txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
+ txt = re.sub(u'(?miu)^(?P