From 30212404de06023636ae0594a055302c6da2553c Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 6 Feb 2011 14:15:39 +0800 Subject: [PATCH 1/3] fixed handling of 'unformatted' text input --- src/calibre/ebooks/txt/input.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 75bafc7cef..674277fc41 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -58,6 +58,7 @@ class TXTInput(InputFormatPlugin): accelerators): self.log = log log.debug('Reading text from file...') + length = 0 txt = stream.read() @@ -109,11 +110,12 @@ class TXTInput(InputFormatPlugin): # Reformat paragraphs to block formatting based on the detected type. # We don't check for block because the processor assumes block. # single and print at transformed to block for processing. - if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': + if options.paragraph_type == 'single': txt = separate_paragraphs_single_line(txt) elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) elif options.paragraph_type == 'unformatted': + print "unwrapping lines using heuristics" from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation docanalysis = DocAnalysis('txt', txt) @@ -123,7 +125,8 @@ class TXTInput(InputFormatPlugin): if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt) - length = docanalysis.line_length(.5) + if not length: + length = docanalysis.line_length(.5) dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) From 9088903f4c4b535372b0c4f4cfd80c2170be292d Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 6 Feb 2011 19:28:15 +0800 Subject: [PATCH 2/3] ... --- src/calibre/ebooks/txt/input.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 674277fc41..ae5a216435 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -115,7 +115,6 @@ class TXTInput(InputFormatPlugin): elif options.paragraph_type == 'print': txt = separate_paragraphs_print_formatted(txt) elif options.paragraph_type == 'unformatted': - print "unwrapping lines using heuristics" from calibre.ebooks.conversion.utils import HeuristicProcessor # unwrap lines based on punctuation docanalysis = DocAnalysis('txt', txt) From 9b1ae4ba9790bfc4cc02c111ee3e83042dd79522 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sun, 6 Feb 2011 19:55:35 +0800 Subject: [PATCH 3/3] ... --- src/calibre/ebooks/txt/input.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index ae5a216435..dc624519bb 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -121,6 +121,7 @@ class TXTInput(InputFormatPlugin): length = docanalysis.line_length(.5) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) txt = preprocessor.punctuation_unwrap(length, txt, 'txt') + txt = separate_paragraphs_single_line(txt) if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): docanalysis = DocAnalysis('txt', txt)