TXT Input: convet_basic changed at some point to require single line paragraphs. Add function to turn block formatted paragraphs to single so they are processed correctly.

This commit is contained in:
John Schember 2011-02-06 08:45:50 -05:00
parent 92ee46cdb9
commit 2d4fc57ddc
2 changed files with 9 additions and 2 deletions

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin):
@ -106,6 +106,7 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation
@ -114,6 +115,8 @@ class TXTInput(InputFormatPlugin):
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
else:
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt)

View File

@ -99,7 +99,11 @@ def separate_paragraphs_single_line(txt):
return txt
def separate_paragraphs_print_formatted(txt):
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '%s\n\t' % mo.group('indent'), txt)
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt
def block_to_single_line(txt):
txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
return txt
def preserve_spaces(txt):