TXT Input: Heuristic processing enables smarten punctuation.

This commit is contained in:
Kovid Goyal 2011-02-07 16:19:07 -07:00
commit db6c36df7e
2 changed files with 14 additions and 1 deletions

View File

@ -14,7 +14,8 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
normalize_line_endings, convert_textile, remove_indents, block_to_single_line normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
separate_hard_scene_breaks
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
@ -114,6 +115,7 @@ class TXTInput(InputFormatPlugin):
if options.formatting_type == 'heuristic': if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True) setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False) setattr(options, 'unwrap_lines', False)
setattr(options, 'smarten_punctuation', True)
# Reformat paragraphs to block formatting based on the detected type. # Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
@ -121,6 +123,7 @@ class TXTInput(InputFormatPlugin):
if options.paragraph_type == 'single': if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
txt = separate_hard_scene_breaks(txt)
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt) txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted': elif options.paragraph_type == 'unformatted':
@ -132,6 +135,7 @@ class TXTInput(InputFormatPlugin):
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
else: else:
txt = separate_hard_scene_breaks(txt)
txt = block_to_single_line(txt) txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):

View File

@ -120,6 +120,15 @@ def separate_paragraphs_print_formatted(txt):
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt) txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
return txt return txt
def separate_hard_scene_breaks(txt):
    """Surround hard scene-break lines with blank lines.

    A scene-break line is a line made up solely of spaces, tabs and the
    characters ``-``, ``=``, ``~`` and ``/``. Each such line that contains
    at least one non-whitespace character is rewritten as
    ``'\\n' + line + '\\n'``; whitespace-only lines are left unchanged.

    :param txt: The text to process.
    :return: The text with scene-break lines padded by newlines.
    """
    def sep_break(mo):
        line = mo.group()
        # Whitespace-only lines also match the pattern; leave them alone.
        if line.strip():
            return '\n%s\n' % line
        return line

    # The hyphen is placed last in the character class so it is a literal
    # '-'. Written as '\t-=' it formed a range from TAB to '=', which also
    # matched newlines, digits and most ASCII punctuation, so ordinary
    # lines such as '123' were treated as scene breaks.
    # NOTE(review): that accidental range also covered '*' and '#'; add
    # them to the class explicitly if such lines should count as breaks.
    return re.sub(r'(?m)^[ \t=~/-]+$', sep_break, txt)
def block_to_single_line(txt): def block_to_single_line(txt):
txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt) txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
return txt return txt