mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Heuristic processing enables smarten punctuation.
This commit is contained in:
commit
db6c36df7e
@ -14,7 +14,8 @@ from calibre.ebooks.chardet import detect
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
|
||||||
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
|
||||||
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
preserve_spaces, detect_paragraph_type, detect_formatting_type, \
|
||||||
normalize_line_endings, convert_textile, remove_indents, block_to_single_line
|
normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
|
||||||
|
separate_hard_scene_breaks
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
|
|
||||||
@ -114,6 +115,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
if options.formatting_type == 'heuristic':
|
if options.formatting_type == 'heuristic':
|
||||||
setattr(options, 'enable_heuristics', True)
|
setattr(options, 'enable_heuristics', True)
|
||||||
setattr(options, 'unwrap_lines', False)
|
setattr(options, 'unwrap_lines', False)
|
||||||
|
setattr(options, 'smarten_punctuation', True)
|
||||||
|
|
||||||
# Reformat paragraphs to block formatting based on the detected type.
|
# Reformat paragraphs to block formatting based on the detected type.
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
@ -121,6 +123,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
if options.paragraph_type == 'single':
|
if options.paragraph_type == 'single':
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
elif options.paragraph_type == 'print':
|
elif options.paragraph_type == 'print':
|
||||||
|
txt = separate_hard_scene_breaks(txt)
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
txt = block_to_single_line(txt)
|
txt = block_to_single_line(txt)
|
||||||
elif options.paragraph_type == 'unformatted':
|
elif options.paragraph_type == 'unformatted':
|
||||||
@ -132,6 +135,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
else:
|
else:
|
||||||
|
txt = separate_hard_scene_breaks(txt)
|
||||||
txt = block_to_single_line(txt)
|
txt = block_to_single_line(txt)
|
||||||
|
|
||||||
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
||||||
|
@ -120,6 +120,15 @@ def separate_paragraphs_print_formatted(txt):
|
|||||||
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
|
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
def separate_hard_scene_breaks(txt):
    """Put hard scene-break lines into blocks of their own.

    A hard scene break is a line made up solely of separator characters:
    spaces, tabs, ``-``, ``=``, ``~``, ``/`` or ``*`` (for example
    ``-----`` or ``* * *``). Every such line that contains at least one
    non-whitespace character is wrapped in a leading and a trailing
    newline so that it is no longer adjacent to the surrounding text
    lines. Whitespace-only lines are returned untouched.

    :param txt: The text to process.
    :return: The text with scene-break lines surrounded by blank lines.
    """
    def sep_break(line):
        # A line of only spaces/tabs is an ordinary blank line, not a break.
        if line.strip():
            return '\n%s\n' % line
        return line

    # The hyphen must be escaped: written as `\t-=` it forms a range from
    # TAB to '=', which also covers digits, most ASCII punctuation and the
    # newline itself, so lines such as "1984" or "(2)" were mistaken for
    # scene breaks. '*' is listed explicitly because asterisk rules were
    # matched through that accidental range and are a common break marker.
    return re.sub(r'(?miu)^[ \t\-=~/*]+$',
                  lambda mo: sep_break(mo.group()), txt)
|
||||||
|
|
||||||
def block_to_single_line(txt):
    """Join the lines of each block into a single line.

    A newline that has a non-newline character directly before and directly
    after it is replaced by a single space. Newlines next to another
    newline (i.e. blank-line separators) are kept.

    :param txt: The text to process.
    :return: The text with intra-block line breaks replaced by spaces.
    """
    return re.sub(r'(?<=.)\n(?=.)', ' ', txt)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user