mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TXT Input: Restructure to run dehyphenator when auto and heuristic formatting options are used. This causes textile and markdown to be dehyphenated.
This commit is contained in:
parent
37dd8f6f3a
commit
9cdad92468
@ -77,20 +77,6 @@ class TXTInput(InputFormatPlugin):
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'markup_chapter_headings', True)
|
||||
setattr(options, 'italicize_common_cases', True)
|
||||
setattr(options, 'fix_indents', True)
|
||||
setattr(options, 'delete_blank_paragraphs', True)
|
||||
setattr(options, 'format_scene_breaks', True)
|
||||
setattr(options, 'dehyphenate', True)
|
||||
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
@ -100,15 +86,26 @@ class TXTInput(InputFormatPlugin):
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
dehyphenate = False
|
||||
if options.formatting_type in ('auto', 'heuristic'):
|
||||
# Set this here because we want it to run over all
|
||||
# formatting types if auto is used.
|
||||
dehyphenate = True
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'unwrap_lines', False)
|
||||
|
||||
# Preserve spaces will replace multiple spaces with a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Get length for hyphen removal and punctuation unwrap
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
|
||||
# Reformat paragraphs to block formatting based on the detected type.
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print are transformed to block for processing.
|
||||
@ -119,9 +116,17 @@ class TXTInput(InputFormatPlugin):
|
||||
elif options.paragraph_type == 'unformatted':
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
# unwrap lines based on punctuation
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||
|
||||
if dehyphenate:
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# Process the text using the appropriate text processor.
|
||||
html = ''
|
||||
if options.formatting_type == 'markdown':
|
||||
@ -134,14 +139,8 @@ class TXTInput(InputFormatPlugin):
|
||||
elif options.formatting_type == 'textile':
|
||||
log.debug('Running text through textile conversion...')
|
||||
html = convert_textile(txt)
|
||||
|
||||
else:
|
||||
log.debug('Running text through basic conversion...')
|
||||
if options.formatting_type == 'heuristic':
|
||||
# Dehyphenate
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user