mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Sync ldolse preprocessing changes.
This commit is contained in:
parent
1990797e45
commit
ed3b2866cf
@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
self.log = log
|
self.log = log
|
||||||
|
length = None
|
||||||
log.debug('Reading text from file...')
|
log.debug('Reading text from file...')
|
||||||
|
|
||||||
txt = stream.read()
|
txt = stream.read()
|
||||||
@ -109,7 +110,7 @@ class TXTInput(InputFormatPlugin):
|
|||||||
# Reformat paragraphs to block formatting based on the detected type.
|
# Reformat paragraphs to block formatting based on the detected type.
|
||||||
# We don't check for block because the processor assumes block.
|
# We don't check for block because the processor assumes block.
|
||||||
# single and print are transformed to block for processing.
|
# single and print are transformed to block for processing.
|
||||||
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
|
if options.paragraph_type == 'single':
|
||||||
txt = separate_paragraphs_single_line(txt)
|
txt = separate_paragraphs_single_line(txt)
|
||||||
elif options.paragraph_type == 'print':
|
elif options.paragraph_type == 'print':
|
||||||
txt = separate_paragraphs_print_formatted(txt)
|
txt = separate_paragraphs_print_formatted(txt)
|
||||||
@ -120,9 +121,11 @@ class TXTInput(InputFormatPlugin):
|
|||||||
length = docanalysis.line_length(.5)
|
length = docanalysis.line_length(.5)
|
||||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||||
|
txt = separate_paragraphs_single_line(txt)
|
||||||
|
|
||||||
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
||||||
docanalysis = DocAnalysis('txt', txt)
|
docanalysis = DocAnalysis('txt', txt)
|
||||||
|
if not length:
|
||||||
length = docanalysis.line_length(.5)
|
length = docanalysis.line_length(.5)
|
||||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||||
txt = dehyphenator(txt,'txt', length)
|
txt = dehyphenator(txt,'txt', length)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user