Sync ldolse preprocessing changes.

This commit is contained in:
John Schember 2011-02-06 08:35:07 -05:00
parent 1990797e45
commit ed3b2866cf

View File

@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
self.log = log self.log = log
length = None
log.debug('Reading text from file...') log.debug('Reading text from file...')
txt = stream.read() txt = stream.read()
@ -109,7 +110,7 @@ class TXTInput(InputFormatPlugin):
# Reformat paragraphs to block formatting based on the detected type. # Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
# single and print are transformed to block for processing. # single and print are transformed to block for processing.
if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted': if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt) txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print': elif options.paragraph_type == 'print':
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
@ -120,10 +121,12 @@ class TXTInput(InputFormatPlugin):
length = docanalysis.line_length(.5) length = docanalysis.line_length(.5)
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None)) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False): if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt) docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5) if not length:
length = docanalysis.line_length(.5)
dehyphenator = Dehyphenator(options.verbose, log=self.log) dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length) txt = dehyphenator(txt,'txt', length)