diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f728bec52b..5fceeb7aed 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -201,7 +201,7 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) else: lookupword = dehyphenated - if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None: + if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) if self.verbose > 2: self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index 5b99b19e74..6ec1edb65c 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -71,21 +71,41 @@ class TXTInput(InputFormatPlugin): txt = txt.decode(ienc, 'replace') txt = _ent_pat.sub(xml_entity_to_unicode, txt) + + # Normalize line endings + txt = normalize_line_endings(txt) + + if options.formatting_type == 'auto': + options.formatting_type = detect_formatting_type(txt) + + if options.formatting_type == 'heuristic': + setattr(options, 'enable_heuristics', True) + setattr(options, 'markup_chapter_headings', True) + setattr(options, 'italicize_common_cases', True) + setattr(options, 'fix_indents', True) + setattr(options, 'preserve_spaces', True) + setattr(options, 'delete_blank_paragraphs', True) + setattr(options, 'format_scene_breaks', True) + setattr(options, 'dehyphenate', True) + + # Determine the paragraph type of the document. 
+ if options.paragraph_type == 'auto': + options.paragraph_type = detect_paragraph_type(txt) + if options.paragraph_type == 'unknown': + log.debug('Could not reliably determine paragraph type using block') + options.paragraph_type = 'block' + else: + log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + # Preserve spaces will replace multiple spaces to a space # followed by the entity. if options.preserve_spaces: txt = preserve_spaces(txt) - # Normalize line endings - txt = normalize_line_endings(txt) - # Get length for hyphen removal and punctuation unwrap docanalysis = DocAnalysis('txt', txt) length = docanalysis.line_length(.5) - if options.formatting_type == 'auto': - options.formatting_type = detect_formatting_type(txt) - if options.formatting_type == 'markdown': log.debug('Running text though markdown conversion...') try: @@ -96,16 +116,8 @@ class TXTInput(InputFormatPlugin): elif options.formatting_type == 'textile': log.debug('Running text though textile conversion...') html = convert_textile(txt) - else: - # Determine the paragraph type of the document. 
- if options.paragraph_type == 'auto': - options.paragraph_type = detect_paragraph_type(txt) - if options.paragraph_type == 'unknown': - log.debug('Could not reliably determine paragraph type using block') - options.paragraph_type = 'block' - else: - log.debug('Auto detected paragraph type as %s' % options.paragraph_type) + else: # Dehyphenate dehyphenator = Dehyphenator(options.verbose, log=self.log) txt = dehyphenator(txt,'txt', length) @@ -129,15 +141,6 @@ class TXTInput(InputFormatPlugin): flow_size = getattr(options, 'flow_size', 0) html = convert_basic(txt, epub_split_size_kb=flow_size) - if options.formatting_type == 'heuristic': - setattr(options, 'enable_heuristics', True) - setattr(options, 'markup_chapter_headings', True) - setattr(options, 'italicize_common_cases', True) - setattr(options, 'fix_indents', True) - setattr(options, 'delete_blank_paragraphs', True) - setattr(options, 'format_scene_breaks', True) - setattr(options, 'dehyphenate', True) - from calibre.customize.ui import plugin_for_input_format html_input = plugin_for_input_format('html') for opt in html_input.options: diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index de27a5f5bb..6ec986f26a 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -260,11 +260,11 @@ The Output profile also controls the screen size. This will cause, for example, Heuristic Processing --------------------- -Heuristic Processing provides a variety of functions which can be used that try to detect and correct +Heuristic Processing provides a variety of functions which can be used to try and detect and correct common problems in poorly formatted input documents. Use these functions if your input document suffers -from bad formatting. Because these functions rely on common patterns, be aware that in some cases an +from poor formatting. 
Because these functions rely on common patterns, be aware that in some cases an option may lead to worse results, so use with care. As an example, several of these options will -remove all non-breaking-space entities. +remove all non-breaking-space entities, or may produce false-positive matches related to their function. :guilabel:`Enable heuristic processing` This option activates |app|'s Heuristic Processing stage of the conversion pipeline. @@ -283,7 +283,7 @@ remove all non-breaking-space entities. correction, then this value should be reduced to somewhere between 0.1 and 0.2. :guilabel:`Detect and markup unformatted chapter headings and sub headings` - If your document does not have Chapter Markers and titles formatted differently from the rest of the text, + If your document does not have chapter headings and titles formatted differently from the rest of the text, |app| can use this option to attempt detection them and surround them with heading tags.