diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 24528d1fb8..6dc3973213 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -316,9 +316,17 @@ class HeuristicProcessor(object): ''' Unwraps lines based on line length and punctuation supports a range of html markup and text files + + the lookahead regex below is meant to look for any non-full stop characters - punctuation + characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc + the reason for this is to prevent false positive wrapping. False positives are more + difficult to detect than false negatives during a manual review of the doc + + This function intentionally leaves hyphenated content alone as that is handled by the + dehyphenate routine in a future step ''' - # define the pieces of the regex + # define the pieces of the regex lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?