expanded comments

2025-08-30 23:00:21 -04:00 · 2012-04-21 00:52:13 +08:00 · 2012-04-21 00:52:13 +08:00 · 740c812de2
commit 740c812de2
parent 9217e6bed3
1 changed files with 9 additions and 1 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -316,9 +316,17 @@ class HeuristicProcessor(object):
        '''
        Unwraps lines based on line length and punctuation
        supports a range of html markup and text files
+        
+        the lookahead regex below is meant to look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        the reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc
+        
+        This function intentionally leaves hyphenated content alone as that is handled by the 
+        dehyphenate routine in a future step
        '''
-        # define the pieces of the regex

+        # define the pieces of the regex
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"