preserve style changes across line breaks in heuristics, fixes bug 1066507

2025-07-09 03:04:10 -04:00 · 2012-12-08 21:26:14 +08:00 · 2012-12-08 21:26:14 +08:00 · 67deb3adcd
commit 67deb3adcd
parent 92afbd1ab5
1 changed files with 24 additions and 9 deletions
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -335,14 +335,26 @@ class HeuristicProcessor(object):
        This function intentionally leaves hyphenated content alone as that is handled by the
        dehyphenate routine in a separate step
        '''
+        def style_unwrap(match):  # replacement callback handed to the compiled unwrap patterns' .sub() calls
+            style_close = match.group('style_close')  # optional closing </span>, </i>, </u> or </b> captured by line_ending; None when absent
+            style_open = match.group('style_open')  # optional opening <span>, <i>, <u> or <b> tag captured by line_opening; None when absent
+            if style_open and style_close:  # style closed before the break and one opened after it: keep both around the joining space
+                return style_close+' '+style_open
+            elif style_open and not style_close:  # only the next line opens a style: space first, then the opening tag
+                return ' '+style_open
+            elif not style_open and style_close:  # only the previous line closes a style: closing tag first, then the space
+                return style_close+' '
+            else:  # neither tag was captured
+                return ' '  # NOTE(review): the em/en-dash and soft-hyphen substitutions also use this callback, and the removed lines replaced those with '' - confirm the extra space is intended
+

        # define the pieces of the regex
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"
-        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
+        line_ending = "\s*(?P<style_close></(span|[iub])>)?\s*(</(p|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
+        line_opening = "<(p|div)[^>]*>\s*(?P<style_open><(span|[iub])[^>]*>)?\s*"
        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

        unwrap_regex = lookahead+line_ending+blanklines+line_opening
@ -353,14 +365,17 @@ class HeuristicProcessor(object):
            unwrap_regex = lookahead+txt_line_wrap
            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+            content = unwrap.sub(' ', content)
+            content = em_en_unwrap.sub('', content)
+            content = shy_unwrap.sub('', content)
+        else:
+            unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+            em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+            shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+            content = unwrap.sub(style_unwrap, content)
+            content = em_en_unwrap.sub(style_unwrap, content)
+            content = shy_unwrap.sub(style_unwrap, content)

-        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
-        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
-        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
-
-        content = unwrap.sub(' ', content)
-        content = em_en_unwrap.sub('', content)
-        content = shy_unwrap.sub('', content)
        return content

    def txt_process(self, match):