diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index ddeba79365..85659ff413 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -335,14 +335,26 @@ class HeuristicProcessor(object): This function intentionally leaves hyphenated content alone as that is handled by the dehyphenate routine in a separate step ''' + def style_unwrap(match): + style_close = match.group('style_close') + style_open = match.group('style_open') + if style_open and style_close: + return style_close+' '+style_open + elif style_open and not style_close: + return ' '+style_open + elif not style_open and style_close: + return style_close+' ' + else: + return ' ' + # define the pieces of the regex lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?\s*()?" + line_ending = "\s*(?P)?\s*()?" blanklines = "\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*" - line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*" + line_opening = "<(p|div)[^>]*>\s*(?P<(span|[iub])[^>]*>)?\s*" txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}" unwrap_regex = lookahead+line_ending+blanklines+line_opening @@ -353,14 +365,17 @@ class HeuristicProcessor(object): unwrap_regex = lookahead+txt_line_wrap em_en_unwrap_regex = em_en_lookahead+txt_line_wrap shy_unwrap_regex = soft_hyphen+txt_line_wrap + content = unwrap.sub(' ', content) + content = em_en_unwrap.sub('', content) + content = shy_unwrap.sub('', content) + else: + unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) + em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) + shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) + content = unwrap.sub(style_unwrap, content) + content = em_en_unwrap.sub(style_unwrap, content) + content = shy_unwrap.sub(style_unwrap, content) - unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) - em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) - shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) - - content = unwrap.sub(' ', content) - content = em_en_unwrap.sub('', content) - content = shy_unwrap.sub('', content) return content def txt_process(self, match):