diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 397146b415..f541701480 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -28,8 +28,8 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>\s* ]*>\s* ]*>\s*
]*>\s*
)', re.IGNORECASE) @@ -379,6 +379,8 @@ class HeuristicProcessor(object): html = re.sub('(?i)?st1:\w+>', '', html) # Re-open self closing paragraph tags html = re.sub('/]*/>', '
', html) + # delete surrounding divs from empty paragraphs + html = re.sub('
]*>\s*
\s*', html) # Get rid of empty span, bold, font, em, & italics tags html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*(font|[ibu]|em|strong)>\s*){0,2}\s*(font|[ibu]|em|strong)>", " ", html) @@ -637,6 +639,7 @@ class HeuristicProcessor(object): blanks_count = len(self.any_multi_blank.findall(html)) if blanks_count >= 1: html = self.merge_blanks(html, blanks_count) + self.dump(html, 'before_after_merge_blanks') scene_break_regex = self.line_open+'(?![\w\'\"])(?P