diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index edd4d54cba..63eca10714 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -28,8 +28,8 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*]*>\s*

){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*]*>\s*

(\s*]*>\s*\s*)*){2,}(?!\s*]*>\s*

(\s*]*>\s*\s*)*){2,}', re.IGNORECASE) self.line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" self.line_close = "()?\s*()?\s*()?\s*" self.single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) @@ -384,6 +384,8 @@ class HeuristicProcessor(object): html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html) html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*\s*){0,2}\s*", " ", html) + # delete surrounding divs from empty paragraphs + html = re.sub(']*>\s*]*>\s*

\s*', '

', html) # Empty heading tags html = re.sub(r'(?i)\s*', '', html) self.deleted_nbsps = True @@ -561,7 +563,6 @@ class HeuristicProcessor(object): # Determine whether the document uses interleaved blank lines self.blanks_between_paragraphs = self.analyze_blanks(html) - #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic if getattr(self.extra_opts, 'markup_chapter_headings', False):