From ade57f14472e674a4aefd3a4fb831fb98b8db456 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 6 Jul 2017 20:40:04 +0530
Subject: [PATCH] Conversion: Fix heuristics processing incorrectly removing some
 tags. Fixes #1205637 [break line/italic - wrong conversion](https://bugs.launchpad.net/calibre/+bug/1205637)

---
 src/calibre/ebooks/conversion/utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7449a74ba9..35afb492b6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -436,12 +436,12 @@ class HeuristicProcessor(object):
         # Re-open self closing paragraph tags
         html = re.sub('<p[^>/]*/>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(
-            r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(
-            r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
+        fmt_tags = 'font|[ibu]|em|strong'
+        open_fmt_pat, close_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags), '</(?:{})>'.format(fmt_tags)
+        for i in range(2):
+            html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
+            html = re.sub(
+                r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
         # delete surrounding divs from empty paragraphs
         html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
         # Empty heading tags