Conversion: Fix heuristics processing incorrectly removing some <br> tags. Fixes #1205637 [break line/italic - wrong conversion](https://bugs.launchpad.net/calibre/+bug/1205637)

This commit is contained in:
Kovid Goyal 2017-07-06 20:40:04 +05:30
parent 6ae397126b
commit ade57f1447
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -436,12 +436,12 @@ class HeuristicProcessor(object):
# Re-open self closing paragraph tags
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
# Get rid of empty span, bold, font, em, & italics tags
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(
r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(
r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
fmt_tags = 'font|[ibu]|em|strong'
open_fmt_pat, close_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags), '</(?:{})>'.format(fmt_tags)
for i in range(2):
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(
r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
# delete surrounding divs from empty paragraphs
html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
# Empty heading tags