diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index ac7e640462..f610123b16 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -79,6 +79,11 @@ def strip_style_comments(match): src = src[:lindex] + src[rindex+2:] return src +def tag_regex(tagname): + '''Return non-grouping regular expressions that match the opening and closing tags for tagname''' + return dict(open=r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)'%dict(t=tagname), \ + close=r'</\s*%(t)s\s*>'%dict(t=tagname)) + class HTMLConverter(object): SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}") PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE) @@ -94,8 +99,8 @@ class HTMLConverter(object): (re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE), lambda match: match.group().replace('<!--', '').replace('-->', '')), # remove <p> tags from within <a> tags - (re.compile(r'<a.*?>(.*?)</a\s*>', re.DOTALL|re.IGNORECASE), - lambda match: re.compile(r'<\s*?p.*?>', re.IGNORECASE).sub('', match.group())), + (re.compile(r'%(open)s(.*?)%(close)s'%tag_regex('a'), re.DOTALL|re.IGNORECASE), + lambda match: re.compile(r'%(open)s|%(close)s'%tag_regex('p'), re.IGNORECASE).sub('', match.group())), # Replace common line break patterns with line breaks (re.compile(r'<p>(&nbsp;|\s)*</p>', re.IGNORECASE), lambda m: '<br />'),