MOBI Input: Map invalid <o:p> tags to <p> tags before parsing, to handle broken nesting. Fixes #905715 (Private bug)

This commit is contained in:
Kovid Goyal 2011-12-20 10:04:49 +05:30
parent ff6dd9c16a
commit ff3ae041b9

View File

@@ -502,6 +502,7 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:') self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html) self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
self.processed_html = re.sub(r'<(/?)o:p', r'<\1p', self.processed_html)
# Swap inline and block level elements, and order block level elements according to priority # Swap inline and block level elements, and order block level elements according to priority
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html) self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)