Conversion pipeline: Disable HTML 5 parsing if it results in deeply nested trees. Fixes #908818 (RuntimeError: maximum recursion depth exceeded in cmp)

This commit is contained in:
Kovid Goyal 2011-12-26 23:47:05 +05:30
parent 003f4aa92c
commit c7f90b9fc7
2 changed files with 20 additions and 2 deletions

View File

@ -502,7 +502,7 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
self.processed_html = re.sub(r'<(/?)o:p', r'<\1p', self.processed_html)
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
# Swap inline and block level elements, and order block level elements according to priority
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)

View File

@ -70,9 +70,27 @@ def clone_element(elem, nsmap={}, in_context=True):
nelem.extend(elem)
return nelem
def html5_parse(data):
def node_depth(node):
    """Return how many ancestors ``node`` has.

    The count is obtained by repeatedly calling ``getparent()`` starting
    from ``node`` until it returns ``None``; a node whose ``getparent()``
    is ``None`` therefore has depth 0.
    """
    depth = 0
    ancestor = node.getparent()
    while True:
        if ancestor is None:
            # Walked past the topmost ancestor: every step so far was one
            # level of nesting.
            return depth
        ancestor = ancestor.getparent()
        depth += 1
def html5_parse(data, max_nesting_depth=500):
import html5lib
data = html5lib.parse(data, treebuilder='lxml').getroot()
# Check that the asinine HTML 5 algorithm did not result in a tree with
# insane nesting depths
for x in data.iterdescendants():
if len(x) == 0:
# Leaf node
depth = node_depth(x)
if depth > max_nesting_depth:
raise ValueError('html5lib resulted in a tree with nesting'
' depth > %d'%max_nesting_depth)
# Set lang correctly
xl = data.attrib.pop('xmlU0003Alang', None)
if xl is not None and 'lang' not in data.attrib: