mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Conversion pipeline: Disable HTML 5 parsing if it results in deeply nested trees. Fixes #908818 (RuntimeError: maximum recursion depth exceeded in cmp)
This commit is contained in:
parent
003f4aa92c
commit
c7f90b9fc7
@ -502,7 +502,7 @@ class MobiReader(object):
|
||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
||||
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
||||
self.processed_html = re.sub(r'<(/?)o:p', r'<\1p', self.processed_html)
|
||||
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
|
||||
# Swap inline and block level elements, and order block level elements according to priority
|
||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
||||
|
@ -70,9 +70,27 @@ def clone_element(elem, nsmap={}, in_context=True):
|
||||
nelem.extend(elem)
|
||||
return nelem
|
||||
|
||||
def html5_parse(data):
|
||||
def node_depth(node):
    """Return the number of ancestors of ``node``.

    The count is obtained by following ``getparent()`` links upward until
    one of them returns ``None``; a node without a parent therefore has
    depth 0.

    ``node`` may be any object exposing ``getparent()``.
    """
    depth = 0
    ancestor = node.getparent()
    # Iterative walk (no recursion), one step per ancestor.
    while ancestor is not None:
        depth += 1
        ancestor = ancestor.getparent()
    return depth
|
||||
|
||||
def html5_parse(data, max_nesting_depth=500):
|
||||
import html5lib
|
||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||
|
||||
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
||||
# insane nesting depths
|
||||
for x in data.iterdescendants():
|
||||
if len(x) == 0:
|
||||
# Leaf node
|
||||
depth = node_depth(x)
|
||||
if depth > max_nesting_depth:
|
||||
raise ValueError('html5lib resulted in a tree with nesting'
|
||||
' depth > %d'%max_nesting_depth)
|
||||
# Set lang correctly
|
||||
xl = data.attrib.pop('xmlU0003Alang', None)
|
||||
if xl is not None and 'lang' not in data.attrib:
|
||||
|
Loading…
x
Reference in New Issue
Block a user