mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Conversion pipeline: Disable HTML 5 parsing if it results in deeply nested trees. Fixes #908818 (RuntimeError: maximum recursion depth exceeded in cmp)
This commit is contained in:
parent
003f4aa92c
commit
c7f90b9fc7
@ -502,7 +502,7 @@ class MobiReader(object):
|
|||||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||||
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
||||||
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
||||||
self.processed_html = re.sub(r'<(/?)o:p', r'<\1p', self.processed_html)
|
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
|
||||||
# Swap inline and block level elements, and order block level elements according to priority
|
# Swap inline and block level elements, and order block level elements according to priority
|
||||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||||
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', '\g<para>'+'\g<styletags>', self.processed_html)
|
||||||
|
@ -70,9 +70,27 @@ def clone_element(elem, nsmap={}, in_context=True):
|
|||||||
nelem.extend(elem)
|
nelem.extend(elem)
|
||||||
return nelem
|
return nelem
|
||||||
|
|
||||||
def html5_parse(data):
|
def node_depth(node):
|
||||||
|
ans = 0
|
||||||
|
p = node.getparent()
|
||||||
|
while p is not None:
|
||||||
|
ans += 1
|
||||||
|
p = p.getparent()
|
||||||
|
return ans
|
||||||
|
|
||||||
|
def html5_parse(data, max_nesting_depth=500):
|
||||||
import html5lib
|
import html5lib
|
||||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||||
|
|
||||||
|
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
||||||
|
# insane nesting depths
|
||||||
|
for x in data.iterdescendants():
|
||||||
|
if len(x) == 0:
|
||||||
|
# Leaf node
|
||||||
|
depth = node_depth(x)
|
||||||
|
if depth > max_nesting_depth:
|
||||||
|
raise ValueError('html5lib resulted in a tree with nesting'
|
||||||
|
' depth > %d'%max_nesting_depth)
|
||||||
# Set lang correctly
|
# Set lang correctly
|
||||||
xl = data.attrib.pop('xmlU0003Alang', None)
|
xl = data.attrib.pop('xmlU0003Alang', None)
|
||||||
if xl is not None and 'lang' not in data.attrib:
|
if xl is not None and 'lang' not in data.attrib:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user