# Patch summary (commentary only; everything from the first "diff --git"
# header onwards is the original patch text, unchanged).
#
# File 1: src/calibre/ebooks/mobi/reader.py (hunk inside class MobiReader)
#   - Removed: re.sub(r'<(/?)o:p', r'<\1p', ...), which rewrote the tag
#     prefix "<o:p" / "</o:p" into "<p" / "</p".
#   - Added: re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', ...), which matches a
#     whole opening or closing o:p tag (optional whitespace after "<" and
#     around "/", anything up to the next ">") and replaces it with the
#     empty string, i.e. the tag is dropped instead of being turned into p.
#
# File 2: src/calibre/ebooks/oeb/parse_utils.py
#   - New helper node_depth(node): starts at 0 and follows getparent()
#     until it returns None, adding 1 per ancestor; returns that count.
#   - html5_parse gains a keyword parameter max_nesting_depth (default 500).
#     After html5lib.parse(data, treebuilder='lxml').getroot(), it walks
#     data.iterdescendants(); for every element with len(x) == 0 (treated
#     as a leaf) it computes node_depth(x) and raises ValueError when the
#     depth is greater than max_nesting_depth.
#   - Existing callers that pass only `data` keep working, because the new
#     parameter has a default.
#
# NOTE(review): node_depth() walks up to the root once per leaf, so the
#   check does work proportional to (number of leaves x their depth).
#   Probably fine for typical documents; confirm for very large inputs.
# NOTE(review): the patch below sits on a single physical line, so the hunk
#   headers (@@ -502,7 +502,7 @@ and @@ -70,9 +70,27 @@) cannot be matched
#   to individual lines as-is; the line breaks and per-line indentation
#   must be restored from the upstream commit before it can be applied.
# NOTE(review): some context text looks as if angle-bracketed spans were
#   stripped during extraction: "(?P(" and "(?P]*>)" have no group names,
#   '\g'+'\g' has no group references, and
#   "replace(']*>', '', self.processed_html)" appears to be two statements
#   fused together. Presumably an extraction artifact, not the real source;
#   verify against the upstream file.
# NOTE(review): the text ends at "if xl is not None and 'lang' not in
#   data.attrib:" with no body visible; the hunk's final context line(s)
#   are not shown here.
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 8637cfb9ae..e58b492cef 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -502,7 +502,7 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace(']*>', '', self.processed_html) - self.processed_html = re.sub(r'<(/?)o:p', r'<\1p', self.processed_html) + self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html) # Swap inline and block level elements, and order block level elements according to priority # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec self.processed_html = re.sub(r'(?i)(?P(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P]*>)', '\g'+'\g', self.processed_html) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index f8456914b9..e02a4d0e61 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -70,9 +70,27 @@ def clone_element(elem, nsmap={}, in_context=True): nelem.extend(elem) return nelem -def html5_parse(data): +def node_depth(node): + ans = 0 + p = node.getparent() + while p is not None: + ans += 1 + p = p.getparent() + return ans + +def html5_parse(data, max_nesting_depth=500): import html5lib data = html5lib.parse(data, treebuilder='lxml').getroot() + + # Check that the asinine HTML 5 algorithm did not result in a tree with + # insane nesting depths + for x in data.iterdescendants(): + if len(x) == 0: + # Leaf node + depth = node_depth(x) + if depth > max_nesting_depth: + raise ValueError('html5lib resulted in a tree with nesting' + ' depth > %d'%max_nesting_depth) # Set lang correctly xl = data.attrib.pop('xmlU0003Alang', None) if xl is not None and 'lang' not in data.attrib: