Test framework for tag soup parsing

Also fix a couple of minor bugs in the current HTML5 parser (self-closed <style/>, <script/>, <textarea/> tags and re-mapped namespace prefixes).
2025-08-30 23:00:21 -04:00 · 2013-10-23 10:37:14 +05:30 · 2013-10-23 10:37:14 +05:30 · b4bf871077
commit b4bf871077
parent 18d5d9a3a4
3 changed files with 88 additions and 2 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -88,7 +88,7 @@ self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b
 'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
 'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
 'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
-'video'}
+'video', 'title', 'script', 'style'}

 _self_closing_pat = re.compile(
    r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(self_closing_bad_tags)),
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -83,7 +83,7 @@ def node_depth(node):
 def html5_parse(data, max_nesting_depth=100):
    import html5lib
    # html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
-    data = re.sub(r'<\s*title\s*[^>]*/\s*>', '<title></title>', data)
+    data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)

    data = html5lib.parse(data, treebuilder='lxml').getroot()

@ -116,6 +116,7 @@ def html5_parse(data, max_nesting_depth=100):
                    prefix = x[11:]
                    namespaces[prefix] = val

+        remapped_namespaces = {}
        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
@ -127,6 +128,7 @@ def html5_parse(data, max_nesting_depth=100):
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)
+                remapped_namespaces = {ns:namespaces[ns] for ns in set(namespaces) - set(elem.nsmap)}

        b = barename(elem.tag)
        idx = b.find('U0003A')
@ -135,6 +137,8 @@ def html5_parse(data, max_nesting_depth=100):
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
+            if ns is None:
+                ns = remapped_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s'%(ns, tag)

@ -145,6 +149,8 @@ def html5_parse(data, max_nesting_depth=100):
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
+                if ns is None:
+                    ns = remapped_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)

--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+from lxml import etree
+
+from calibre.ebooks.oeb.polish.tests.base import BaseTest
+from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
+from calibre.ebooks.oeb.parse_utils import html5_parse
+
+def nonvoid_cdata_elements(test, parse_function):
+    ''' If self closed version of non-void cdata elements like <title/> are
+    present, the HTML5 parsing algorithm treats all following data as CDATA '''
+    markup = '''
+    <html> <head><{0}/></head> <body id="test"> </html>
+    '''
+    for tag in ('title', 'style', 'script', 'textarea'):
+        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
+            root = parse_function(markup.format(x))
+            test.assertEqual(
+                len(XPath('//h:body[@id="test"]')(root)), 1,
+                'Incorrect parsing for <%s/>, parsed markup:\n' % x + etree.tostring(root))
+
+def namespaces(test, parse_function):
+    ae = test.assertEqual
+    markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
+    root = parse_function(markup)
+    ae(
+        len(XPath('//h:body[@id="test"]')(root)), 1,
+        'Incorrect parsing, parsed markup:\n' + etree.tostring(root))
+
+    markup = '''
+    <html xmlns="{xhtml}"><head><body id="test">
+    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
+    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
+    root = parse_function(markup)
+    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '''
+    <html xmlns="{xhtml}"><head><body id="test">
+    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
+    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
+    root = parse_function(markup)
+    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
+    root = parse_function(markup)
+    err = 'Namespaces not created, parsed markup:\n' + etree.tostring(root)
+    ae(len(XPath('//svg:svg')(root)), 1, err)
+    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)
+
+    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
+    root = parse_function(markup)
+    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)
+    def xpath(expr):
+        return etree.XPath(expr, namespaces={'ns1':'NS', 'ns2':'NS2'})(root)
+    ae(len(xpath('//ns1:tag1')), 1, err)
+    ae(len(xpath('//ns1:tag2')), 1, err)
+    ae(len(xpath('//ns2:tag3')), 1, err)
+    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
+    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
+
+all_checks = (nonvoid_cdata_elements, namespaces)
+
+class ParsingTests(BaseTest):
+
+    def test_conversion_parser(self):
+        ' Test parsing with the parser used for conversion '
+        for test in all_checks:
+            test(self, html5_parse)