Test framework for tag soup parsing

Also fix a couple of minor bugs in the current HTML 5 parser (self-closed
<style/>, <script/>, <textarea/> tags and re-mapped namespace
prefixes).
This commit is contained in:
Kovid Goyal 2013-10-23 10:37:14 +05:30
parent 18d5d9a3a4
commit b4bf871077
3 changed files with 88 additions and 2 deletions

View File

@ -88,7 +88,7 @@ self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video'}
'video', 'title', 'script', 'style'}
_self_closing_pat = re.compile(
r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(self_closing_bad_tags)),

View File

@ -83,7 +83,7 @@ def node_depth(node):
def html5_parse(data, max_nesting_depth=100):
import html5lib
# html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
data = re.sub(r'<\s*title\s*[^>]*/\s*>', '<title></title>', data)
data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
data = html5lib.parse(data, treebuilder='lxml').getroot()
@ -116,6 +116,7 @@ def html5_parse(data, max_nesting_depth=100):
prefix = x[11:]
namespaces[prefix] = val
remapped_namespaces = {}
if namespaces:
# Some destroyed namespace declarations were found
p = elem.getparent()
@ -127,6 +128,7 @@ def html5_parse(data, max_nesting_depth=100):
p.remove(elem)
elem = clone_element(elem, nsmap=namespaces)
p.insert(idx, elem)
remapped_namespaces = {ns:namespaces[ns] for ns in set(namespaces) - set(elem.nsmap)}
b = barename(elem.tag)
idx = b.find('U0003A')
@ -135,6 +137,8 @@ def html5_parse(data, max_nesting_depth=100):
ns = elem.nsmap.get(prefix, None)
if ns is None:
ns = non_html5_namespaces.get(prefix, None)
if ns is None:
ns = remapped_namespaces.get(prefix, None)
if ns is not None:
elem.tag = '{%s}%s'%(ns, tag)
@ -145,6 +149,8 @@ def html5_parse(data, max_nesting_depth=100):
ns = elem.nsmap.get(prefix, None)
if ns is None:
ns = non_html5_namespaces.get(prefix, None)
if ns is None:
ns = remapped_namespaces.get(prefix, None)
if ns is not None:
elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml import etree
from calibre.ebooks.oeb.polish.tests.base import BaseTest
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
from calibre.ebooks.oeb.parse_utils import html5_parse
def nonvoid_cdata_elements(test, parse_function):
    ''' If self closed version of non-void cdata elements like <title/> are
    present, the HTML5 parsing algorithm treats all following data as CDATA.

    Every such tag is tried in four spellings (as is, upper-cased, preceded by
    a newline, and followed by an attribute). For each one the parsed tree
    must still contain exactly one <body id="test"> element.

    :param test: Test case object; only its assertEqual() method is used
    :param parse_function: Callable that is passed the markup and returns the
        root element of the parsed tree
    '''
    markup = '''
<html> <head><{0}/></head> <body id="test"> </html>
'''
    for tag in ('title', 'style', 'script', 'textarea'):
        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
            root = parse_function(markup.format(x))
            # Serialize to text, not bytes: the message prefix is a text
            # literal (unicode_literals), and mixing it with bytes only works
            # through implicit coercion on Python 2 and fails on Python 3.
            parsed = etree.tostring(root, encoding='unicode')
            test.assertEqual(
                len(XPath('//h:body[@id="test"]')(root)), 1,
                'Incorrect parsing for <%s/>, parsed markup:\n' % x + parsed)
def namespaces(test, parse_function):
    ''' Check that elements and attributes end up in the expected namespaces
    after parsing, for several ways of declaring (or not declaring) them.

    :param test: Test case object; only its assertEqual() method is used
    :param parse_function: Callable that is passed the markup and returns the
        root element of the parsed tree
    '''
    ae = test.assertEqual

    def dump(root):
        # Serialize to text, not bytes: the message prefixes are text
        # literals (unicode_literals), and mixing them with bytes only works
        # through implicit coercion on Python 2 and fails on Python 3.
        return etree.tostring(root, encoding='unicode')

    # Default XHTML namespace declared on <html>
    markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(xhtml=XHTML_NS)
    root = parse_function(markup)
    ae(
        len(XPath('//h:body[@id="test"]')(root)), 1,
        'Incorrect parsing, parsed markup:\n' + dump(root))

    # SVG declared with an explicit svg: prefix, xlink declared on the image
    markup = '''
<html xmlns="{xhtml}"><head><body id="test">
<svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + dump(root)
    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # SVG declared as the default namespace of <svg>, xlink declared there too
    markup = '''
<html xmlns="{xhtml}"><head><body id="test">
<svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
'''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + dump(root)
    ae(len(XPath('//h:body[@id="test"]')(root)), 1, err)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # No namespace declarations at all: the parser has to supply them
    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
    root = parse_function(markup)
    err = 'Namespaces not created, parsed markup:\n' + dump(root)
    ae(len(XPath('//svg:svg')(root)), 1, err)
    ae(len(XPath('//svg:image[@xl:href]')(root)), 1, err)

    # Arbitrary namespaces. In the markup the ns2 prefix is bound to "NS" and
    # ns1 is re-bound to "NS2" on tag3, so the prefixes used in the XPath
    # expressions below deliberately differ from those in the markup: what is
    # checked is the namespace URI each name resolves to.
    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
    root = parse_function(markup)
    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + dump(root)

    def xpath(expr):
        return etree.XPath(expr, namespaces={'ns1':'NS', 'ns2':'NS2'})(root)
    ae(len(xpath('//ns1:tag1')), 1, err)
    ae(len(xpath('//ns1:tag2')), 1, err)
    ae(len(xpath('//ns2:tag3')), 1, err)
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
# The checks to run against a parser. Each one is called as
# check(test_case, parse_function).
all_checks = (nonvoid_cdata_elements, namespaces)
class ParsingTests(BaseTest):

    def test_conversion_parser(self):
        ' Run every check in all_checks against html5_parse, the conversion parser '
        for check in all_checks:
            check(self, html5_parse)