HTML 5 parsing: Fix handling of xml:lang attributes on all elements

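xml:lang is now mapped to a plain lang attribute on every element, not just on <html>. If an element already has a lang attribute, that value is kept and the xml:lang attribute is simply removed.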
This commit is contained in:
Kovid Goyal 2013-10-23 11:35:47 +05:30
parent b9421065f9
commit ce29abef51
2 changed files with 26 additions and 6 deletions

View File

@ -98,10 +98,6 @@ def html5_parse(data, max_nesting_depth=100):
if depth > max_nesting_depth:
raise ValueError('html5lib resulted in a tree with nesting'
' depth > %d'%max_nesting_depth)
# Set lang correctly
xl = data.attrib.pop('xmlU0003Alang', None)
if xl is not None and 'lang' not in data.attrib:
data.attrib['lang'] = xl
# html5lib has the most inelegant handling of namespaces I have ever seen
# Try to reconstitute destroyed namespace info
@ -110,6 +106,10 @@ def html5_parse(data, max_nesting_depth=100):
seen_namespaces = set()
for elem in tuple(data.iter(tag=etree.Element)):
elem.attrib.pop('xmlns', None)
# Set lang correctly
xl = elem.attrib.pop('xmlU0003Alang', None)
if xl is not None and 'lang' not in elem.attrib:
elem.attrib['lang'] = xl
namespaces = {}
for x in tuple(elem.attrib):
if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):

View File

@ -71,11 +71,31 @@ def namespaces(test, parse_function):
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
all_checks = (nonvoid_cdata_elements, namespaces)
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
root = parse_function(markup)
err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root)
ae(len(root.xpath('//*[@lang="en"]')), 2, err)
ae(len(root.xpath('//*[@lang="de"]')), 1, err)
ae(len(root.xpath('//*[@lang="es"]')), 1, err)
ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
def space_characters(test, parse_function):
    ' Check that the form feed character does not survive in the text of the parsed <p> '
    root = parse_function('<html><p>\u000c</p>')
    # Build the failure message up front so it shows the full parsed tree.
    err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
    # Match on local-name() so the lookup works whatever namespace the parser assigns.
    paragraph = root.xpath('//*[local-name()="p"]')[0]
    test.assertNotIn('\u000c', paragraph.text, err)
def case_insensitive_element_names(test, parse_function):
    ' Check that mixed-case tags (<P> closed by </p>) produce exactly one h:p element '
    root = parse_function('<HTML><P> </p>')
    # Build the failure message up front so it shows the full parsed tree.
    err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
    paragraphs = XPath('//h:p')(root)
    test.assertEqual(len(paragraphs), 1, err)
# Parser checks run by ParsingTests.test_conversion_parser; each one is
# called as check(test_case, parse_function).
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)
class ParsingTests(BaseTest):
    # Test case that runs the module-level parser checks against a parser.

    def test_conversion_parser(self):
        ' Test parsing with the parser used for conversion '
        # basic_checks holds callables invoked as check(test_case, parse_function);
        # html5_parse is the parse function being exercised here.
        for test in basic_checks:
            test(self, html5_parse)