HTML 5 parsing: Fix handling of xml:lang attributes on all elements

xml:lang is now mapped to a plain lang attribute on every element, not just the root <html> element; an existing lang attribute takes precedence.
2026-02-21 02:30:13 -05:00 · 2013-10-23 11:35:47 +05:30 · 2013-10-23 11:35:47 +05:30 · ce29abef51
commit ce29abef51
parent b9421065f9
2 changed files with 26 additions and 6 deletions
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -98,10 +98,6 @@ def html5_parse(data, max_nesting_depth=100):
            if depth > max_nesting_depth:
                raise ValueError('html5lib resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)
-    # Set lang correctly
-    xl = data.attrib.pop('xmlU0003Alang', None)
-    if xl is not None and 'lang' not in data.attrib:
-        data.attrib['lang'] = xl

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
@@ -110,6 +106,10 @@ def html5_parse(data, max_nesting_depth=100):
    seen_namespaces = set()
    for elem in tuple(data.iter(tag=etree.Element)):
        elem.attrib.pop('xmlns', None)
+        # Set lang correctly
+        xl = elem.attrib.pop('xmlU0003Alang', None)
+        if xl is not None and 'lang' not in elem.attrib:
+            elem.attrib['lang'] = xl
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@@ -71,11 +71,31 @@ def namespaces(test, parse_function):
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)

-all_checks = (nonvoid_cdata_elements, namespaces)
+    markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
+    root = parse_function(markup)
+    err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root)
+    ae(len(root.xpath('//*[@lang="en"]')), 2, err)
+    ae(len(root.xpath('//*[@lang="de"]')), 1, err)
+    ae(len(root.xpath('//*[@lang="es"]')), 1, err)
+    ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
+
+def space_characters(test, parse_function):
+    markup = '<html><p>\u000c</p>'
+    root = parse_function(markup)
+    err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
+    test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
+
+def case_insensitive_element_names(test, parse_function):
+    markup = '<HTML><P> </p>'
+    root = parse_function(markup)
+    err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
+    test.assertEqual(len(XPath('//h:p')(root)), 1, err)
+
+basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)

 class ParsingTests(BaseTest):

    def test_conversion_parser(self):
        ' Test parsing with the parser used for conversion '
-        for test in all_checks:
+        for test in basic_checks:
            test(self, html5_parse)