From 966c5d572c5b656a4de0cb7847f89758c2e51f53 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 23 Oct 2013 11:35:47 +0530
Subject: [PATCH] HTML 5 parsing: Fix handling of xml:lang attributes on all
 elements

xml:lang is now mapped to a plain lang attribute on every element, not just on <html>; an existing lang attribute is never overwritten.
---
 src/calibre/ebooks/oeb/parse_utils.py         |  8 +++----
 .../ebooks/oeb/polish/tests/parsing.py        | 24 +++++++++++++++++--
 2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py
index df315df775..88d9a198c3 100644
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -98,10 +98,6 @@ def html5_parse(data, max_nesting_depth=100):
             if depth > max_nesting_depth:
                 raise ValueError('html5lib resulted in a tree with nesting'
                         ' depth > %d'%max_nesting_depth)
-    # Set lang correctly
-    xl = data.attrib.pop('xmlU0003Alang', None)
-    if xl is not None and 'lang' not in data.attrib:
-        data.attrib['lang'] = xl
 
     # html5lib has the most inelegant handling of namespaces I have ever seen
     # Try to reconstitute destroyed namespace info
@@ -110,6 +106,10 @@ def html5_parse(data, max_nesting_depth=100):
     seen_namespaces = set()
     for elem in tuple(data.iter(tag=etree.Element)):
         elem.attrib.pop('xmlns', None)
+        # Move the escaped xml:lang attribute to a plain lang, unless lang is already set
+        xl = elem.attrib.pop('xmlU0003Alang', None)
+        if xl is not None and 'lang' not in elem.attrib:
+            elem.attrib['lang'] = xl
         namespaces = {}
         for x in tuple(elem.attrib):
             if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py
index 7b9bda7e21..cbe65947b8 100644
--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@@ -71,11 +71,31 @@ def namespaces(test, parse_function):
     ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
     ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
 
-all_checks = (nonvoid_cdata_elements, namespaces)
+    markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
+    root = parse_function(markup)
+    err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root)
+    ae(len(root.xpath('//*[@lang="en"]')), 2, err)
+    ae(len(root.xpath('//*[@lang="de"]')), 1, err)
+    ae(len(root.xpath('//*[@lang="es"]')), 1, err)
+    ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
+
+def space_characters(test, parse_function):  # check: form feed must not survive in the parsed <p> text
+    markup = '<html><p>\u000c</p>'
+    root = parse_function(markup)
+    err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)  # failure message includes the serialized tree
+    test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)  # local-name() matches <p> regardless of namespace
+
+def case_insensitive_element_names(test, parse_function):  # check: upper-case <HTML>/<P> tags parse like lower-case ones
+    markup = '<HTML><P> </p>'
+    root = parse_function(markup)
+    err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)  # failure message includes the serialized tree
+    test.assertEqual(len(XPath('//h:p')(root)), 1, err)  # <P> ... </p> must yield exactly one h:p element
+
+basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)
 
 class ParsingTests(BaseTest):
 
     def test_conversion_parser(self):
         ' Test parsing with the parser used for conversion '
-        for test in all_checks:
+        for test in basic_checks:
             test(self, html5_parse)