From 966c5d572c5b656a4de0cb7847f89758c2e51f53 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 23 Oct 2013 11:35:47 +0530 Subject: [PATCH] HTML 5 parsing: Fix handling of xml:lang attributes on all elements xml:lang is now mapped to a plain lang on all elements, not just the root element --- src/calibre/ebooks/oeb/parse_utils.py | 8 +++---- .../ebooks/oeb/polish/tests/parsing.py | 24 +++++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py index df315df775..88d9a198c3 100644 --- a/src/calibre/ebooks/oeb/parse_utils.py +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -98,10 +98,6 @@ def html5_parse(data, max_nesting_depth=100): if depth > max_nesting_depth: raise ValueError('html5lib resulted in a tree with nesting' ' depth > %d'%max_nesting_depth) - # Set lang correctly - xl = data.attrib.pop('xmlU0003Alang', None) - if xl is not None and 'lang' not in data.attrib: - data.attrib['lang'] = xl # html5lib has the most inelegant handling of namespaces I have ever seen # Try to reconstitute destroyed namespace info @@ -110,6 +106,10 @@ def html5_parse(data, max_nesting_depth=100): seen_namespaces = set() for elem in tuple(data.iter(tag=etree.Element)): elem.attrib.pop('xmlns', None) + # Set lang correctly + xl = elem.attrib.pop('xmlU0003Alang', None) + if xl is not None and 'lang' not in elem.attrib: + elem.attrib['lang'] = xl namespaces = {} for x in tuple(elem.attrib): if x.startswith('xmlnsU') or x.startswith(xmlns_declaration): diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py index 7b9bda7e21..cbe65947b8 100644 --- a/src/calibre/ebooks/oeb/polish/tests/parsing.py +++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py @@ -71,11 +71,31 @@ def namespaces(test, parse_function): ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err) ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err) -all_checks = (nonvoid_cdata_elements, 
namespaces) + markup = '

' + root = parse_function(markup) + err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root) + ae(len(root.xpath('//*[@lang="en"]')), 2, err) + ae(len(root.xpath('//*[@lang="de"]')), 1, err) + ae(len(root.xpath('//*[@lang="es"]')), 1, err) + ae(len(XPath('//*[@xml:lang]')(root)), 0, err) + +def space_characters(test, parse_function): + markup = '

\u000c

' + root = parse_function(markup) + err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root) + test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err) + +def case_insensitive_element_names(test, parse_function): + markup = '

' + root = parse_function(markup) + err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root) + test.assertEqual(len(XPath('//h:p')(root)), 1, err) + +basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names) class ParsingTests(BaseTest): def test_conversion_parser(self): ' Test parsing with the parser used for conversion ' - for test in all_checks: + for test in basic_checks: test(self, html5_parse)