mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
HTML 5 parsing: Fix handling of xml:lang attributes on all elements
xml:lang is now mapped to a plain lang attribute on all elements, not just <html>.
This commit is contained in:
parent
b9421065f9
commit
ce29abef51
@ -98,10 +98,6 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
if depth > max_nesting_depth:
|
||||
raise ValueError('html5lib resulted in a tree with nesting'
|
||||
' depth > %d'%max_nesting_depth)
|
||||
# Set lang correctly
|
||||
xl = data.attrib.pop('xmlU0003Alang', None)
|
||||
if xl is not None and 'lang' not in data.attrib:
|
||||
data.attrib['lang'] = xl
|
||||
|
||||
# html5lib has the most inelegant handling of namespaces I have ever seen
|
||||
# Try to reconstitute destroyed namespace info
|
||||
@ -110,6 +106,10 @@ def html5_parse(data, max_nesting_depth=100):
|
||||
seen_namespaces = set()
|
||||
for elem in tuple(data.iter(tag=etree.Element)):
|
||||
elem.attrib.pop('xmlns', None)
|
||||
# Set lang correctly
|
||||
xl = elem.attrib.pop('xmlU0003Alang', None)
|
||||
if xl is not None and 'lang' not in elem.attrib:
|
||||
elem.attrib['lang'] = xl
|
||||
namespaces = {}
|
||||
for x in tuple(elem.attrib):
|
||||
if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
|
||||
|
@ -71,11 +71,31 @@ def namespaces(test, parse_function):
|
||||
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
|
||||
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
||||
|
||||
all_checks = (nonvoid_cdata_elements, namespaces)
|
||||
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
|
||||
root = parse_function(markup)
|
||||
err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(root)
|
||||
ae(len(root.xpath('//*[@lang="en"]')), 2, err)
|
||||
ae(len(root.xpath('//*[@lang="de"]')), 1, err)
|
||||
ae(len(root.xpath('//*[@lang="es"]')), 1, err)
|
||||
ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
|
||||
|
||||
def space_characters(test, parse_function):
|
||||
markup = '<html><p>\u000c</p>'
|
||||
root = parse_function(markup)
|
||||
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
||||
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||
|
||||
def case_insensitive_element_names(test, parse_function):
|
||||
markup = '<HTML><P> </p>'
|
||||
root = parse_function(markup)
|
||||
err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(root)
|
||||
test.assertEqual(len(XPath('//h:p')(root)), 1, err)
|
||||
|
||||
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names)
|
||||
|
||||
class ParsingTests(BaseTest):
|
||||
|
||||
def test_conversion_parser(self):
|
||||
' Test parsing with the parser used for conversion '
|
||||
for test in all_checks:
|
||||
for test in basic_checks:
|
||||
test(self, html5_parse)
|
||||
|
Loading…
x
Reference in New Issue
Block a user