HTML 5 parser: drop xmlns:xml declarations on <html> and <body>

This commit is contained in:
Kovid Goyal 2013-12-11 09:56:11 +05:30
parent 31a02ba0f6
commit a30db00a8e
2 changed files with 7 additions and 0 deletions

View File

@@ -399,6 +399,8 @@ class TreeBuilder(BaseTreeBuilder):
except TypeError: except TypeError:
pass pass
except ValueError: except ValueError:
if k == 'xmlns:xml':
continue
if k == 'xml:lang' and 'lang' not in html.attrib: if k == 'xml:lang' and 'lang' not in html.attrib:
k = 'lang' k = 'lang'
html.set(to_xml_name(k), v) html.set(to_xml_name(k), v)
@@ -414,6 +416,8 @@ class TreeBuilder(BaseTreeBuilder):
except TypeError: except TypeError:
pass pass
except ValueError: except ValueError:
if k == 'xmlns:xml':
continue
if k == 'xml:lang' and 'lang' not in body.attrib: if k == 'xml:lang' and 'lang' not in body.attrib:
k = 'lang' k = 'lang'
body.set(to_xml_name(k), v) body.set(to_xml_name(k), v)

View File

@@ -177,6 +177,9 @@ class ParsingTests(BaseTest):
for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()): for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
self.assertEqual(i+1, int(v)) self.assertEqual(i+1, int(v))
root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
def timing(): def timing():
import time, sys import time, sys
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode