Parse namespace-prefixed <html:html> correctly

This commit is contained in:
Kovid Goyal 2013-10-25 15:31:07 +05:30
parent d3352aeec9
commit fe32a39f0f
2 changed files with 46 additions and 7 deletions

View File

@ -6,7 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy
import copy, re
from functools import partial
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
@ -24,6 +24,12 @@ infoset_filter = InfosetFilter()
to_xml_name = infoset_filter.toXmlName
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg')}
class NamespacedHTMLPresent(ValueError):
    """Signal that a tag named ``<prefix>:html`` was encountered.

    The offending namespace prefix (the text before the final colon of the
    tag name) is stored on the exception as ``prefix`` and is also the sole
    entry of ``args``.
    """

    def __init__(self, prefix):
        super(NamespacedHTMLPresent, self).__init__(prefix)
        self.prefix = prefix
def create_lxml_context():
parser = XMLParser(no_network=True)
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
@ -225,6 +231,12 @@ def process_attribs(attrs, nsmap):
if ':' in k:
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
prefix = k.partition(':')[2] or None
if prefix is not None:
# Use an existing prefix for this namespace, if
# possible
existing = {v:k for k, v in nsmap.iteritems()}.get(v, False)
if existing is not False:
prefix = existing
nsmap[prefix] = v
else:
namespaced_attribs[k] = v
@ -255,6 +267,7 @@ class TreeBuilder(BaseTreeBuilder):
BaseTreeBuilder.__init__(self, True)
self.lxml_context = create_lxml_context()
self.elementClass = partial(ElementFactory, context=self.lxml_context)
self.seen_extra_html = False
def getDocument(self):
return self.document.root
@ -272,6 +285,8 @@ class TreeBuilder(BaseTreeBuilder):
nsmap = nsmap or {}
attribs = process_attribs(token['data'], nsmap)
name = token["name"]
if name.endswith(':html'):
raise NamespacedHTMLPresent(name.rpartition(':')[0])
namespace = token.get("namespace", self.defaultNamespace)
if ':' in name:
prefix, name = name.partition(':')[0::2]
@ -314,9 +329,9 @@ class TreeBuilder(BaseTreeBuilder):
return element
def apply_html_attributes(self, attrs):
if not attrs:
return
html = self.openElements[0]
if len(html) > 0:
raise ValueError('Cannot apply attributes to <html> after it has children')
nsmap = html.nsmap.copy()
attribs = process_attribs(attrs, nsmap)
for k, v in attribs.iteritems():
@ -330,22 +345,34 @@ class TreeBuilder(BaseTreeBuilder):
self.openElements[0] = newroot
if self.document.root is html:
self.document.root = newroot
if len(html) > 0:
# TODO: the nsmap changes need to be propagated down the tree
for child in html:
newroot.append(copy.copy(child))
def parse(raw, decoder=None):
def parse(raw, decoder=None, log=None):
    """Parse HTML markup and return the root element of the resulting tree.

    If the tree builder reports a namespace-prefixed ``html`` tag (by raising
    ``NamespacedHTMLPresent``), that prefix is stripped from every opening and
    closing tag in the markup and parsing is retried.

    :param raw: The markup, as unicode text or bytes. Bytes are decoded with
        ``decoder`` when one is given, otherwise with ``xml_to_unicode``.
    :param decoder: Optional callable converting bytes to unicode text.
    :param log: Accepted for interface compatibility; not used by this
        function.
    :return: The root element of the parsed document.
    :raises ValueError: If the root is not an un-prefixed ``html`` element in
        the HTML namespace, or if a reported tag prefix cannot be found in
        the markup (which would otherwise cause an endless retry loop).
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    # TODO: Replace entities?
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    # TODO: ignore warnings
    while True:
        try:
            parser = HTMLParser(tree=TreeBuilder)
            parser.parse(raw, parseMeta=False, useChardet=False)
        except NamespacedHTMLPresent as err:
            # The prefix comes from the document, so it must be escaped
            # before being used as part of a regular expression.
            pattern = r'(<\s*/?)%s:' % re.escape(err.prefix)
            # Keep the "<" or "</" lead-in (group 1), drop only "prefix:".
            stripped = re.sub(pattern, r'\1', raw, flags=re.I)
            if stripped == raw:
                # Nothing was removed, so re-parsing would raise the same
                # exception again; fail instead of looping forever.
                raise ValueError(
                    'Failed to strip the namespace prefix %r from the markup' % err.prefix)
            raw = stripped
            continue
        break
    root = parser.tree.getDocument()
    if root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
if __name__ == '__main__':
    # Manual smoke test: parse two sample documents and print the
    # serialization of the second one.
    from lxml import etree
    root = parse('<html><p -moo><gah\u000c>')
    # Document whose tags all carry an explicit prefix bound to the HTML
    # namespace.
    prefixed_markup = '<html:html xmlns:html="{html}" id="a"><html:p><html:p></html:html>'.format(
        html=namespaces['html'])
    root = parse(prefixed_markup)
    serialized = etree.tostring(root)
    print(serialized)
    print()

View File

@ -67,6 +67,18 @@ def namespaces(test, parse_function):
match_and_prefix(root, '//svg:svg', 'svg', err)
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(root)
match_and_prefix(root, '//h:html', None, err)
match_and_prefix(root, '//h:html[@lang]', None, err)
match_and_prefix(root, '//h:html[@id]', None, err)
if parse_function is not html5_parse:
markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
root = parse_function(markup)
err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root)
match_and_prefix(root, '//h:html', None, err)
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
root = parse_function(markup)
err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(root)