Speed up parsing by only special casing non-XML safe attributes

This commit is contained in:
Kovid Goyal 2013-10-28 13:34:28 +05:30
parent 2462f6b832
commit a228d95678

View File

@ -9,7 +9,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, re, warnings import copy, re, warnings
from functools import partial from functools import partial
from bisect import bisect from bisect import bisect
from collections import OrderedDict
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
@ -28,6 +27,7 @@ to_xml_name = infoset_filter.toXmlName
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg', 'xlink')} known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg', 'xlink')}
html_ns = namespaces['html'] html_ns = namespaces['html']
xlink_ns = namespaces['xlink'] xlink_ns = namespaces['xlink']
xml_ns = namespaces['xmlns']
class NamespacedHTMLPresent(ValueError): class NamespacedHTMLPresent(ValueError):
@ -203,71 +203,106 @@ def create_lxml_context():
# }}} # }}}
def clean_attrib(name, val, nsmap, attrib, namespaced_attribs):
    """
    Translate one attribute name into a form that can be set on an lxml
    element, recording namespace information as a side effect.

    :param name: either a ``(prefix, local_name, namespace)`` tuple or a
        string, possibly of the form ``prefix:local_name``
    :param val: the attribute value
    :param nsmap: prefix -> namespace mapping; updated in place when the
        attribute is a namespace declaration or uses the xlink namespace
    :param attrib: the full attribute mapping of the element, consulted only
        to see whether a plain ``lang`` attribute already exists
    :param namespaced_attribs: dict that receives ``(prefix, local_name) ->
        val`` for prefixed attributes whose prefix is not (yet) in ``nsmap``
    :return: a ``(new_name, flag)`` tuple. ``new_name`` is ``None`` when the
        attribute should not be set under a new name by the caller. ``flag``
        is True when ``nsmap`` was modified or when the attribute was stored
        in ``namespaced_attribs``.
    """
    if isinstance(name, tuple):
        prefix, local_name, ns = name
        if ns == xml_ns:
            # A namespace declaration: no prefix means the default namespace
            nsmap[None if prefix is None else local_name] = val
            return None, True
        relinked = ns == xlink_ns and 'xlink' not in nsmap
        if relinked:
            # Drop whatever prefixes currently point at the xlink namespace
            # and bind it to the conventional 'xlink' prefix instead
            stale = [p for p, uri in nsmap.iteritems() if uri == xlink_ns]
            for p in stale:
                del nsmap[p]
            nsmap['xlink'] = xlink_ns
        return ('{%s}%s' % (ns, local_name)), relinked

    if ':' not in name:
        return name, False

    prefix, local_name = name.partition(':')[0::2]

    if prefix == 'xmlns':
        # Use an existing prefix for this namespace, if possible
        by_uri = {uri: pfx for pfx, uri in nsmap.iteritems()}
        known = by_uri.get(val, False)
        if known is not False:
            local_name = known
        nsmap[local_name] = val
        return None, True

    if prefix == 'xml':
        # Only xml:lang is kept (as plain 'lang'), and only when the element
        # has no 'lang' attribute of its own
        usable = local_name == 'lang' and local_name not in attrib
        return (local_name if usable else None), False

    ns = nsmap.get(prefix, None)
    if ns is not None:
        return '{%s}%s' % (ns, local_name), False
    # Unknown prefix: remember the attribute so the caller can deal with it
    # once all attributes have been seen
    namespaced_attribs[(prefix, local_name)] = val
    return None, True
def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
    """
    Create an element called ``name`` in ``namespace`` using
    ``ctx.makeelement`` and set the attributes from ``attrib`` on it.

    :param ctx: object providing ``makeelement(tag, nsmap=...)``
    :param namespace: namespace URI used for the element's tag
    :param prefix: the prefix that was present on the tag name, or ``None``
    :param name: local tag name; passed through ``to_xml_name`` if it is
        rejected with a ValueError
    :param attrib: mapping of attribute names to values. A plain ``xmlns``
        entry is popped from it and used as the default namespace.
    :param nsmap: prefix -> namespace mapping; modified in place
    :return: the newly created element
    """
    nns = attrib.pop('xmlns', None)
    if nns is not None:
        nsmap[None] = nns
    try:
        elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap)
    except ValueError:
        elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap)
    # Unfortunately, lxml randomizes attrib order if passed in the makeelement
    # constructor, therefore they have to be set one by one.
    nsmap_changed = False
    namespaced_attribs = {}
    for k, v in attrib.iteritems():
        try:
            elem.set(k, v)
        except (ValueError, TypeError):
            # Only names that cannot be set as-is take the slow path
            k, is_namespace = clean_attrib(k, v, nsmap, attrib, namespaced_attribs)
            nsmap_changed |= is_namespace
            if k is not None:
                try:
                    elem.set(k, v)
                except ValueError:
                    elem.set(to_xml_name(k), v)
    if nsmap_changed:
        # The element has to be re-created for the updated nsmap to be used
        nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
        for k, v in elem.items():  # Only elem.items() preserves attrib order
            nelem.set(k, v)
        # Distinct loop names are used here so that the ``prefix`` and
        # ``name`` parameters (needed below for the tag itself) are not
        # overwritten, and the prefix is looked up by value rather than as
        # the literal string 'prefix'.
        for (attr_prefix, attr_name), v in namespaced_attribs.iteritems():
            ns = nsmap.get(attr_prefix, None)
            if ns is not None:
                try:
                    nelem.set('{%s}%s' % (ns, attr_name), v)
                except ValueError:
                    nelem.set('{%s}%s' % (ns, to_xml_name(attr_name)), v)
            else:
                nelem.set(to_xml_name('%s:%s' % (attr_prefix, attr_name)), v)
        elem = nelem

    # Handle namespace prefixed tag names
    if prefix is not None:
        namespace = nsmap.get(prefix, None)
        if namespace is not None and namespace != elem.nsmap[elem.prefix]:
            nelem = ctx.makeelement('{%s}%s' % (nsmap[prefix], elem.tag.rpartition('}')[2]), nsmap=nsmap)
            for k, v in elem.items():
                nelem.set(k, v)
            elem = nelem

    # Ensure that svg and mathml elements get no namespace prefixes
    if elem.prefix is not None and namespace in known_namespaces:
        for k, v in tuple(nsmap.iteritems()):
            if v == namespace:
                del nsmap[k]
        nsmap[None] = namespace
        nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
        for k, v in elem.items():
            nelem.set(k, v)
        elem = nelem

    return elem
class TreeBuilder(BaseTreeBuilder): class TreeBuilder(BaseTreeBuilder):
@ -297,26 +332,15 @@ class TreeBuilder(BaseTreeBuilder):
def createElement(self, token, nsmap=None): def createElement(self, token, nsmap=None):
"""Create an element but don't insert it anywhere""" """Create an element but don't insert it anywhere"""
nsmap = nsmap or {} nsmap = nsmap or {}
attribs = process_attribs(token['data'], nsmap)
name = token_name = token["name"] name = token_name = token["name"]
namespace = token.get("namespace", self.defaultNamespace) namespace = token.get("namespace", self.defaultNamespace)
prefix = None
if ':' in name: if ':' in name:
if name.endswith(':html'): if name.endswith(':html'):
raise NamespacedHTMLPresent(name.rpartition(':')[0]) raise NamespacedHTMLPresent(name.rpartition(':')[0])
prefix, name = name.partition(':')[0::2] prefix, name = name.partition(':')[0::2]
namespace = nsmap.get(prefix, namespace) namespace = nsmap.get(prefix, namespace)
elem = makeelement_ns(self.lxml_context, namespace, name, attribs, nsmap) elem = makeelement_ns(self.lxml_context, namespace, prefix, name, token['data'], nsmap)
# Ensure that svg and mathml elements get no namespace prefixes
if elem.prefix is not None and namespace in known_namespaces:
for k, v in tuple(nsmap.iteritems()):
if v == namespace:
del nsmap[k]
nsmap[None] = namespace
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
for k, v in elem.items(): # Only elem.items() preserves attrib order
nelem.set(k, v)
elem = nelem
# Keep a reference to elem so that lxml does not delete and re-create # Keep a reference to elem so that lxml does not delete and re-create
# it, losing the name related attributes # it, losing the name related attributes
@ -366,54 +390,39 @@ class TreeBuilder(BaseTreeBuilder):
if not attrs: if not attrs:
return return
html = self.openElements[0] html = self.openElements[0]
nsmap = html.nsmap.copy() for k, v in attrs.iteritems():
attribs = process_attribs(attrs, nsmap) if k not in html.attrib and k != 'xmlns':
for k, v in attribs.iteritems():
if k not in html.attrib:
try: try:
html.set(k, v) html.set(k, v)
except TypeError:
pass
except ValueError: except ValueError:
if k == 'xml:lang' and 'lang' not in html.attrib:
k = 'lang'
html.set(to_xml_name(k), v) html.set(to_xml_name(k), v)
if nsmap != html.nsmap:
newroot = self.lxml_context.makeelement(html.tag, attrib=html.attrib, nsmap=nsmap)
self.proxy_cache.append(newroot)
newroot.name, newroot.namespace, newroot.nameTuple = html.name, html.namespace, html.nameTuple
self.openElements[0] = newroot
if self.document.root is html:
self.document.root = newroot
if len(html) > 0:
# TODO: the nsmap changes need to be propagated down the tree
for child in html:
newroot.append(copy.copy(child))
def apply_body_attributes(self, attrs):
    """
    Copy the attributes in ``attrs`` onto the element at
    ``self.openElements[1]``, leaving attributes it already has untouched.
    """
    if not attrs:
        return
    body = self.openElements[1]
    for attr_name, attr_val in attrs.iteritems():
        # We ignore xmlns attributes on non-first <body> tags
        if attr_name in body.attrib or attr_name == 'xmlns':
            continue
        try:
            body.set(attr_name, attr_val)
        except TypeError:
            # Names that raise TypeError are silently dropped
            pass
        except ValueError:
            # Fall back to plain 'lang' for xml:lang when there is no lang
            # attribute yet; the name is then passed through to_xml_name
            if attr_name == 'xml:lang' and 'lang' not in body.attrib:
                attr_name = 'lang'
            body.set(to_xml_name(attr_name), attr_val)
def insertComment(self, token, parent=None):
    """
    Append a comment built from ``token["data"]`` to ``parent``, or to the
    last entry of ``self.openElements`` when no parent is given.
    """
    target = self.openElements[-1] if parent is None else parent
    # Every '--' in the comment text is replaced with '- -'
    text = token["data"].replace('--', '- -')
    target.appendChild(Comment(text))
def process_namespace_free_attribs(attrs):
    """
    Return an OrderedDict holding the entries of ``attrs`` whose key maps to
    a name in the internal name table, in the iteration order of ``attrs``.
    Keys without a ``:`` map to themselves; entries that map to no name are
    dropped.
    """
    name_map = {}
    for key, _ in attrs.iteritems():
        if ':' not in key:
            name_map[key] = key
    # Remaining (prefixed) keys register their local name, unless the prefix
    # is 'xmlns' or that local name is already present in the table
    for key in frozenset(attrs) - frozenset(name_map):
        prefix, local_name = key.partition(':')[0::2]
        if prefix != 'xmlns' and local_name not in name_map:
            name_map[local_name] = key
    result = OrderedDict(
        (name_map.get(key, None), value) for key, value in attrs.iteritems())
    # All unmapped keys collapsed onto None; discard that entry
    result.pop(None, None)
    return result
def makeelement(ctx, name, attrib): def makeelement(ctx, name, attrib):
attrib.pop('xmlns', None)
try: try:
elem = ctx.makeelement(name) elem = ctx.makeelement(name)
except ValueError: except ValueError:
@ -421,7 +430,11 @@ def makeelement(ctx, name, attrib):
for k, v in attrib.iteritems(): for k, v in attrib.iteritems():
try: try:
elem.set(k, v) elem.set(k, v)
except TypeError:
elem.set(to_xml_name(k[1]), v)
except ValueError: except ValueError:
if k == 'xml:lang' and 'lang' not in attrib:
k = 'lang'
elem.set(to_xml_name(k), v) elem.set(to_xml_name(k), v)
return elem return elem
@ -436,8 +449,7 @@ class NoNamespaceTreeBuilder(TreeBuilder):
def createElement(self, token, nsmap=None): def createElement(self, token, nsmap=None):
name = token['name'].rpartition(':')[2] name = token['name'].rpartition(':')[2]
attribs = process_namespace_free_attribs(token['data']) elem = makeelement(self.lxml_context, name, token['data'])
elem = makeelement(self.lxml_context, name, attribs)
# Keep a reference to elem so that lxml does not delete and re-create # Keep a reference to elem so that lxml does not delete and re-create
# it, losing _namespace # it, losing _namespace
self.proxy_cache.append(elem) self.proxy_cache.append(elem)
@ -458,24 +470,26 @@ class NoNamespaceTreeBuilder(TreeBuilder):
if not attrs: if not attrs:
return return
html = self.openElements[0] html = self.openElements[0]
attribs = process_namespace_free_attribs(attrs) for k, v in attrs.iteritems():
for k, v in attribs.iteritems(): if k not in html.attrib and k != 'xmlns':
if k not in html.attrib:
try: try:
html.set(k, v) html.set(k, v)
except ValueError: except ValueError:
if k == 'xml:lang' and 'lang' not in html.attrib:
k = 'lang'
html.set(to_xml_name(k), v) html.set(to_xml_name(k), v)
def apply_body_attributes(self, attrs):
    """
    Copy the attributes in ``attrs`` onto the element at
    ``self.openElements[1]``, skipping ``xmlns`` and any attribute the
    element already has.
    """
    if not attrs:
        return
    body = self.openElements[1]
    for attr_name, attr_val in attrs.iteritems():
        if attr_name in body.attrib or attr_name == 'xmlns':
            continue
        try:
            body.set(attr_name, attr_val)
        except ValueError:
            # Fall back to plain 'lang' for xml:lang when there is no lang
            # attribute yet; the name is then passed through to_xml_name
            if attr_name == 'xml:lang' and 'lang' not in body.attrib:
                attr_name = 'lang'
            body.set(to_xml_name(attr_name), attr_val)
# Input Stream {{{ # Input Stream {{{