Preserve attribute order when parsing

This commit is contained in:
Kovid Goyal 2013-10-26 14:57:42 +05:30
parent 12a581786b
commit dd676227b8
4 changed files with 153 additions and 118 deletions

View File

@ -9,6 +9,7 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, re, warnings
from functools import partial
from bisect import bisect
from collections import OrderedDict
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
@ -203,7 +204,7 @@ def create_lxml_context():
# }}}
def process_attribs(attrs, nsmap):
attribs = {}
attrib_name_map = {}
namespaced_attribs = {}
xmlns = namespaces['xmlns']
for k, v in attrs.iteritems():
@ -220,7 +221,7 @@ def process_attribs(attrs, nsmap):
if ns == xlink_ns:
del nsmap[prefix]
nsmap['xlink'] = xlink_ns
attribs['{%s}%s' % (k[2], k[1])] = v
attrib_name_map[k] = '{%s}%s' % (k[2], k[1])
else:
if ':' in k:
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
@ -228,27 +229,46 @@ def process_attribs(attrs, nsmap):
if prefix is not None:
# Use an existing prefix for this namespace, if
# possible
existing = {v:k for k, v in nsmap.iteritems()}.get(v, False)
existing = {x:k for k, x in nsmap.iteritems()}.get(v, False)
if existing is not False:
prefix = existing
nsmap[prefix] = v
else:
namespaced_attribs[k] = v
else:
attribs[k] = v
attrib_name_map[k] = k
xml_lang = None
for k, v in namespaced_attribs.iteritems():
prefix, name = k.partition(':')[0::2]
if prefix == 'xml':
if name == 'lang':
attribs['lang'] = attribs.get('lang', v)
xml_lang = v
continue
ns = nsmap.get(prefix, None)
if ns is not None:
name = '{%s}%s' % (ns, name)
attribs[name] =v
attrib_name_map[k] = name
return attribs
ans = OrderedDict((attrib_name_map.get(k, None), v) for k, v in attrs.iteritems())
ans.pop(None, None)
if xml_lang:
ans['lang'] = ans.get('lang', xml_lang)
return ans
def makeelement_ns(ctx, namespace, name, attrib, nsmap):
try:
elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap)
except ValueError:
elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap)
# Unfortunately, lxml randomizes attrib order if passed in the makeelement
# constructor, therefore they have to be set one by one.
for k, v in attrib.iteritems():
try:
elem.set(k, v)
except ValueError:
elem.set(to_xml_name(k), v)
return elem
class TreeBuilder(BaseTreeBuilder):
@ -285,11 +305,7 @@ class TreeBuilder(BaseTreeBuilder):
raise NamespacedHTMLPresent(name.rpartition(':')[0])
prefix, name = name.partition(':')[0::2]
namespace = nsmap.get(prefix, namespace)
try:
elem = self.lxml_context.makeelement('{%s}%s' % (namespace, name), attrib=attribs, nsmap=nsmap)
except ValueError:
attribs = {to_xml_name(k):v for k, v in attribs.iteritems()}
elem = self.lxml_context.makeelement('{%s}%s' % (namespace, to_xml_name(name)), attrib=attribs, nsmap=nsmap)
elem = makeelement_ns(self.lxml_context, namespace, name, attribs, nsmap)
# Ensure that svg and mathml elements get no namespace prefixes
if elem.prefix is not None and namespace in known_namespaces:
@ -297,7 +313,10 @@ class TreeBuilder(BaseTreeBuilder):
if v == namespace:
del nsmap[k]
nsmap[None] = namespace
elem = self.lxml_context.makeelement(elem.tag, attrib=elem.attrib, nsmap=nsmap)
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
for k, v in elem.items(): # Only elem.items() preserves attrib order
nelem.set(k, v)
elem = nelem
# Keep a reference to elem so that lxml does not delete and re-create
# it, losing the name related attributes
@ -385,12 +404,26 @@ class TreeBuilder(BaseTreeBuilder):
parent.appendChild(Comment(token["data"].replace('--', '- -')))
def process_namespace_free_attribs(attrs):
attribs = {k:v for k, v in attrs.iteritems() if ':' not in k}
for k in set(attrs) - set(attribs):
anm = {k:k for k, v in attrs.iteritems() if ':' not in k}
for k in frozenset(attrs) - frozenset(anm):
prefix, name = k.partition(':')[0::2]
if prefix != 'xmlns' and name not in attribs:
attribs[name] = attrs[k]
return attribs
if prefix != 'xmlns' and name not in anm:
anm[name] = k
ans = OrderedDict((anm.get(k, None), v) for k, v in attrs.iteritems())
ans.pop(None, None)
return ans
def makeelement(ctx, name, attrib):
try:
elem = ctx.makeelement(name)
except ValueError:
elem = ctx.makeelement(to_xml_name(name))
for k, v in attrib.iteritems():
try:
elem.set(k, v)
except ValueError:
elem.set(to_xml_name(k), v)
return elem
class NoNamespaceTreeBuilder(TreeBuilder):
@ -404,11 +437,7 @@ class NoNamespaceTreeBuilder(TreeBuilder):
def createElement(self, token, nsmap=None):
name = token['name'].rpartition(':')[2]
attribs = process_namespace_free_attribs(token['data'])
try:
elem = self.lxml_context.makeelement(name, attrib=attribs)
except ValueError:
attribs = {to_xml_name(k):v for k, v in attribs.iteritems()}
elem = self.lxml_context.makeelement(to_xml_name(name), attrib=attribs)
elem = makeelement(self.lxml_context, name, attribs)
# Keep a reference to elem so that lxml does not delete and re-create
# it, losing _namespace
self.proxy_cache.append(elem)
@ -551,8 +580,7 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=Tr
if __name__ == '__main__':
from lxml import etree
root = parse('\n<html><head><title>a\n</title><p>&nbsp;\n<b>b', discard_namespaces=False)
# root = parse('\n<html><p><svg viewbox="0 0 0 0"><image xlink:href="xxx"/><b></svg>&nbsp;\n<b>xxx', discard_namespaces=False)
root = parse('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;\n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
print (etree.tostring(root, encoding='utf-8'))
print()

View File

@ -135,7 +135,15 @@ def multiple_html_and_body(test, parse_function):
test.assertEqual(len(XPath('//h:html[@id and @lang]')(root)), 1, err)
test.assertEqual(len(XPath('//h:body[@id and @lang]')(root)), 1, err)
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities, multiple_html_and_body)
def attribute_replacement(test, parse_function):
markup = '<html><body><svg viewbox="0"></svg><svg xmlns="%s" viewbox="1">' % SVG_NS
root = parse_function(markup)
err = 'SVG attributes not normalized, parsed markup:\n' + etree.tostring(root)
test.assertEqual(len(XPath('//svg:svg[@viewBox]')(root)), 2, err)
basic_checks = (nonvoid_cdata_elements, namespaces, space_characters,
case_insensitive_element_names, entities,
multiple_html_and_body, attribute_replacement)
class ParsingTests(BaseTest):
@ -160,3 +168,10 @@ class ParsingTests(BaseTest):
elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))
for ds in (False, True):
src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
root = parse(src, discard_namespaces=ds)
for tag in ('p', 'svg'):
for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
self.assertEqual(i+1, int(v))

View File

@ -433,6 +433,73 @@ mathmlTextIntegrationPointElements = frozenset((
(namespaces["mathml"], "mtext")
))
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),

View File

@ -2,6 +2,7 @@ from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass
import types
from collections import OrderedDict
from . import inputstream
from . import tokenizer
@ -10,14 +11,12 @@ from . import treebuilders
from .treebuilders._base import Marker
from . import utils
from . import constants
from .constants import spaceCharacters, asciiUpper2Lower
from .constants import specialElements
from .constants import headingElements
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
from .constants import (
spaceCharacters, asciiUpper2Lower, specialElements, headingElements,
cdataElements, rcdataElements, tokenTypes, tagTokenTypes, ReparseException, namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap, adjustSVGAttributes,
adjustMathMLAttributes)
def parse(doc, treebuilder="etree", encoding=None,
@ -255,96 +254,18 @@ class HTMLParser(object):
""" HTML5 specific normalizations to the token stream """
if token["type"] == tokenTypes["StartTag"]:
token["data"] = dict(token["data"][::-1])
token["data"] = OrderedDict(token['data'])
return token
def adjustMathMLAttributes(self, token):
replacements = {"definitionurl": "definitionURL"}
for k, v in replacements.items():
if k in token["data"]:
token["data"][v] = token["data"][k]
del token["data"][k]
adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
replacements = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
for originalName in list(token["data"].keys()):
if originalName in replacements:
svgName = replacements[originalName]
token["data"][svgName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
replacements = adjustForeignAttributesMap
for originalName in token["data"].keys():
if originalName in replacements:
foreignName = replacements[originalName]
token["data"][foreignName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
self.parser.phase()
@ -424,7 +345,7 @@ def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
constants.tokenTypes.items())
tokenTypes.items())
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
@ -433,7 +354,7 @@ def getPhases(debug):
info = {"type": type_names[token['type']]}
except:
raise
if token['type'] in constants.tagTokenTypes:
if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
@ -2721,6 +2642,10 @@ def getPhases(debug):
# XXX after after frameset
}
def adjust_attributes(token, replacements):
if frozenset(token['data']) & frozenset(replacements):
token['data'] = OrderedDict(
(replacements.get(k, k), v) for k, v in token['data'].iteritems())
class ParseError(Exception):