From dd676227b83fd8a4b648d8ec9ba0a764a748ee7f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 26 Oct 2013 14:57:42 +0530 Subject: [PATCH] Preserve attribute order when parsing --- src/calibre/ebooks/oeb/polish/parsing.py | 78 +++++++++---- .../ebooks/oeb/polish/tests/parsing.py | 17 ++- src/html5lib/constants.py | 67 +++++++++++ src/html5lib/html5parser.py | 109 +++--------------- 4 files changed, 153 insertions(+), 118 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index e6a56a2591..85843c1441 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -9,6 +9,7 @@ __copyright__ = '2013, Kovid Goyal ' import copy, re, warnings from functools import partial from bisect import bisect +from collections import OrderedDict from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase @@ -203,7 +204,7 @@ def create_lxml_context(): # }}} def process_attribs(attrs, nsmap): - attribs = {} + attrib_name_map = {} namespaced_attribs = {} xmlns = namespaces['xmlns'] for k, v in attrs.iteritems(): @@ -220,7 +221,7 @@ def process_attribs(attrs, nsmap): if ns == xlink_ns: del nsmap[prefix] nsmap['xlink'] = xlink_ns - attribs['{%s}%s' % (k[2], k[1])] = v + attrib_name_map[k] = '{%s}%s' % (k[2], k[1]) else: if ':' in k: if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'): @@ -228,27 +229,46 @@ def process_attribs(attrs, nsmap): if prefix is not None: # Use an existing prefix for this namespace, if # possible - existing = {v:k for k, v in nsmap.iteritems()}.get(v, False) + existing = {x:k for k, x in nsmap.iteritems()}.get(v, False) if existing is not False: prefix = existing nsmap[prefix] = v else: namespaced_attribs[k] = v else: - attribs[k] = v + attrib_name_map[k] = k + xml_lang = None for k, v in namespaced_attribs.iteritems(): prefix, name = k.partition(':')[0::2] if prefix == 'xml': if name == 'lang': - attribs['lang'] = attribs.get('lang', v) + xml_lang = v continue ns = nsmap.get(prefix, None) if ns is not None: name = '{%s}%s' % (ns, name) - attribs[name] =v + attrib_name_map[k] = name - return attribs + ans = OrderedDict((attrib_name_map.get(k, None), v) for k, v in attrs.iteritems()) + ans.pop(None, None) + if xml_lang: + ans['lang'] = ans.get('lang', xml_lang) + return ans + +def makeelement_ns(ctx, namespace, name, attrib, nsmap): + try: + elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap) + except ValueError: + elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap) + # Unfortunately, lxml randomizes attrib order if passed in the makeelement + # constructor, therefore they have to be set one by one. + for k, v in attrib.iteritems(): + try: + elem.set(k, v) + except ValueError: + elem.set(to_xml_name(k), v) + return elem class TreeBuilder(BaseTreeBuilder): @@ -285,11 +305,7 @@ class TreeBuilder(BaseTreeBuilder): raise NamespacedHTMLPresent(name.rpartition(':')[0]) prefix, name = name.partition(':')[0::2] namespace = nsmap.get(prefix, namespace) - try: - elem = self.lxml_context.makeelement('{%s}%s' % (namespace, name), attrib=attribs, nsmap=nsmap) - except ValueError: - attribs = {to_xml_name(k):v for k, v in attribs.iteritems()} - elem = self.lxml_context.makeelement('{%s}%s' % (namespace, to_xml_name(name)), attrib=attribs, nsmap=nsmap) + elem = makeelement_ns(self.lxml_context, namespace, name, attribs, nsmap) # Ensure that svg and mathml elements get no namespace prefixes if elem.prefix is not None and namespace in known_namespaces: @@ -297,7 +313,10 @@ class TreeBuilder(BaseTreeBuilder): if v == namespace: del nsmap[k] nsmap[None] = namespace - elem = self.lxml_context.makeelement(elem.tag, attrib=elem.attrib, nsmap=nsmap) + nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap) + for k, v in elem.items(): # Only elem.items() preserves attrib order + nelem.set(k, v) + elem = nelem # Keep a reference to elem so that lxml does not delete and re-create # it, losing the name related attributes @@ -385,12 +404,26 @@ class TreeBuilder(BaseTreeBuilder): parent.appendChild(Comment(token["data"].replace('--', '- -'))) def process_namespace_free_attribs(attrs): - attribs = {k:v for k, v in attrs.iteritems() if ':' not in k} - for k in set(attrs) - set(attribs): + anm = {k:k for k, v in attrs.iteritems() if ':' not in k} + for k in frozenset(attrs) - frozenset(anm): prefix, name = k.partition(':')[0::2] - if prefix != 'xmlns' and name not in attribs: - attribs[name] = attrs[k] - return attribs + if prefix != 'xmlns' and name not in anm: + anm[name] = k + ans = OrderedDict((anm.get(k, None), v) for k, v in attrs.iteritems()) + ans.pop(None, None) + return ans + +def makeelement(ctx, name, attrib): + try: + elem = ctx.makeelement(name) + except ValueError: + elem = ctx.makeelement(to_xml_name(name)) + for k, v in attrib.iteritems(): + try: + elem.set(k, v) + except ValueError: + elem.set(to_xml_name(k), v) + return elem class NoNamespaceTreeBuilder(TreeBuilder): @@ -404,11 +437,7 @@ class NoNamespaceTreeBuilder(TreeBuilder): def createElement(self, token, nsmap=None): name = token['name'].rpartition(':')[2] attribs = process_namespace_free_attribs(token['data']) - try: - elem = self.lxml_context.makeelement(name, attrib=attribs) - except ValueError: - attribs = {to_xml_name(k):v for k, v in attribs.iteritems()} - elem = self.lxml_context.makeelement(to_xml_name(name), attrib=attribs) + elem = makeelement(self.lxml_context, name, attribs) # Keep a reference to elem so that lxml does not delete and re-create # it, losing _namespace self.proxy_cache.append(elem) @@ -551,8 +580,7 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=Tr if __name__ == '__main__': from lxml import etree - root = parse('\na\n

 \nb', discard_namespaces=False) - # root = parse('\n

 \nxxx', discard_namespaces=False) + root = parse('\na\n

 \nb', discard_namespaces=False) print (etree.tostring(root, encoding='utf-8')) print() diff --git a/src/calibre/ebooks/oeb/polish/tests/parsing.py b/src/calibre/ebooks/oeb/polish/tests/parsing.py index ec65fe6d51..0346821745 100644 --- a/src/calibre/ebooks/oeb/polish/tests/parsing.py +++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py @@ -135,7 +135,15 @@ def multiple_html_and_body(test, parse_function): test.assertEqual(len(XPath('//h:html[@id and @lang]')(root)), 1, err) test.assertEqual(len(XPath('//h:body[@id and @lang]')(root)), 1, err) -basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, case_insensitive_element_names, entities, multiple_html_and_body) +def attribute_replacement(test, parse_function): + markup = '' % SVG_NS + root = parse_function(markup) + err = 'SVG attributes not normalized, parsed markup:\n' + etree.tostring(root) + test.assertEqual(len(XPath('//svg:svg[@viewBox]')(root)), 2, err) + +basic_checks = (nonvoid_cdata_elements, namespaces, space_characters, + case_insensitive_element_names, entities, + multiple_html_and_body, attribute_replacement) class ParsingTests(BaseTest): @@ -160,3 +168,10 @@ class ParsingTests(BaseTest): elem = root.xpath('//*[local-name()="%s"]' % tag)[0] self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src)) + for ds in (False, True): + src = '\n\n

\n' + root = parse(src, discard_namespaces=ds) + for tag in ('p', 'svg'): + for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()): + self.assertEqual(i+1, int(v)) + diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py index e7089846d5..6d5ccb176b 100644 --- a/src/html5lib/constants.py +++ b/src/html5lib/constants.py @@ -433,6 +433,73 @@ mathmlTextIntegrationPointElements = frozenset(( (namespaces["mathml"], "mtext") )) +adjustSVGAttributes = { + "attributename": "attributeName", + "attributetype": "attributeType", + "basefrequency": "baseFrequency", + "baseprofile": "baseProfile", + "calcmode": "calcMode", + "clippathunits": "clipPathUnits", + "contentscripttype": "contentScriptType", + "contentstyletype": "contentStyleType", + "diffuseconstant": "diffuseConstant", + "edgemode": "edgeMode", + "externalresourcesrequired": "externalResourcesRequired", + "filterres": "filterRes", + "filterunits": "filterUnits", + "glyphref": "glyphRef", + "gradienttransform": "gradientTransform", + "gradientunits": "gradientUnits", + "kernelmatrix": "kernelMatrix", + "kernelunitlength": "kernelUnitLength", + "keypoints": "keyPoints", + "keysplines": "keySplines", + "keytimes": "keyTimes", + "lengthadjust": "lengthAdjust", + "limitingconeangle": "limitingConeAngle", + "markerheight": "markerHeight", + "markerunits": "markerUnits", + "markerwidth": "markerWidth", + "maskcontentunits": "maskContentUnits", + "maskunits": "maskUnits", + "numoctaves": "numOctaves", + "pathlength": "pathLength", + "patterncontentunits": "patternContentUnits", + "patterntransform": "patternTransform", + "patternunits": "patternUnits", + "pointsatx": "pointsAtX", + "pointsaty": "pointsAtY", + "pointsatz": "pointsAtZ", + "preservealpha": "preserveAlpha", + "preserveaspectratio": "preserveAspectRatio", + "primitiveunits": "primitiveUnits", + "refx": "refX", + "refy": "refY", + "repeatcount": "repeatCount", + "repeatdur": "repeatDur", + "requiredextensions": "requiredExtensions", + "requiredfeatures": "requiredFeatures", + "specularconstant": "specularConstant", + "specularexponent": "specularExponent", + "spreadmethod": "spreadMethod", + "startoffset": "startOffset", + "stddeviation": "stdDeviation", + "stitchtiles": "stitchTiles", + "surfacescale": "surfaceScale", + "systemlanguage": "systemLanguage", + "tablevalues": "tableValues", + "targetx": "targetX", + "targety": "targetY", + "textlength": "textLength", + "viewbox": "viewBox", + "viewtarget": "viewTarget", + "xchannelselector": "xChannelSelector", + "ychannelselector": "yChannelSelector", + "zoomandpan": "zoomAndPan" +} + +adjustMathMLAttributes = {"definitionurl": "definitionURL"} + adjustForeignAttributes = { "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py index 49d290e37c..b96699c26d 100644 --- a/src/html5lib/html5parser.py +++ b/src/html5lib/html5parser.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass import types +from collections import OrderedDict from . import inputstream from . import tokenizer @@ -10,14 +11,12 @@ from . import treebuilders from .treebuilders._base import Marker from . import utils -from . import constants -from .constants import spaceCharacters, asciiUpper2Lower -from .constants import specialElements -from .constants import headingElements -from .constants import cdataElements, rcdataElements -from .constants import tokenTypes, ReparseException, namespaces -from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements -from .constants import adjustForeignAttributes as adjustForeignAttributesMap +from .constants import ( + spaceCharacters, asciiUpper2Lower, specialElements, headingElements, + cdataElements, rcdataElements, tokenTypes, tagTokenTypes, ReparseException, namespaces, + htmlIntegrationPointElements, mathmlTextIntegrationPointElements, + adjustForeignAttributes as adjustForeignAttributesMap, adjustSVGAttributes, + adjustMathMLAttributes) def parse(doc, treebuilder="etree", encoding=None, @@ -255,96 +254,18 @@ class HTMLParser(object): """ HTML5 specific normalizations to the token stream """ if token["type"] == tokenTypes["StartTag"]: - token["data"] = dict(token["data"][::-1]) + token["data"] = OrderedDict(token['data']) return token def adjustMathMLAttributes(self, token): - replacements = {"definitionurl": "definitionURL"} - for k, v in replacements.items(): - if k in token["data"]: - token["data"][v] = token["data"][k] - del token["data"][k] + adjust_attributes(token, adjustMathMLAttributes) def adjustSVGAttributes(self, token): - replacements = { - "attributename": "attributeName", - "attributetype": "attributeType", - "basefrequency": "baseFrequency", - "baseprofile": "baseProfile", - "calcmode": "calcMode", - "clippathunits": "clipPathUnits", - "contentscripttype": "contentScriptType", - "contentstyletype": "contentStyleType", - "diffuseconstant": "diffuseConstant", - "edgemode": "edgeMode", - "externalresourcesrequired": "externalResourcesRequired", - "filterres": "filterRes", - "filterunits": "filterUnits", - "glyphref": "glyphRef", - "gradienttransform": "gradientTransform", - "gradientunits": "gradientUnits", - "kernelmatrix": "kernelMatrix", - "kernelunitlength": "kernelUnitLength", - "keypoints": "keyPoints", - "keysplines": "keySplines", - "keytimes": "keyTimes", - "lengthadjust": "lengthAdjust", - "limitingconeangle": "limitingConeAngle", - "markerheight": "markerHeight", - "markerunits": "markerUnits", - "markerwidth": "markerWidth", - "maskcontentunits": "maskContentUnits", - "maskunits": "maskUnits", - "numoctaves": "numOctaves", - "pathlength": "pathLength", - "patterncontentunits": "patternContentUnits", - "patterntransform": "patternTransform", - "patternunits": "patternUnits", - "pointsatx": "pointsAtX", - "pointsaty": "pointsAtY", - "pointsatz": "pointsAtZ", - "preservealpha": "preserveAlpha", - "preserveaspectratio": "preserveAspectRatio", - "primitiveunits": "primitiveUnits", - "refx": "refX", - "refy": "refY", - "repeatcount": "repeatCount", - "repeatdur": "repeatDur", - "requiredextensions": "requiredExtensions", - "requiredfeatures": "requiredFeatures", - "specularconstant": "specularConstant", - "specularexponent": "specularExponent", - "spreadmethod": "spreadMethod", - "startoffset": "startOffset", - "stddeviation": "stdDeviation", - "stitchtiles": "stitchTiles", - "surfacescale": "surfaceScale", - "systemlanguage": "systemLanguage", - "tablevalues": "tableValues", - "targetx": "targetX", - "targety": "targetY", - "textlength": "textLength", - "viewbox": "viewBox", - "viewtarget": "viewTarget", - "xchannelselector": "xChannelSelector", - "ychannelselector": "yChannelSelector", - "zoomandpan": "zoomAndPan" - } - for originalName in list(token["data"].keys()): - if originalName in replacements: - svgName = replacements[originalName] - token["data"][svgName] = token["data"][originalName] - del token["data"][originalName] + adjust_attributes(token, adjustSVGAttributes) def adjustForeignAttributes(self, token): - replacements = adjustForeignAttributesMap - - for originalName in token["data"].keys(): - if originalName in replacements: - foreignName = replacements[originalName] - token["data"][foreignName] = token["data"][originalName] - del token["data"][originalName] + adjust_attributes(token, adjustForeignAttributesMap) def reparseTokenNormal(self, token): self.parser.phase() @@ -424,7 +345,7 @@ def getPhases(debug): def log(function): """Logger that records which phase processes each token""" type_names = dict((value, key) for key, value in - constants.tokenTypes.items()) + tokenTypes.items()) def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: @@ -433,7 +354,7 @@ def getPhases(debug): info = {"type": type_names[token['type']]} except: raise - if token['type'] in constants.tagTokenTypes: + if token['type'] in tagTokenTypes: info["name"] = token['name'] self.parser.log.append((self.parser.tokenizer.state.__name__, @@ -2721,6 +2642,10 @@ def getPhases(debug): # XXX after after frameset } +def adjust_attributes(token, replacements): + if frozenset(token['data']) & frozenset(replacements): + token['data'] = OrderedDict( + (replacements.get(k, k), v) for k, v in token['data'].iteritems()) class ParseError(Exception):