mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speed up parsing by only special casing non-XML safe attributes
This commit is contained in:
parent
2462f6b832
commit
a228d95678
@ -9,7 +9,6 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
import copy, re, warnings
|
import copy, re, warnings
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from bisect import bisect
|
from bisect import bisect
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase
|
||||||
|
|
||||||
@ -28,6 +27,7 @@ to_xml_name = infoset_filter.toXmlName
|
|||||||
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg', 'xlink')}
|
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg', 'xlink')}
|
||||||
html_ns = namespaces['html']
|
html_ns = namespaces['html']
|
||||||
xlink_ns = namespaces['xlink']
|
xlink_ns = namespaces['xlink']
|
||||||
|
xml_ns = namespaces['xmlns']
|
||||||
|
|
||||||
class NamespacedHTMLPresent(ValueError):
|
class NamespacedHTMLPresent(ValueError):
|
||||||
|
|
||||||
@ -203,71 +203,106 @@ def create_lxml_context():
|
|||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
def process_attribs(attrs, nsmap):
|
def clean_attrib(name, val, nsmap, attrib, namespaced_attribs):
|
||||||
attrib_name_map = {}
|
|
||||||
namespaced_attribs = {}
|
|
||||||
xmlns = namespaces['xmlns']
|
|
||||||
for k, v in attrs.iteritems():
|
|
||||||
if isinstance(k, tuple):
|
|
||||||
if k[2] == xmlns:
|
|
||||||
prefix, name, ns = k
|
|
||||||
if prefix is None:
|
|
||||||
nsmap[None] = v
|
|
||||||
else:
|
|
||||||
nsmap[name] = v
|
|
||||||
else:
|
|
||||||
if k[2] == xlink_ns and 'xlink' not in nsmap:
|
|
||||||
for prefix, ns in tuple(nsmap.iteritems()):
|
|
||||||
if ns == xlink_ns:
|
|
||||||
del nsmap[prefix]
|
|
||||||
nsmap['xlink'] = xlink_ns
|
|
||||||
attrib_name_map[k] = '{%s}%s' % (k[2], k[1])
|
|
||||||
else:
|
|
||||||
if ':' in k:
|
|
||||||
if k.startswith('xmlns') and (k.startswith('xmlns:') or k == 'xmlns'):
|
|
||||||
prefix = k.partition(':')[2] or None
|
|
||||||
if prefix is not None:
|
|
||||||
# Use an existing prefix for this namespace, if
|
|
||||||
# possible
|
|
||||||
existing = {x:k for k, x in nsmap.iteritems()}.get(v, False)
|
|
||||||
if existing is not False:
|
|
||||||
prefix = existing
|
|
||||||
nsmap[prefix] = v
|
|
||||||
else:
|
|
||||||
namespaced_attribs[k] = v
|
|
||||||
else:
|
|
||||||
attrib_name_map[k] = k
|
|
||||||
|
|
||||||
xml_lang = None
|
if isinstance(name, tuple):
|
||||||
for k, v in namespaced_attribs.iteritems():
|
prefix, name, ns = name
|
||||||
prefix, name = k.partition(':')[0::2]
|
if ns == xml_ns:
|
||||||
|
if prefix is None:
|
||||||
|
nsmap[None] = val
|
||||||
|
else:
|
||||||
|
nsmap[name] = val
|
||||||
|
return None, True
|
||||||
|
nsmap_changed = False
|
||||||
|
if ns == xlink_ns and 'xlink' not in nsmap:
|
||||||
|
for prefix, nns in tuple(nsmap.iteritems()):
|
||||||
|
if nns == xlink_ns:
|
||||||
|
del nsmap[prefix]
|
||||||
|
nsmap['xlink'] = xlink_ns
|
||||||
|
nsmap_changed = True
|
||||||
|
return ('{%s}%s' % (ns, name)), nsmap_changed
|
||||||
|
|
||||||
|
if ':' in name:
|
||||||
|
prefix, name = name.partition(':')[0::2]
|
||||||
|
if prefix == 'xmlns':
|
||||||
|
# Use an existing prefix for this namespace, if
|
||||||
|
# possible
|
||||||
|
existing = {x:k for k, x in nsmap.iteritems()}.get(val, False)
|
||||||
|
if existing is not False:
|
||||||
|
name = existing
|
||||||
|
nsmap[name] = val
|
||||||
|
return None, True
|
||||||
if prefix == 'xml':
|
if prefix == 'xml':
|
||||||
if name == 'lang':
|
if name != 'lang' or name in attrib:
|
||||||
xml_lang = v
|
return None, False
|
||||||
continue
|
return name, False
|
||||||
|
|
||||||
ns = nsmap.get(prefix, None)
|
ns = nsmap.get(prefix, None)
|
||||||
if ns is not None:
|
if ns is None:
|
||||||
name = '{%s}%s' % (ns, name)
|
namespaced_attribs[(prefix, name)] = val
|
||||||
attrib_name_map[k] = name
|
return None, True
|
||||||
|
return '{%s}%s' % (ns, name), False
|
||||||
|
|
||||||
ans = OrderedDict((attrib_name_map.get(k, None), v) for k, v in attrs.iteritems())
|
return name, False
|
||||||
ans.pop(None, None)
|
|
||||||
if xml_lang:
|
|
||||||
ans['lang'] = ans.get('lang', xml_lang)
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def makeelement_ns(ctx, namespace, name, attrib, nsmap):
|
def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
|
||||||
|
nns = attrib.pop('xmlns', None)
|
||||||
|
if nns is not None:
|
||||||
|
nsmap[None] = nns
|
||||||
try:
|
try:
|
||||||
elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap)
|
elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap)
|
elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap)
|
||||||
# Unfortunately, lxml randomizes attrib order if passed in the makeelement
|
# Unfortunately, lxml randomizes attrib order if passed in the makeelement
|
||||||
# constructor, therefore they have to be set one by one.
|
# constructor, therefore they have to be set one by one.
|
||||||
|
nsmap_changed = False
|
||||||
|
namespaced_attribs = {}
|
||||||
for k, v in attrib.iteritems():
|
for k, v in attrib.iteritems():
|
||||||
try:
|
try:
|
||||||
elem.set(k, v)
|
elem.set(k, v)
|
||||||
except ValueError:
|
except (ValueError, TypeError):
|
||||||
elem.set(to_xml_name(k), v)
|
k, is_namespace = clean_attrib(k, v, nsmap, attrib, namespaced_attribs)
|
||||||
|
nsmap_changed |= is_namespace
|
||||||
|
if k is not None:
|
||||||
|
try:
|
||||||
|
elem.set(k, v)
|
||||||
|
except ValueError:
|
||||||
|
elem.set(to_xml_name(k), v)
|
||||||
|
if nsmap_changed:
|
||||||
|
nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
|
||||||
|
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
||||||
|
nelem.set(k, v)
|
||||||
|
for (prefix, name), v in namespaced_attribs.iteritems():
|
||||||
|
ns = nsmap.get('prefix', None)
|
||||||
|
if ns is not None:
|
||||||
|
try:
|
||||||
|
nelem.set('{%s}%s' % (ns, name), v)
|
||||||
|
except ValueError:
|
||||||
|
nelem.set('{%s}%s' % (ns, to_xml_name(name)), v)
|
||||||
|
else:
|
||||||
|
nelem.set(to_xml_name('%s:%s' % (prefix, name)), v)
|
||||||
|
elem = nelem
|
||||||
|
|
||||||
|
# Handle namespace prefixed tag names
|
||||||
|
if prefix is not None:
|
||||||
|
namespace = nsmap.get(prefix, None)
|
||||||
|
if namespace is not None and namespace != elem.nsmap[elem.prefix]:
|
||||||
|
nelem = ctx.makeelement('{%s}%s' %(nsmap[prefix], elem.tag.rpartition('}')[2]), nsmap=nsmap)
|
||||||
|
for k, v in elem.items():
|
||||||
|
nelem.set(k, v)
|
||||||
|
elem = nelem
|
||||||
|
|
||||||
|
# Ensure that svg and mathml elements get no namespace prefixes
|
||||||
|
if elem.prefix is not None and namespace in known_namespaces:
|
||||||
|
for k, v in tuple(nsmap.iteritems()):
|
||||||
|
if v == namespace:
|
||||||
|
del nsmap[k]
|
||||||
|
nsmap[None] = namespace
|
||||||
|
nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
|
||||||
|
for k, v in elem.items():
|
||||||
|
nelem.set(k, v)
|
||||||
|
elem = nelem
|
||||||
|
|
||||||
return elem
|
return elem
|
||||||
|
|
||||||
class TreeBuilder(BaseTreeBuilder):
|
class TreeBuilder(BaseTreeBuilder):
|
||||||
@ -297,26 +332,15 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
def createElement(self, token, nsmap=None):
|
def createElement(self, token, nsmap=None):
|
||||||
"""Create an element but don't insert it anywhere"""
|
"""Create an element but don't insert it anywhere"""
|
||||||
nsmap = nsmap or {}
|
nsmap = nsmap or {}
|
||||||
attribs = process_attribs(token['data'], nsmap)
|
|
||||||
name = token_name = token["name"]
|
name = token_name = token["name"]
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
|
prefix = None
|
||||||
if ':' in name:
|
if ':' in name:
|
||||||
if name.endswith(':html'):
|
if name.endswith(':html'):
|
||||||
raise NamespacedHTMLPresent(name.rpartition(':')[0])
|
raise NamespacedHTMLPresent(name.rpartition(':')[0])
|
||||||
prefix, name = name.partition(':')[0::2]
|
prefix, name = name.partition(':')[0::2]
|
||||||
namespace = nsmap.get(prefix, namespace)
|
namespace = nsmap.get(prefix, namespace)
|
||||||
elem = makeelement_ns(self.lxml_context, namespace, name, attribs, nsmap)
|
elem = makeelement_ns(self.lxml_context, namespace, prefix, name, token['data'], nsmap)
|
||||||
|
|
||||||
# Ensure that svg and mathml elements get no namespace prefixes
|
|
||||||
if elem.prefix is not None and namespace in known_namespaces:
|
|
||||||
for k, v in tuple(nsmap.iteritems()):
|
|
||||||
if v == namespace:
|
|
||||||
del nsmap[k]
|
|
||||||
nsmap[None] = namespace
|
|
||||||
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
|
|
||||||
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
|
||||||
nelem.set(k, v)
|
|
||||||
elem = nelem
|
|
||||||
|
|
||||||
# Keep a reference to elem so that lxml does not delete and re-create
|
# Keep a reference to elem so that lxml does not delete and re-create
|
||||||
# it, losing the name related attributes
|
# it, losing the name related attributes
|
||||||
@ -366,54 +390,39 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
if not attrs:
|
if not attrs:
|
||||||
return
|
return
|
||||||
html = self.openElements[0]
|
html = self.openElements[0]
|
||||||
nsmap = html.nsmap.copy()
|
for k, v in attrs.iteritems():
|
||||||
attribs = process_attribs(attrs, nsmap)
|
if k not in html.attrib and k != 'xmlns':
|
||||||
for k, v in attribs.iteritems():
|
|
||||||
if k not in html.attrib:
|
|
||||||
try:
|
try:
|
||||||
html.set(k, v)
|
html.set(k, v)
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
if k == 'xml:lang' and 'lang' not in html.attrib:
|
||||||
|
k = 'lang'
|
||||||
html.set(to_xml_name(k), v)
|
html.set(to_xml_name(k), v)
|
||||||
if nsmap != html.nsmap:
|
|
||||||
newroot = self.lxml_context.makeelement(html.tag, attrib=html.attrib, nsmap=nsmap)
|
|
||||||
self.proxy_cache.append(newroot)
|
|
||||||
newroot.name, newroot.namespace, newroot.nameTuple = html.name, html.namespace, html.nameTuple
|
|
||||||
self.openElements[0] = newroot
|
|
||||||
if self.document.root is html:
|
|
||||||
self.document.root = newroot
|
|
||||||
if len(html) > 0:
|
|
||||||
# TODO: the nsmap changes need to be propagated down the tree
|
|
||||||
for child in html:
|
|
||||||
newroot.append(copy.copy(child))
|
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
def apply_body_attributes(self, attrs):
|
||||||
|
if not attrs:
|
||||||
|
return
|
||||||
body = self.openElements[1]
|
body = self.openElements[1]
|
||||||
nsmap = body.nsmap.copy()
|
for k, v in attrs.iteritems():
|
||||||
attribs = process_attribs(attrs, nsmap)
|
if k not in body.attrib and k !='xmlns':
|
||||||
for k, v in attribs.iteritems():
|
|
||||||
if k not in body.attrib:
|
|
||||||
try:
|
try:
|
||||||
body.set(k, v)
|
body.set(k, v)
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
if k == 'xml:lang' and 'lang' not in body.attrib:
|
||||||
|
k = 'lang'
|
||||||
body.set(to_xml_name(k), v)
|
body.set(to_xml_name(k), v)
|
||||||
# We ignore xmlns attributes on non-first <body> tags
|
|
||||||
|
|
||||||
def insertComment(self, token, parent=None):
|
def insertComment(self, token, parent=None):
|
||||||
if parent is None:
|
if parent is None:
|
||||||
parent = self.openElements[-1]
|
parent = self.openElements[-1]
|
||||||
parent.appendChild(Comment(token["data"].replace('--', '- -')))
|
parent.appendChild(Comment(token["data"].replace('--', '- -')))
|
||||||
|
|
||||||
def process_namespace_free_attribs(attrs):
|
|
||||||
anm = {k:k for k, v in attrs.iteritems() if ':' not in k}
|
|
||||||
for k in frozenset(attrs) - frozenset(anm):
|
|
||||||
prefix, name = k.partition(':')[0::2]
|
|
||||||
if prefix != 'xmlns' and name not in anm:
|
|
||||||
anm[name] = k
|
|
||||||
ans = OrderedDict((anm.get(k, None), v) for k, v in attrs.iteritems())
|
|
||||||
ans.pop(None, None)
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def makeelement(ctx, name, attrib):
|
def makeelement(ctx, name, attrib):
|
||||||
|
attrib.pop('xmlns', None)
|
||||||
try:
|
try:
|
||||||
elem = ctx.makeelement(name)
|
elem = ctx.makeelement(name)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@ -421,7 +430,11 @@ def makeelement(ctx, name, attrib):
|
|||||||
for k, v in attrib.iteritems():
|
for k, v in attrib.iteritems():
|
||||||
try:
|
try:
|
||||||
elem.set(k, v)
|
elem.set(k, v)
|
||||||
|
except TypeError:
|
||||||
|
elem.set(to_xml_name(k[1]), v)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
if k == 'xml:lang' and 'lang' not in attrib:
|
||||||
|
k = 'lang'
|
||||||
elem.set(to_xml_name(k), v)
|
elem.set(to_xml_name(k), v)
|
||||||
return elem
|
return elem
|
||||||
|
|
||||||
@ -436,8 +449,7 @@ class NoNamespaceTreeBuilder(TreeBuilder):
|
|||||||
|
|
||||||
def createElement(self, token, nsmap=None):
|
def createElement(self, token, nsmap=None):
|
||||||
name = token['name'].rpartition(':')[2]
|
name = token['name'].rpartition(':')[2]
|
||||||
attribs = process_namespace_free_attribs(token['data'])
|
elem = makeelement(self.lxml_context, name, token['data'])
|
||||||
elem = makeelement(self.lxml_context, name, attribs)
|
|
||||||
# Keep a reference to elem so that lxml does not delete and re-create
|
# Keep a reference to elem so that lxml does not delete and re-create
|
||||||
# it, losing _namespace
|
# it, losing _namespace
|
||||||
self.proxy_cache.append(elem)
|
self.proxy_cache.append(elem)
|
||||||
@ -458,24 +470,26 @@ class NoNamespaceTreeBuilder(TreeBuilder):
|
|||||||
if not attrs:
|
if not attrs:
|
||||||
return
|
return
|
||||||
html = self.openElements[0]
|
html = self.openElements[0]
|
||||||
attribs = process_namespace_free_attribs(attrs)
|
for k, v in attrs.iteritems():
|
||||||
for k, v in attribs.iteritems():
|
if k not in html.attrib and k != 'xmlns':
|
||||||
if k not in html.attrib:
|
|
||||||
try:
|
try:
|
||||||
html.set(k, v)
|
html.set(k, v)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
if k == 'xml:lang' and 'lang' not in html.attrib:
|
||||||
|
k = 'lang'
|
||||||
html.set(to_xml_name(k), v)
|
html.set(to_xml_name(k), v)
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
def apply_body_attributes(self, attrs):
|
||||||
if not attrs:
|
if not attrs:
|
||||||
return
|
return
|
||||||
body = self.openElements[1]
|
body = self.openElements[1]
|
||||||
attribs = process_namespace_free_attribs(attrs)
|
for k, v in attrs.iteritems():
|
||||||
for k, v in attribs.iteritems():
|
if k not in body.attrib and k != 'xmlns':
|
||||||
if k not in body.attrib:
|
|
||||||
try:
|
try:
|
||||||
body.set(k, v)
|
body.set(k, v)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
if k == 'xml:lang' and 'lang' not in body.attrib:
|
||||||
|
k = 'lang'
|
||||||
body.set(to_xml_name(k), v)
|
body.set(to_xml_name(k), v)
|
||||||
|
|
||||||
# Input Stream {{{
|
# Input Stream {{{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user