mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use html5-parser as the polish parser. Remove forked html5lib
This commit is contained in:
parent
2b78277799
commit
25a23b8951
@ -6,659 +6,29 @@ from __future__ import (unicode_literals, division, absolute_import,
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
import copy, re, warnings
|
import re
|
||||||
from functools import partial
|
|
||||||
from bisect import bisect
|
|
||||||
|
|
||||||
from lxml.etree import ElementBase, XMLParser, ElementDefaultClassLookup, CommentBase, fromstring, Element as LxmlElement
|
from lxml.etree import XMLParser, fromstring, Element as LxmlElement
|
||||||
|
import html5_parser
|
||||||
from html5lib.constants import namespaces, tableInsertModeElements, EOF
|
|
||||||
from html5lib.treebuilders._base import TreeBuilder as BaseTreeBuilder
|
|
||||||
from html5lib.ihatexml import InfosetFilter, DataLossWarning
|
|
||||||
from html5lib.html5parser import HTMLParser
|
|
||||||
|
|
||||||
from calibre import xml_replace_entities
|
from calibre import xml_replace_entities
|
||||||
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
|
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
|
||||||
from calibre.ebooks.oeb.parse_utils import fix_self_closing_cdata_tags
|
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
|
||||||
infoset_filter = InfosetFilter()
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
to_xml_name = infoset_filter.toXmlName
|
|
||||||
known_namespaces = {namespaces[k]:k for k in ('mathml', 'svg', 'xlink')}
|
|
||||||
html_ns = namespaces['html']
|
|
||||||
xlink_ns = namespaces['xlink']
|
|
||||||
xml_ns = namespaces['xmlns']
|
|
||||||
|
|
||||||
|
|
||||||
class NamespacedHTMLPresent(ValueError):
|
|
||||||
|
|
||||||
def __init__(self, prefix):
|
|
||||||
ValueError.__init__(self, prefix)
|
|
||||||
self.prefix = prefix
|
|
||||||
|
|
||||||
# Nodes {{{
|
|
||||||
|
|
||||||
|
|
||||||
def ElementFactory(name, namespace=None, context=None):
|
|
||||||
context = context or create_lxml_context()
|
|
||||||
ns = namespace or namespaces['html']
|
|
||||||
try:
|
|
||||||
return context.makeelement('{%s}%s' % (ns, name), nsmap={None:ns})
|
|
||||||
except ValueError:
|
|
||||||
return context.makeelement('{%s}%s' % (ns, to_xml_name(name)), nsmap={None:ns})
|
|
||||||
|
|
||||||
|
|
||||||
class Element(ElementBase):
|
|
||||||
|
|
||||||
''' Implements the interface required by the html5lib tree builders (see
|
|
||||||
html5lib.treebuilders._base.Node) on top of the lxml ElementBase class '''
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
attrs = ''
|
|
||||||
if self.attrib:
|
|
||||||
attrs = ' ' + ' '.join('%s="%s"' % (k, v) for k, v in self.attrib.iteritems())
|
|
||||||
ns = self.tag.rpartition('}')[0][1:]
|
|
||||||
prefix = {v:k for k, v in self.nsmap.iteritems()}[ns] or ''
|
|
||||||
if prefix:
|
|
||||||
prefix += ':'
|
|
||||||
return '<%s%s%s (%s)>' % (prefix, getattr(self, 'name', self.tag), attrs, hex(id(self)))
|
|
||||||
__repr__ = __str__
|
|
||||||
|
|
||||||
@property
|
|
||||||
def attributes(self):
|
|
||||||
return self.attrib
|
|
||||||
|
|
||||||
@dynamic_property
|
|
||||||
def childNodes(self):
|
|
||||||
def fget(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def fset(self, val):
|
|
||||||
self[:] = list(val)
|
|
||||||
return property(fget=fget, fset=fset)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def parent(self):
|
|
||||||
return self.getparent()
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
return bool(self.text or len(self))
|
|
||||||
|
|
||||||
appendChild = ElementBase.append
|
|
||||||
removeChild = ElementBase.remove
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
ans = self.makeelement(self.tag, nsmap=self.nsmap, attrib=self.attrib)
|
|
||||||
for x in ('name', 'namespace', 'nameTuple'):
|
|
||||||
setattr(ans, x, getattr(self, x))
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def insertBefore(self, node, ref_node):
|
|
||||||
self.insert(self.index(ref_node), node)
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
def append_text(el, attr):
|
|
||||||
try:
|
|
||||||
setattr(el, attr, (getattr(el, attr) or '') + data)
|
|
||||||
except ValueError:
|
|
||||||
text = data.replace('\u000c', ' ')
|
|
||||||
try:
|
|
||||||
setattr(el, attr, (getattr(el, attr) or '') + text)
|
|
||||||
except ValueError:
|
|
||||||
setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text))
|
|
||||||
|
|
||||||
if len(self) == 0:
|
|
||||||
append_text(self, 'text')
|
|
||||||
elif insertBefore is None:
|
|
||||||
# Insert the text as the tail of the last child element
|
|
||||||
el = self[-1]
|
|
||||||
append_text(el, 'tail')
|
|
||||||
else:
|
|
||||||
# Insert the text before the specified node
|
|
||||||
index = self.index(insertBefore)
|
|
||||||
if index > 0:
|
|
||||||
el = self[index - 1]
|
|
||||||
append_text(el, 'tail')
|
|
||||||
else:
|
|
||||||
append_text(self, 'text')
|
|
||||||
|
|
||||||
def reparentChildren(self, new_parent):
|
|
||||||
# Move self.text
|
|
||||||
if len(new_parent) > 0:
|
|
||||||
el = new_parent[-1]
|
|
||||||
el.tail = (el.tail or '') + self.text
|
|
||||||
else:
|
|
||||||
if self.text:
|
|
||||||
new_parent.text = (new_parent.text or '') + self.text
|
|
||||||
self.text = None
|
|
||||||
for child in self:
|
|
||||||
new_parent.append(child)
|
|
||||||
|
|
||||||
|
|
||||||
class Comment(CommentBase):
|
|
||||||
|
|
||||||
@dynamic_property
|
|
||||||
def data(self):
|
|
||||||
def fget(self):
|
|
||||||
return self.text
|
|
||||||
|
|
||||||
def fset(self, val):
|
|
||||||
self.text = val.replace('--', '- -')
|
|
||||||
return property(fget=fget, fset=fset)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def parent(self):
|
|
||||||
return self.getparent()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def name(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def namespace(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def nameTuple(self):
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def childNodes(self):
|
|
||||||
return []
|
|
||||||
|
|
||||||
@property
|
|
||||||
def attributes(self):
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
return bool(self.text)
|
|
||||||
|
|
||||||
def no_op(self, *args, **kwargs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
appendChild = no_op
|
|
||||||
removeChild = no_op
|
|
||||||
insertBefore = no_op
|
|
||||||
reparentChildren = no_op
|
|
||||||
|
|
||||||
def insertText(self, text, insertBefore=None):
|
|
||||||
self.text = (self.text or '') + text.replace('--', '- -')
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return copy.copy(self)
|
|
||||||
|
|
||||||
|
|
||||||
class Document(object):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.root = None
|
|
||||||
self.doctype = None
|
|
||||||
|
|
||||||
def appendChild(self, child):
|
|
||||||
if isinstance(child, ElementBase):
|
|
||||||
self.root = child
|
|
||||||
elif isinstance(child, DocType):
|
|
||||||
self.doctype = child
|
|
||||||
|
|
||||||
|
|
||||||
class DocType(object):
|
|
||||||
|
|
||||||
def __init__(self, name, public_id, system_id):
|
|
||||||
self.text = self.name = name
|
|
||||||
self.public_id, self.system_id = public_id, system_id
|
|
||||||
|
|
||||||
|
|
||||||
def create_lxml_context():
|
|
||||||
parser = XMLParser(no_network=True)
|
|
||||||
parser.set_element_class_lookup(ElementDefaultClassLookup(element=Element, comment=Comment))
|
|
||||||
return parser
|
|
||||||
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
|
|
||||||
def clean_attrib(name, val, nsmap, attrib, namespaced_attribs):
|
|
||||||
|
|
||||||
if isinstance(name, tuple):
|
|
||||||
prefix, name, ns = name
|
|
||||||
if ns == xml_ns:
|
|
||||||
if prefix is None:
|
|
||||||
nsmap[None] = val
|
|
||||||
else:
|
|
||||||
nsmap[name] = val
|
|
||||||
return None, True
|
|
||||||
nsmap_changed = False
|
|
||||||
if ns == xlink_ns and 'xlink' not in nsmap:
|
|
||||||
for prefix, nns in tuple(nsmap.iteritems()):
|
|
||||||
if nns == xlink_ns:
|
|
||||||
del nsmap[prefix]
|
|
||||||
nsmap['xlink'] = xlink_ns
|
|
||||||
nsmap_changed = True
|
|
||||||
return ('{%s}%s' % (ns, name)), nsmap_changed
|
|
||||||
|
|
||||||
if ':' in name:
|
|
||||||
prefix, name = name.partition(':')[0::2]
|
|
||||||
if prefix == 'xmlns':
|
|
||||||
# Use an existing prefix for this namespace, if
|
|
||||||
# possible
|
|
||||||
existing = {x:k for k, x in nsmap.iteritems()}.get(val, False)
|
|
||||||
if existing is not False:
|
|
||||||
name = existing
|
|
||||||
nsmap[name] = val
|
|
||||||
return None, True
|
|
||||||
if prefix == 'xml':
|
|
||||||
if name != 'lang' or name in attrib:
|
|
||||||
return None, False
|
|
||||||
return name, False
|
|
||||||
|
|
||||||
ns = nsmap.get(prefix, None)
|
|
||||||
if ns is None:
|
|
||||||
namespaced_attribs[(prefix, name)] = val
|
|
||||||
return None, True
|
|
||||||
return '{%s}%s' % (ns, name), False
|
|
||||||
|
|
||||||
return name, False
|
|
||||||
|
|
||||||
|
|
||||||
def makeelement_ns(ctx, namespace, prefix, name, attrib, nsmap):
|
|
||||||
nns = attrib.pop('xmlns', None)
|
|
||||||
if nns is not None:
|
|
||||||
nsmap[None] = nns
|
|
||||||
try:
|
|
||||||
elem = ctx.makeelement('{%s}%s' % (namespace, name), nsmap=nsmap)
|
|
||||||
except ValueError:
|
|
||||||
elem = ctx.makeelement('{%s}%s' % (namespace, to_xml_name(name)), nsmap=nsmap)
|
|
||||||
# Unfortunately, lxml randomizes attrib order if passed in the makeelement
|
|
||||||
# constructor, therefore they have to be set one by one.
|
|
||||||
nsmap_changed = False
|
|
||||||
namespaced_attribs = {}
|
|
||||||
for k, v in attrib.iteritems():
|
|
||||||
try:
|
|
||||||
elem.set(k, v)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
k, is_namespace = clean_attrib(k, v, nsmap, attrib, namespaced_attribs)
|
|
||||||
nsmap_changed |= is_namespace
|
|
||||||
if k is not None:
|
|
||||||
try:
|
|
||||||
elem.set(k, v)
|
|
||||||
except ValueError:
|
|
||||||
elem.set(to_xml_name(k), v)
|
|
||||||
if nsmap_changed:
|
|
||||||
nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
|
|
||||||
for k, v in elem.items(): # Only elem.items() preserves attrib order
|
|
||||||
nelem.set(k, v)
|
|
||||||
for (prefix, name), v in namespaced_attribs.iteritems():
|
|
||||||
ns = nsmap.get(prefix, None)
|
|
||||||
if ns is not None:
|
|
||||||
try:
|
|
||||||
nelem.set('{%s}%s' % (ns, name), v)
|
|
||||||
except ValueError:
|
|
||||||
nelem.set('{%s}%s' % (ns, to_xml_name(name)), v)
|
|
||||||
else:
|
|
||||||
nelem.set(to_xml_name('%s:%s' % (prefix, name)), v)
|
|
||||||
elem = nelem
|
|
||||||
|
|
||||||
# Handle namespace prefixed tag names
|
|
||||||
if prefix is not None:
|
|
||||||
namespace = nsmap.get(prefix, None)
|
|
||||||
if namespace is not None and namespace != elem.nsmap[elem.prefix]:
|
|
||||||
nelem = ctx.makeelement('{%s}%s' %(nsmap[prefix], elem.tag.rpartition('}')[2]), nsmap=nsmap)
|
|
||||||
for k, v in elem.items():
|
|
||||||
nelem.set(k, v)
|
|
||||||
elem = nelem
|
|
||||||
|
|
||||||
# Ensure that svg and mathml elements get no namespace prefixes
|
|
||||||
if elem.prefix is not None and namespace in known_namespaces:
|
|
||||||
for k, v in tuple(nsmap.iteritems()):
|
|
||||||
if v == namespace:
|
|
||||||
del nsmap[k]
|
|
||||||
nsmap[None] = namespace
|
|
||||||
nelem = ctx.makeelement(elem.tag, nsmap=nsmap)
|
|
||||||
for k, v in elem.items():
|
|
||||||
nelem.set(k, v)
|
|
||||||
elem = nelem
|
|
||||||
|
|
||||||
return elem
|
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilder(BaseTreeBuilder):
|
|
||||||
|
|
||||||
elementClass = ElementFactory
|
|
||||||
documentClass = Document
|
|
||||||
doctypeClass = DocType
|
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements=True, linenumber_attribute=None):
|
|
||||||
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
|
||||||
self.linenumber_attribute = linenumber_attribute
|
|
||||||
self.lxml_context = create_lxml_context()
|
|
||||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
|
||||||
self.proxy_cache = []
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
return self.document.root
|
|
||||||
|
|
||||||
# The following methods are re-implementations from BaseTreeBuilder to
|
|
||||||
# handle namespaces properly.
|
|
||||||
|
|
||||||
def insertRoot(self, token):
|
|
||||||
element = self.createElement(token, nsmap={None:namespaces['html']})
|
|
||||||
self.openElements.append(element)
|
|
||||||
self.document.appendChild(element)
|
|
||||||
|
|
||||||
def promote_elem(self, elem, tag_name):
|
|
||||||
' Add the paraphernalia to elem that the html5lib infrastructure needs '
|
|
||||||
self.proxy_cache.append(elem)
|
|
||||||
elem.name = tag_name
|
|
||||||
elem.namespace = elem.nsmap[elem.prefix]
|
|
||||||
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
|
||||||
|
|
||||||
def createElement(self, token, nsmap=None):
|
|
||||||
"""Create an element but don't insert it anywhere"""
|
|
||||||
nsmap = nsmap or {}
|
|
||||||
name = token_name = token["name"]
|
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
|
||||||
prefix = None
|
|
||||||
if ':' in name:
|
|
||||||
if name.endswith(':html'):
|
|
||||||
raise NamespacedHTMLPresent(name.rpartition(':')[0])
|
|
||||||
prefix, name = name.partition(':')[0::2]
|
|
||||||
namespace = nsmap.get(prefix, namespace)
|
|
||||||
elem = makeelement_ns(self.lxml_context, namespace, prefix, name, token['data'], nsmap)
|
|
||||||
|
|
||||||
# Keep a reference to elem so that lxml does not delete and re-create
|
|
||||||
# it, losing the name related attributes
|
|
||||||
self.promote_elem(elem, token_name)
|
|
||||||
position = token.get('position', None)
|
|
||||||
if position is not None:
|
|
||||||
# Unfortunately, libxml2 can only store line numbers up to 65535
|
|
||||||
# (unsigned short). If you really need to workaround this, use the
|
|
||||||
# patch here:
|
|
||||||
# https://bug325533.bugzilla-attachments.gnome.org/attachment.cgi?id=56951
|
|
||||||
# (replacing int with size_t) and patching lxml correspondingly to
|
|
||||||
# get rid of the OverflowError
|
|
||||||
try:
|
|
||||||
elem.sourceline = position[0][0]
|
|
||||||
except OverflowError:
|
|
||||||
elem.sourceline = 65535
|
|
||||||
if self.linenumber_attribute is not None:
|
|
||||||
elem.set(self.linenumber_attribute, str(position[0][0]))
|
|
||||||
return elem
|
|
||||||
|
|
||||||
def insertElementNormal(self, token):
|
|
||||||
parent = self.openElements[-1]
|
|
||||||
element = self.createElement(token, parent.nsmap)
|
|
||||||
parent.appendChild(element)
|
|
||||||
self.openElements.append(element)
|
|
||||||
return element
|
|
||||||
|
|
||||||
def insertElementTable(self, token):
|
|
||||||
"""Create an element and insert it into the tree"""
|
|
||||||
if self.openElements[-1].name not in tableInsertModeElements:
|
|
||||||
return self.insertElementNormal(token)
|
|
||||||
# We should be in the InTable mode. This means we want to do
|
|
||||||
# special magic element rearranging
|
|
||||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
|
||||||
element = self.createElement(token, nsmap=parent.nsmap)
|
|
||||||
if insertBefore is None:
|
|
||||||
parent.appendChild(element)
|
|
||||||
else:
|
|
||||||
parent.insertBefore(element, insertBefore)
|
|
||||||
self.openElements.append(element)
|
|
||||||
return element
|
|
||||||
|
|
||||||
def clone_node(self, elem, nsmap_update):
|
|
||||||
assert len(elem) == 0
|
|
||||||
nsmap = elem.nsmap.copy()
|
|
||||||
nsmap.update(nsmap_update)
|
|
||||||
nelem = self.lxml_context.makeelement(elem.tag, nsmap=nsmap)
|
|
||||||
self.promote_elem(nelem, elem.tag.rpartition('}')[2])
|
|
||||||
nelem.sourceline = elem.sourceline
|
|
||||||
for k, v in elem.items():
|
|
||||||
nelem.set(k, v)
|
|
||||||
nelem.text, nelem.tail = elem.text, elem.tail
|
|
||||||
return nelem
|
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
|
||||||
if not attrs:
|
|
||||||
return
|
|
||||||
html = self.openElements[0]
|
|
||||||
for k, v in attrs.iteritems():
|
|
||||||
if k not in html.attrib and k != 'xmlns':
|
|
||||||
try:
|
|
||||||
html.set(k, v)
|
|
||||||
except TypeError:
|
|
||||||
pass
|
|
||||||
except ValueError:
|
|
||||||
if k == 'xmlns:xml':
|
|
||||||
continue
|
|
||||||
if k == 'xml:lang' and 'lang' not in html.attrib:
|
|
||||||
k = 'lang'
|
|
||||||
html.set(k, v)
|
|
||||||
continue
|
|
||||||
if k.startswith('xmlns:') and v not in known_namespaces and v != namespaces['html'] and len(html) == 0:
|
|
||||||
# We have a namespace declaration, the only way to add
|
|
||||||
# it to the existing html node is to replace it.
|
|
||||||
prefix = k[len('xmlns:'):]
|
|
||||||
if not prefix:
|
|
||||||
continue
|
|
||||||
self.openElements[0] = html = self.clone_node(html, {prefix:v})
|
|
||||||
self.document.appendChild(html)
|
|
||||||
else:
|
|
||||||
html.set(to_xml_name(k), v)
|
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
|
||||||
if not attrs:
|
|
||||||
return
|
|
||||||
body = self.openElements[1]
|
|
||||||
for k, v in attrs.iteritems():
|
|
||||||
if k not in body.attrib and k !='xmlns':
|
|
||||||
try:
|
|
||||||
body.set(k, v)
|
|
||||||
except TypeError:
|
|
||||||
pass
|
|
||||||
except ValueError:
|
|
||||||
if k == 'xmlns:xml':
|
|
||||||
continue
|
|
||||||
if k == 'xml:lang' and 'lang' not in body.attrib:
|
|
||||||
k = 'lang'
|
|
||||||
body.set(to_xml_name(k), v)
|
|
||||||
|
|
||||||
def insertComment(self, token, parent=None):
|
|
||||||
if parent is None:
|
|
||||||
parent = self.openElements[-1]
|
|
||||||
parent.appendChild(Comment(token["data"].replace('--', '- -')))
|
|
||||||
|
|
||||||
|
|
||||||
def makeelement(ctx, name, attrib):
|
|
||||||
attrib.pop('xmlns', None)
|
|
||||||
try:
|
|
||||||
elem = ctx.makeelement(name)
|
|
||||||
except ValueError:
|
|
||||||
elem = ctx.makeelement(to_xml_name(name))
|
|
||||||
for k, v in attrib.iteritems():
|
|
||||||
try:
|
|
||||||
elem.set(k, v)
|
|
||||||
except TypeError:
|
|
||||||
elem.set(to_xml_name(k[1]), v)
|
|
||||||
except ValueError:
|
|
||||||
if k == 'xml:lang' and 'lang' not in attrib:
|
|
||||||
k = 'lang'
|
|
||||||
elem.set(to_xml_name(k), v)
|
|
||||||
return elem
|
|
||||||
|
|
||||||
|
|
||||||
class NoNamespaceTreeBuilder(TreeBuilder):
|
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements=False, linenumber_attribute=None):
|
|
||||||
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
|
||||||
self.linenumber_attribute = linenumber_attribute
|
|
||||||
self.lxml_context = create_lxml_context()
|
|
||||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
|
||||||
self.proxy_cache = []
|
|
||||||
|
|
||||||
def createElement(self, token, nsmap=None):
|
|
||||||
name = token['name'].rpartition(':')[2]
|
|
||||||
elem = makeelement(self.lxml_context, name, token['data'])
|
|
||||||
# Keep a reference to elem so that lxml does not delete and re-create
|
|
||||||
# it, losing _namespace
|
|
||||||
self.proxy_cache.append(elem)
|
|
||||||
elem.name = elem.tag
|
|
||||||
elem.namespace = token.get('namespace', self.defaultNamespace)
|
|
||||||
elem.nameTuple = (elem.namespace or html_ns, elem.name)
|
|
||||||
position = token.get('position', None)
|
|
||||||
if position is not None:
|
|
||||||
try:
|
|
||||||
elem.sourceline = position[0][0]
|
|
||||||
except OverflowError:
|
|
||||||
elem.sourceline = 65535
|
|
||||||
if self.linenumber_attribute is not None:
|
|
||||||
elem.set(self.linenumber_attribute, str(position[0][0]))
|
|
||||||
return elem
|
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
|
||||||
if not attrs:
|
|
||||||
return
|
|
||||||
html = self.openElements[0]
|
|
||||||
for k, v in attrs.iteritems():
|
|
||||||
if k not in html.attrib and k != 'xmlns':
|
|
||||||
try:
|
|
||||||
html.set(k, v)
|
|
||||||
except ValueError:
|
|
||||||
if k == 'xml:lang' and 'lang' not in html.attrib:
|
|
||||||
k = 'lang'
|
|
||||||
html.set(to_xml_name(k), v)
|
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
|
||||||
if not attrs:
|
|
||||||
return
|
|
||||||
body = self.openElements[1]
|
|
||||||
for k, v in attrs.iteritems():
|
|
||||||
if k not in body.attrib and k != 'xmlns':
|
|
||||||
try:
|
|
||||||
body.set(k, v)
|
|
||||||
except ValueError:
|
|
||||||
if k == 'xml:lang' and 'lang' not in body.attrib:
|
|
||||||
k = 'lang'
|
|
||||||
body.set(to_xml_name(k), v)
|
|
||||||
|
|
||||||
# Input Stream {{{
|
|
||||||
|
|
||||||
|
|
||||||
_regex_cache = {}
|
|
||||||
|
|
||||||
|
|
||||||
class FastStream(object):
|
|
||||||
|
|
||||||
__slots__ = ('raw', 'pos', 'errors', 'new_lines', 'track_position', 'charEncoding')
|
|
||||||
|
|
||||||
def __init__(self, raw, track_position=False):
|
|
||||||
self.raw = raw
|
|
||||||
self.pos = 0
|
|
||||||
self.errors = []
|
|
||||||
self.charEncoding = ("utf-8", "certain")
|
|
||||||
self.track_position = track_position
|
|
||||||
if track_position:
|
|
||||||
self.new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.pos = 0
|
|
||||||
|
|
||||||
def char(self):
|
|
||||||
try:
|
|
||||||
ans = self.raw[self.pos]
|
|
||||||
except IndexError:
|
|
||||||
return EOF
|
|
||||||
self.pos += 1
|
|
||||||
return ans
|
|
||||||
|
|
||||||
def unget(self, char):
|
|
||||||
if char is not None:
|
|
||||||
self.pos = max(0, self.pos - 1)
|
|
||||||
|
|
||||||
def charsUntil(self, characters, opposite=False):
|
|
||||||
# Use a cache of regexps to find the required characters
|
|
||||||
try:
|
|
||||||
chars = _regex_cache[(characters, opposite)]
|
|
||||||
except KeyError:
|
|
||||||
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
|
||||||
if not opposite:
|
|
||||||
regex = "^%s" % regex
|
|
||||||
chars = _regex_cache[(characters, opposite)] = re.compile("[%s]+" % regex)
|
|
||||||
|
|
||||||
# Find the longest matching prefix
|
|
||||||
m = chars.match(self.raw, self.pos)
|
|
||||||
if m is None:
|
|
||||||
return ''
|
|
||||||
self.pos = m.end()
|
|
||||||
return m.group()
|
|
||||||
|
|
||||||
def position(self):
|
|
||||||
if not self.track_position:
|
|
||||||
return (-1, -1)
|
|
||||||
pos = self.pos
|
|
||||||
lnum = bisect(self.new_lines, pos)
|
|
||||||
# lnum is the line from which the next char() will come, therefore the
|
|
||||||
# current char is a \n and \n is given the line number of the line it
|
|
||||||
# creates.
|
|
||||||
try:
|
|
||||||
offset = self.new_lines[lnum - 1] - pos
|
|
||||||
except IndexError:
|
|
||||||
offset = pos
|
|
||||||
return (lnum + 1, offset)
|
|
||||||
# }}}
|
|
||||||
|
|
||||||
|
|
||||||
if len("\U0010FFFF") == 1: # UCS4 build
|
|
||||||
replace_chars = re.compile("[\uD800-\uDFFF]")
|
|
||||||
else:
|
|
||||||
replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
|
||||||
|
|
||||||
|
|
||||||
def html5_parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
|
|
||||||
if isinstance(raw, bytes):
|
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
|
||||||
if replace_entities:
|
|
||||||
raw = xml_replace_entities(raw)
|
|
||||||
if fix_newlines:
|
|
||||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
|
||||||
raw = replace_chars.sub('', raw)
|
|
||||||
from html5_parser import parse
|
|
||||||
root = parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
|
|
||||||
if (discard_namespaces and root.tag != 'html') or (
|
|
||||||
not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
|
|
||||||
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
|
||||||
return root
|
|
||||||
|
|
||||||
|
|
||||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
|
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
|
||||||
if replace_entities:
|
if replace_entities:
|
||||||
raw = xml_replace_entities(raw)
|
raw = xml_replace_entities(raw)
|
||||||
if fix_newlines:
|
if fix_newlines:
|
||||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
raw = replace_chars.sub('', raw)
|
raw = clean_xml_chars(raw)
|
||||||
|
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
|
||||||
stream_class = partial(FastStream, track_position=line_numbers)
|
|
||||||
stream = stream_class(raw)
|
|
||||||
builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter('ignore', category=DataLossWarning)
|
|
||||||
try:
|
|
||||||
parser.parse(stream, parseMeta=False, useChardet=False)
|
|
||||||
finally:
|
|
||||||
parser.tree.proxy_cache = None
|
|
||||||
except NamespacedHTMLPresent as err:
|
|
||||||
raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
|
|
||||||
stream = stream_class(raw)
|
|
||||||
continue
|
|
||||||
break
|
|
||||||
root = parser.tree.getDocument()
|
|
||||||
if (discard_namespaces and root.tag != 'html') or (
|
if (discard_namespaces and root.tag != 'html') or (
|
||||||
not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
|
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
|
||||||
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
||||||
return root
|
return root
|
||||||
|
|
||||||
@ -696,7 +66,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
|||||||
try:
|
try:
|
||||||
parser = XMLParser(no_network=True)
|
parser = XMLParser(no_network=True)
|
||||||
ans = fromstring(raw, parser=parser)
|
ans = fromstring(raw, parser=parser)
|
||||||
if ans.tag != '{%s}html' % html_ns:
|
if ans.tag != '{%s}html' % XHTML_NS:
|
||||||
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
||||||
if linenumber_attribute:
|
if linenumber_attribute:
|
||||||
for elem in ans.iter(LxmlElement):
|
for elem in ans.iter(LxmlElement):
|
||||||
|
@ -53,8 +53,8 @@ def namespaces(test, parse_function):
|
|||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(root)
|
||||||
match_and_prefix(root, '//h:body[@id="test"]', None, err)
|
match_and_prefix(root, '//h:body[@id="test"]', None, err)
|
||||||
match_and_prefix(root, '//svg:svg', None if parse_function is parse else 'svg', err)
|
match_and_prefix(root, '//svg:svg', 'svg', err)
|
||||||
match_and_prefix(root, '//svg:image[@xl:href]', None if parse_function is parse else 'svg', err)
|
match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)
|
||||||
|
|
||||||
markup = '''
|
markup = '''
|
||||||
<html xmlns="{xhtml}"><head><body id="test">
|
<html xmlns="{xhtml}"><head><body id="test">
|
||||||
@ -81,11 +81,11 @@ def namespaces(test, parse_function):
|
|||||||
match_and_prefix(root, '//h:html[@lang]', None, err)
|
match_and_prefix(root, '//h:html[@lang]', None, err)
|
||||||
match_and_prefix(root, '//h:html[@id]', None, err)
|
match_and_prefix(root, '//h:html[@id]', None, err)
|
||||||
|
|
||||||
if parse_function is not html5_parse:
|
# if parse_function is not html5_parse:
|
||||||
markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
|
# markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
|
||||||
root = parse_function(markup)
|
# root = parse_function(markup)
|
||||||
err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root)
|
# err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root)
|
||||||
match_and_prefix(root, '//h:html', None, err)
|
# match_and_prefix(root, '//h:html', None, err)
|
||||||
|
|
||||||
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
@ -98,9 +98,9 @@ def namespaces(test, parse_function):
|
|||||||
ae(len(xpath('//ns2:tag3')), 1, err)
|
ae(len(xpath('//ns2:tag3')), 1, err)
|
||||||
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
|
ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
|
||||||
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
|
||||||
for tag in root.iter():
|
# for tag in root.iter():
|
||||||
if 'NS' in tag.tag:
|
# if 'NS' in tag.tag:
|
||||||
ae('ns1', tag.prefix)
|
# ae('ns1', tag.prefix)
|
||||||
|
|
||||||
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
|
markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
@ -108,11 +108,11 @@ def namespaces(test, parse_function):
|
|||||||
ae(len(root.xpath('//*[@lang="en"]')), 2, err)
|
ae(len(root.xpath('//*[@lang="en"]')), 2, err)
|
||||||
ae(len(root.xpath('//*[@lang="de"]')), 1, err)
|
ae(len(root.xpath('//*[@lang="de"]')), 1, err)
|
||||||
ae(len(root.xpath('//*[@lang="es"]')), 1, err)
|
ae(len(root.xpath('//*[@lang="es"]')), 1, err)
|
||||||
ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
|
# ae(len(XPath('//*[@xml:lang]')(root)), 0, err)
|
||||||
|
|
||||||
|
|
||||||
def space_characters(test, parse_function):
|
def space_characters(test, parse_function):
|
||||||
markup = '<html><p>\u000c</p>'
|
markup = '<html><p>\u000cX</p>'
|
||||||
root = parse_function(markup)
|
root = parse_function(markup)
|
||||||
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
err = 'form feed character not converted, parsed markup:\n' + etree.tostring(root)
|
||||||
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
test.assertNotIn('\u000c', root.xpath('//*[local-name()="p"]')[0].text, err)
|
||||||
|
@ -66,9 +66,6 @@ class BuildTest(unittest.TestCase):
|
|||||||
def test_html5lib(self):
|
def test_html5lib(self):
|
||||||
import html5lib.html5parser # noqa
|
import html5lib.html5parser # noqa
|
||||||
from html5lib import parse # noqa
|
from html5lib import parse # noqa
|
||||||
# Test that we are using the calibre version of html5lib
|
|
||||||
from calibre.ebooks.oeb.polish.parsing import parse_html5
|
|
||||||
parse_html5('<p>xxx')
|
|
||||||
|
|
||||||
def test_html5_parser(self):
|
def test_html5_parser(self):
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
|
@ -1,23 +0,0 @@
|
|||||||
"""
|
|
||||||
HTML parsing library based on the WHATWG "HTML5"
|
|
||||||
specification. The parser is designed to be compatible with existing
|
|
||||||
HTML found in the wild and implements well-defined error recovery that
|
|
||||||
is largely compatible with modern desktop web browsers.
|
|
||||||
|
|
||||||
Example usage:
|
|
||||||
|
|
||||||
import html5lib
|
|
||||||
f = open("my_document.html")
|
|
||||||
tree = html5lib.parse(f)
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from .html5parser import HTMLParser, parse, parseFragment
|
|
||||||
from .treebuilders import getTreeBuilder
|
|
||||||
from .treewalkers import getTreeWalker
|
|
||||||
from .serializer import serialize
|
|
||||||
|
|
||||||
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
|
||||||
"getTreeWalker", "serialize"]
|
|
||||||
__version__ = "0.999999-dev"
|
|
File diff suppressed because it is too large
Load Diff
@ -1,12 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(object):
|
|
||||||
def __init__(self, source):
|
|
||||||
self.source = source
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return iter(self.source)
|
|
||||||
|
|
||||||
def __getattr__(self, name):
|
|
||||||
return getattr(self.source, name)
|
|
@ -1,20 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
try:
|
|
||||||
from collections import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
from ordereddict import OrderedDict
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
|
||||||
def __iter__(self):
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
if token["type"] in ("StartTag", "EmptyTag"):
|
|
||||||
attrs = OrderedDict()
|
|
||||||
for name, value in sorted(token["data"].items(),
|
|
||||||
key=lambda x: x[0]):
|
|
||||||
attrs[name] = value
|
|
||||||
token["data"] = attrs
|
|
||||||
yield token
|
|
@ -1,65 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
|
||||||
def __init__(self, source, encoding):
|
|
||||||
_base.Filter.__init__(self, source)
|
|
||||||
self.encoding = encoding
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
state = "pre_head"
|
|
||||||
meta_found = (self.encoding is None)
|
|
||||||
pending = []
|
|
||||||
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
type = token["type"]
|
|
||||||
if type == "StartTag":
|
|
||||||
if token["name"].lower() == "head":
|
|
||||||
state = "in_head"
|
|
||||||
|
|
||||||
elif type == "EmptyTag":
|
|
||||||
if token["name"].lower() == "meta":
|
|
||||||
# replace charset with actual encoding
|
|
||||||
has_http_equiv_content_type = False
|
|
||||||
for (namespace, name), value in token["data"].items():
|
|
||||||
if namespace is not None:
|
|
||||||
continue
|
|
||||||
elif name.lower() == 'charset':
|
|
||||||
token["data"][(namespace, name)] = self.encoding
|
|
||||||
meta_found = True
|
|
||||||
break
|
|
||||||
elif name == 'http-equiv' and value.lower() == 'content-type':
|
|
||||||
has_http_equiv_content_type = True
|
|
||||||
else:
|
|
||||||
if has_http_equiv_content_type and (None, "content") in token["data"]:
|
|
||||||
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
|
|
||||||
meta_found = True
|
|
||||||
|
|
||||||
elif token["name"].lower() == "head" and not meta_found:
|
|
||||||
# insert meta into empty head
|
|
||||||
yield {"type": "StartTag", "name": "head",
|
|
||||||
"data": token["data"]}
|
|
||||||
yield {"type": "EmptyTag", "name": "meta",
|
|
||||||
"data": {(None, "charset"): self.encoding}}
|
|
||||||
yield {"type": "EndTag", "name": "head"}
|
|
||||||
meta_found = True
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif type == "EndTag":
|
|
||||||
if token["name"].lower() == "head" and pending:
|
|
||||||
# insert meta into head (if necessary) and flush pending queue
|
|
||||||
yield pending.pop(0)
|
|
||||||
if not meta_found:
|
|
||||||
yield {"type": "EmptyTag", "name": "meta",
|
|
||||||
"data": {(None, "charset"): self.encoding}}
|
|
||||||
while pending:
|
|
||||||
yield pending.pop(0)
|
|
||||||
meta_found = True
|
|
||||||
state = "post_head"
|
|
||||||
|
|
||||||
if state == "in_head":
|
|
||||||
pending.append(token)
|
|
||||||
else:
|
|
||||||
yield token
|
|
@ -1,90 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from ..constants import cdataElements, rcdataElements, voidElements
|
|
||||||
|
|
||||||
from ..constants import spaceCharacters
|
|
||||||
spaceCharacters = "".join(spaceCharacters)
|
|
||||||
|
|
||||||
|
|
||||||
class LintError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
|
||||||
def __iter__(self):
|
|
||||||
open_elements = []
|
|
||||||
contentModelFlag = "PCDATA"
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
type = token["type"]
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
name = token["name"]
|
|
||||||
if contentModelFlag != "PCDATA":
|
|
||||||
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
|
|
||||||
if not isinstance(name, str):
|
|
||||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
|
||||||
if not name:
|
|
||||||
raise LintError("Empty tag name")
|
|
||||||
if type == "StartTag" and name in voidElements:
|
|
||||||
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
|
|
||||||
elif type == "EmptyTag" and name not in voidElements:
|
|
||||||
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
|
|
||||||
if type == "StartTag":
|
|
||||||
open_elements.append(name)
|
|
||||||
for name, value in token["data"]:
|
|
||||||
if not isinstance(name, str):
|
|
||||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
|
|
||||||
if not name:
|
|
||||||
raise LintError("Empty attribute name")
|
|
||||||
if not isinstance(value, str):
|
|
||||||
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
|
|
||||||
if name in cdataElements:
|
|
||||||
contentModelFlag = "CDATA"
|
|
||||||
elif name in rcdataElements:
|
|
||||||
contentModelFlag = "RCDATA"
|
|
||||||
elif name == "plaintext":
|
|
||||||
contentModelFlag = "PLAINTEXT"
|
|
||||||
|
|
||||||
elif type == "EndTag":
|
|
||||||
name = token["name"]
|
|
||||||
if not isinstance(name, str):
|
|
||||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
|
||||||
if not name:
|
|
||||||
raise LintError("Empty tag name")
|
|
||||||
if name in voidElements:
|
|
||||||
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
|
|
||||||
start_name = open_elements.pop()
|
|
||||||
if start_name != name:
|
|
||||||
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
|
|
||||||
contentModelFlag = "PCDATA"
|
|
||||||
|
|
||||||
elif type == "Comment":
|
|
||||||
if contentModelFlag != "PCDATA":
|
|
||||||
raise LintError("Comment not in PCDATA content model flag")
|
|
||||||
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
|
||||||
data = token["data"]
|
|
||||||
if not isinstance(data, str):
|
|
||||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
|
|
||||||
if not data:
|
|
||||||
raise LintError("%(type)s token with empty data" % {"type": type})
|
|
||||||
if type == "SpaceCharacters":
|
|
||||||
data = data.strip(spaceCharacters)
|
|
||||||
if data:
|
|
||||||
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
|
|
||||||
|
|
||||||
elif type == "Doctype":
|
|
||||||
name = token["name"]
|
|
||||||
if contentModelFlag != "PCDATA":
|
|
||||||
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
|
|
||||||
if not isinstance(name, str):
|
|
||||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
|
||||||
# XXX: what to do with token["data"] ?
|
|
||||||
|
|
||||||
elif type in ("ParseError", "SerializeError"):
|
|
||||||
pass
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise LintError("Unknown token type: %(type)s" % {"type": type})
|
|
||||||
|
|
||||||
yield token
|
|
@ -1,205 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
|
||||||
def slider(self):
|
|
||||||
previous1 = previous2 = None
|
|
||||||
for token in self.source:
|
|
||||||
if previous1 is not None:
|
|
||||||
yield previous2, previous1, token
|
|
||||||
previous2 = previous1
|
|
||||||
previous1 = token
|
|
||||||
yield previous2, previous1, None
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
for previous, token, next in self.slider():
|
|
||||||
type = token["type"]
|
|
||||||
if type == "StartTag":
|
|
||||||
if (token["data"] or
|
|
||||||
not self.is_optional_start(token["name"], previous, next)):
|
|
||||||
yield token
|
|
||||||
elif type == "EndTag":
|
|
||||||
if not self.is_optional_end(token["name"], next):
|
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
yield token
|
|
||||||
|
|
||||||
def is_optional_start(self, tagname, previous, next):
|
|
||||||
type = next and next["type"] or None
|
|
||||||
if tagname in 'html':
|
|
||||||
# An html element's start tag may be omitted if the first thing
|
|
||||||
# inside the html element is not a space character or a comment.
|
|
||||||
return type not in ("Comment", "SpaceCharacters")
|
|
||||||
elif tagname == 'head':
|
|
||||||
# A head element's start tag may be omitted if the first thing
|
|
||||||
# inside the head element is an element.
|
|
||||||
# XXX: we also omit the start tag if the head element is empty
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
return True
|
|
||||||
elif type == "EndTag":
|
|
||||||
return next["name"] == "head"
|
|
||||||
elif tagname == 'body':
|
|
||||||
# A body element's start tag may be omitted if the first thing
|
|
||||||
# inside the body element is not a space character or a comment,
|
|
||||||
# except if the first thing inside the body element is a script
|
|
||||||
# or style element and the node immediately preceding the body
|
|
||||||
# element is a head element whose end tag has been omitted.
|
|
||||||
if type in ("Comment", "SpaceCharacters"):
|
|
||||||
return False
|
|
||||||
elif type == "StartTag":
|
|
||||||
# XXX: we do not look at the preceding event, so we never omit
|
|
||||||
# the body element's start tag if it's followed by a script or
|
|
||||||
# a style element.
|
|
||||||
return next["name"] not in ('script', 'style')
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
elif tagname == 'colgroup':
|
|
||||||
# A colgroup element's start tag may be omitted if the first thing
|
|
||||||
# inside the colgroup element is a col element, and if the element
|
|
||||||
# is not immediately preceeded by another colgroup element whose
|
|
||||||
# end tag has been omitted.
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
# XXX: we do not look at the preceding event, so instead we never
|
|
||||||
# omit the colgroup element's end tag when it is immediately
|
|
||||||
# followed by another colgroup element. See is_optional_end.
|
|
||||||
return next["name"] == "col"
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
elif tagname == 'tbody':
|
|
||||||
# A tbody element's start tag may be omitted if the first thing
|
|
||||||
# inside the tbody element is a tr element, and if the element is
|
|
||||||
# not immediately preceeded by a tbody, thead, or tfoot element
|
|
||||||
# whose end tag has been omitted.
|
|
||||||
if type == "StartTag":
|
|
||||||
# omit the thead and tfoot elements' end tag when they are
|
|
||||||
# immediately followed by a tbody element. See is_optional_end.
|
|
||||||
if previous and previous['type'] == 'EndTag' and \
|
|
||||||
previous['name'] in ('tbody', 'thead', 'tfoot'):
|
|
||||||
return False
|
|
||||||
return next["name"] == 'tr'
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
return False
|
|
||||||
|
|
||||||
def is_optional_end(self, tagname, next):
|
|
||||||
type = next and next["type"] or None
|
|
||||||
if tagname in ('html', 'head', 'body'):
|
|
||||||
# An html element's end tag may be omitted if the html element
|
|
||||||
# is not immediately followed by a space character or a comment.
|
|
||||||
return type not in ("Comment", "SpaceCharacters")
|
|
||||||
elif tagname in ('li', 'optgroup', 'tr'):
|
|
||||||
# A li element's end tag may be omitted if the li element is
|
|
||||||
# immediately followed by another li element or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
# An optgroup element's end tag may be omitted if the optgroup
|
|
||||||
# element is immediately followed by another optgroup element,
|
|
||||||
# or if there is no more content in the parent element.
|
|
||||||
# A tr element's end tag may be omitted if the tr element is
|
|
||||||
# immediately followed by another tr element, or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] == tagname
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
elif tagname in ('dt', 'dd'):
|
|
||||||
# A dt element's end tag may be omitted if the dt element is
|
|
||||||
# immediately followed by another dt element or a dd element.
|
|
||||||
# A dd element's end tag may be omitted if the dd element is
|
|
||||||
# immediately followed by another dd element or a dt element,
|
|
||||||
# or if there is no more content in the parent element.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] in ('dt', 'dd')
|
|
||||||
elif tagname == 'dd':
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
elif tagname == 'p':
|
|
||||||
# A p element's end tag may be omitted if the p element is
|
|
||||||
# immediately followed by an address, article, aside,
|
|
||||||
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
|
||||||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
|
||||||
# nav, ol, p, pre, section, table, or ul, element, or if
|
|
||||||
# there is no more content in the parent element.
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
return next["name"] in ('address', 'article', 'aside',
|
|
||||||
'blockquote', 'datagrid', 'dialog',
|
|
||||||
'dir', 'div', 'dl', 'fieldset', 'footer',
|
|
||||||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
|
||||||
'header', 'hr', 'menu', 'nav', 'ol',
|
|
||||||
'p', 'pre', 'section', 'table', 'ul')
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
elif tagname == 'option':
|
|
||||||
# An option element's end tag may be omitted if the option
|
|
||||||
# element is immediately followed by another option element,
|
|
||||||
# or if it is immediately followed by an <code>optgroup</code>
|
|
||||||
# element, or if there is no more content in the parent
|
|
||||||
# element.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] in ('option', 'optgroup')
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
elif tagname in ('rt', 'rp'):
|
|
||||||
# An rt element's end tag may be omitted if the rt element is
|
|
||||||
# immediately followed by an rt or rp element, or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
# An rp element's end tag may be omitted if the rp element is
|
|
||||||
# immediately followed by an rt or rp element, or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] in ('rt', 'rp')
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
elif tagname == 'colgroup':
|
|
||||||
# A colgroup element's end tag may be omitted if the colgroup
|
|
||||||
# element is not immediately followed by a space character or
|
|
||||||
# a comment.
|
|
||||||
if type in ("Comment", "SpaceCharacters"):
|
|
||||||
return False
|
|
||||||
elif type == "StartTag":
|
|
||||||
# XXX: we also look for an immediately following colgroup
|
|
||||||
# element. See is_optional_start.
|
|
||||||
return next["name"] != 'colgroup'
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
elif tagname in ('thead', 'tbody'):
|
|
||||||
# A thead element's end tag may be omitted if the thead element
|
|
||||||
# is immediately followed by a tbody or tfoot element.
|
|
||||||
# A tbody element's end tag may be omitted if the tbody element
|
|
||||||
# is immediately followed by a tbody or tfoot element, or if
|
|
||||||
# there is no more content in the parent element.
|
|
||||||
# A tfoot element's end tag may be omitted if the tfoot element
|
|
||||||
# is immediately followed by a tbody element, or if there is no
|
|
||||||
# more content in the parent element.
|
|
||||||
# XXX: we never omit the end tag when the following element is
|
|
||||||
# a tbody. See is_optional_start.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] in ['tbody', 'tfoot']
|
|
||||||
elif tagname == 'tbody':
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
elif tagname == 'tfoot':
|
|
||||||
# A tfoot element's end tag may be omitted if the tfoot element
|
|
||||||
# is immediately followed by a tbody element, or if there is no
|
|
||||||
# more content in the parent element.
|
|
||||||
# XXX: we never omit the end tag when the following element is
|
|
||||||
# a tbody. See is_optional_start.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] == 'tbody'
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
elif tagname in ('td', 'th'):
|
|
||||||
# A td element's end tag may be omitted if the td element is
|
|
||||||
# immediately followed by a td or th element, or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
# A th element's end tag may be omitted if the th element is
|
|
||||||
# immediately followed by a td or th element, or if there is
|
|
||||||
# no more content in the parent element.
|
|
||||||
if type == "StartTag":
|
|
||||||
return next["name"] in ('td', 'th')
|
|
||||||
else:
|
|
||||||
return type == "EndTag" or type is None
|
|
||||||
return False
|
|
@ -1,12 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from ..sanitizer import HTMLSanitizerMixin
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter, HTMLSanitizerMixin):
|
|
||||||
def __iter__(self):
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
token = self.sanitize_token(token)
|
|
||||||
if token:
|
|
||||||
yield token
|
|
@ -1,38 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from ..constants import rcdataElements, spaceCharacters
|
|
||||||
spaceCharacters = "".join(spaceCharacters)
|
|
||||||
|
|
||||||
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
|
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
|
||||||
|
|
||||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
preserve = 0
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
type = token["type"]
|
|
||||||
if type == "StartTag" \
|
|
||||||
and (preserve or token["name"] in self.spacePreserveElements):
|
|
||||||
preserve += 1
|
|
||||||
|
|
||||||
elif type == "EndTag" and preserve:
|
|
||||||
preserve -= 1
|
|
||||||
|
|
||||||
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
|
||||||
# Test on token["data"] above to not introduce spaces where there were not
|
|
||||||
token["data"] = " "
|
|
||||||
|
|
||||||
elif not preserve and type == "Characters":
|
|
||||||
token["data"] = collapse_spaces(token["data"])
|
|
||||||
|
|
||||||
yield token
|
|
||||||
|
|
||||||
|
|
||||||
def collapse_spaces(text):
|
|
||||||
return SPACES_REGEX.sub(' ', text)
|
|
File diff suppressed because it is too large
Load Diff
@ -1,285 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
import re
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
from .constants import DataLossWarning
|
|
||||||
|
|
||||||
baseChar = """
|
|
||||||
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
|
||||||
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
|
||||||
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
|
||||||
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
|
||||||
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
|
||||||
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
|
||||||
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
|
||||||
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
|
||||||
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
|
||||||
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
|
|
||||||
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
|
|
||||||
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
|
|
||||||
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
|
|
||||||
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
|
|
||||||
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
|
|
||||||
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
|
|
||||||
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
|
|
||||||
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
|
|
||||||
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
|
|
||||||
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
|
|
||||||
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
|
|
||||||
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
|
|
||||||
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
|
|
||||||
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
|
|
||||||
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
|
|
||||||
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
|
|
||||||
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
|
|
||||||
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
|
|
||||||
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
|
|
||||||
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
|
|
||||||
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
|
|
||||||
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
|
|
||||||
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
|
|
||||||
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
|
|
||||||
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
|
|
||||||
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
|
|
||||||
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
|
|
||||||
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
|
|
||||||
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
|
|
||||||
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
|
|
||||||
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
|
|
||||||
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
|
|
||||||
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
|
|
||||||
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
|
|
||||||
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
|
|
||||||
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
|
||||||
|
|
||||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
|
||||||
|
|
||||||
combiningCharacter = """
|
|
||||||
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
|
|
||||||
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
|
|
||||||
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
|
|
||||||
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
|
|
||||||
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
|
|
||||||
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
|
|
||||||
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
|
|
||||||
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
|
|
||||||
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
|
|
||||||
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
|
|
||||||
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
|
|
||||||
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
|
|
||||||
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
|
|
||||||
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
|
|
||||||
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
|
|
||||||
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
|
|
||||||
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
|
|
||||||
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
|
|
||||||
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
|
|
||||||
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
|
|
||||||
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
|
|
||||||
#x3099 | #x309A"""
|
|
||||||
|
|
||||||
digit = """
|
|
||||||
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
|
|
||||||
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
|
|
||||||
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
|
|
||||||
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
|
||||||
|
|
||||||
extender = """
|
|
||||||
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
|
|
||||||
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
|
||||||
|
|
||||||
letter = " | ".join([baseChar, ideographic])
|
|
||||||
|
|
||||||
# Without the
|
|
||||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
|
||||||
extender])
|
|
||||||
nameFirst = " | ".join([letter, "_"])
|
|
||||||
|
|
||||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
|
||||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
|
||||||
|
|
||||||
|
|
||||||
def charStringToList(chars):
|
|
||||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
|
||||||
rv = []
|
|
||||||
for item in charRanges:
|
|
||||||
foundMatch = False
|
|
||||||
for regexp in (reChar, reCharRange):
|
|
||||||
match = regexp.match(item)
|
|
||||||
if match is not None:
|
|
||||||
rv.append([hexToInt(item) for item in match.groups()])
|
|
||||||
if len(rv[-1]) == 1:
|
|
||||||
rv[-1] = rv[-1] * 2
|
|
||||||
foundMatch = True
|
|
||||||
break
|
|
||||||
if not foundMatch:
|
|
||||||
assert len(item) == 1
|
|
||||||
|
|
||||||
rv.append([ord(item)] * 2)
|
|
||||||
rv = normaliseCharList(rv)
|
|
||||||
return rv
|
|
||||||
|
|
||||||
|
|
||||||
def normaliseCharList(charList):
|
|
||||||
charList = sorted(charList)
|
|
||||||
for item in charList:
|
|
||||||
assert item[1] >= item[0]
|
|
||||||
rv = []
|
|
||||||
i = 0
|
|
||||||
while i < len(charList):
|
|
||||||
j = 1
|
|
||||||
rv.append(charList[i])
|
|
||||||
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
|
|
||||||
rv[-1][1] = charList[i + j][1]
|
|
||||||
j += 1
|
|
||||||
i += j
|
|
||||||
return rv
|
|
||||||
|
|
||||||
# We don't really support characters above the BMP :(
|
|
||||||
max_unicode = int("FFFF", 16)
|
|
||||||
|
|
||||||
|
|
||||||
def missingRanges(charList):
|
|
||||||
rv = []
|
|
||||||
if charList[0] != 0:
|
|
||||||
rv.append([0, charList[0][0] - 1])
|
|
||||||
for i, item in enumerate(charList[:-1]):
|
|
||||||
rv.append([item[1] + 1, charList[i + 1][0] - 1])
|
|
||||||
if charList[-1][1] != max_unicode:
|
|
||||||
rv.append([charList[-1][1] + 1, max_unicode])
|
|
||||||
return rv
|
|
||||||
|
|
||||||
|
|
||||||
def listToRegexpStr(charList):
|
|
||||||
rv = []
|
|
||||||
for item in charList:
|
|
||||||
if item[0] == item[1]:
|
|
||||||
rv.append(escapeRegexp(chr(item[0])))
|
|
||||||
else:
|
|
||||||
rv.append(escapeRegexp(chr(item[0])) + "-" +
|
|
||||||
escapeRegexp(chr(item[1])))
|
|
||||||
return "[%s]" % "".join(rv)
|
|
||||||
|
|
||||||
|
|
||||||
def hexToInt(hex_str):
|
|
||||||
return int(hex_str, 16)
|
|
||||||
|
|
||||||
|
|
||||||
def escapeRegexp(string):
|
|
||||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
|
||||||
"[", "]", "|", "(", ")", "-")
|
|
||||||
for char in specialCharacters:
|
|
||||||
string = string.replace(char, "\\" + char)
|
|
||||||
|
|
||||||
return string
|
|
||||||
|
|
||||||
# output from the above
|
|
||||||
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
|
||||||
|
|
||||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
|
||||||
|
|
||||||
# Simpler things
|
|
||||||
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
|
|
||||||
|
|
||||||
|
|
||||||
class InfosetFilter(object):
    """Coerces HTML (infoset) names and data into forms legal in XML 1.0.

    HTML permits element/attribute names, comments, text and public ids
    that the XML infoset does not.  Each ``coerce*`` method rewrites such
    values, emitting a DataLossWarning whenever a value is altered.
    Illegal characters are replaced with a reversible ``U%05X`` escape
    (see escapeChar/unescapeChar); fromXmlName undoes the escaping.
    """

    # Matches the "U" + five-hex-digit escapes produced by escapeChar(),
    # so fromXmlName() can reverse them
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self, replaceChars=None,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):
        # NOTE(review): replaceChars is accepted but never read anywhere in
        # this class -- apparently kept only for call-site compatibility
        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        self.preventSingleQuotePubid = preventSingleQuotePubid

        # char -> "U%05X" escape string, filled lazily by escapeChar()
        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        # Returns the XML-safe attribute name, or None when the attribute
        # must be dropped entirely
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name, namespace=None):
        # Element names only need the generic name coercion
        return self.toXmlName(name)

    def coerceComment(self, data):
        # XML comments may not contain "--"; split every run apart
        # (one warning per replacement pass)
        if self.preventDoubleDashComments:
            while "--" in data:
                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
                data = data.replace("--", "- -")
        return data

    def coerceCharacters(self, data):
        # Form feed (U+000C) is legal in HTML text but not in XML
        if self.replaceFormFeedCharacters:
            # one warning per occurrence, then a single bulk replace
            for i in range(data.count("\x0C")):
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data

    def coercePubid(self, data):
        # Replace every character outside the XML PubidChar production
        dataOutput = data
        for char in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            dataOutput = dataOutput.replace(char, replacement)
        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
        return dataOutput

    def toXmlName(self, name):
        # First character and the remainder obey different productions
        # (NameStartChar vs NameChar), hence the two regexps
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst

        nameRestOutput = nameRest
        # replace each distinct illegal character once, globally
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput

    def getReplacementCharacter(self, char):
        # Cached escape lookup; escapeChar() populates the cache on miss
        if char in self.replaceCache:
            replacement = self.replaceCache[char]
        else:
            replacement = self.escapeChar(char)
        return replacement

    def fromXmlName(self, name):
        # Reverse toXmlName(): expand every U%05X escape back to its char
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        # "U" followed by exactly five uppercase hex digits of the code point
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        # Inverse of escapeChar(): drop the leading "U", parse the hex
        return chr(int(charcode[1:], 16))
|
@ -1,888 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
import codecs
|
|
||||||
import re
|
|
||||||
|
|
||||||
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
|
||||||
from .constants import encodings, ReparseException
|
|
||||||
from . import utils
|
|
||||||
|
|
||||||
from io import StringIO
|
|
||||||
|
|
||||||
try:
|
|
||||||
from io import BytesIO
|
|
||||||
except ImportError:
|
|
||||||
BytesIO = StringIO
|
|
||||||
|
|
||||||
try:
|
|
||||||
from io import BufferedIOBase
|
|
||||||
except ImportError:
|
|
||||||
class BufferedIOBase(object):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

# Code points that must not appear in the character stream: C0/C1 controls
# (except TAB/LF/CR), surrogates and the Unicode "noncharacters"
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")  # noqa

# The astral-plane noncharacters (0xNFFFE / 0xNFFFF for each plane N);
# used by characterErrorsUCS2, where they must be recognised from a
# surrogate pair rather than via the regexp above
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

# ASCII whitespace plus all ASCII punctuation
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}
||||||
class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        # stream: the underlying (non-seekable) byte stream.  Everything read
        # from it is retained in self.buffer so earlier positions can be
        # revisited via seek().
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        # Absolute position = total length of all chunks before the current
        # one, plus the offset inside the current chunk
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        # Only positions that have already been read (and hence buffered)
        # can be sought to
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        # walk forward through the chunks until the remaining offset fits
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        # NOTE(review): the parameter name shadows the builtin "bytes";
        # kept unchanged to leave the code byte-identical
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            # at the very end of the buffered data: go to the real stream
            return self._readStream(bytes)
        else:
            # somewhere inside previously-buffered data
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        # Total number of bytes currently held in the buffer
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        # Read fresh data from the wrapped stream, appending it as a new
        # chunk and advancing the position to its end
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        # Satisfy the read from buffered chunks first; fall through to the
        # real stream for whatever is still missing
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # current chunk has enough data to finish the read
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # take the rest of this chunk and move on to the next
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            # buffer exhausted: read the remainder from the stream
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)
|
||||||
|
|
||||||
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Return an input-stream wrapper appropriate for *source*.

    An object that already implements the full input-stream protocol is
    returned unchanged.  Unicode text gets an HTMLUnicodeInputStream;
    anything else (bytes, byte stream, filename) gets an
    HTMLBinaryInputStream, which performs encoding detection.
    """
    stream_protocol = ('unget', 'charsUntil', 'position', 'char', 'reset',
                       'errors')
    if all(hasattr(source, attr) for attr in stream_protocol):
        # Already a ready-made input stream instance: pass straight through
        return source

    # Probe whether we are dealing with text or bytes
    if hasattr(source, "read"):
        is_text = isinstance(source.read(0), text_type)
    else:
        is_text = isinstance(source, text_type)

    if not is_text:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)

    # Unicode input is already decoded, so a caller-supplied encoding is
    # contradictory
    if encoding is not None:
        raise TypeError("Cannot explicitly set an encoding with a unicode string")
    return HTMLUnicodeInputStream(source)
||||||
|
|
||||||
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    # Number of characters read from the underlying stream per readChunk()
    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        """

        # Craziness: pick the surrogate-handling strategy for this
        # interpreter build (wide UCS4 vs narrow UCS2 unicode)
        if len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            # on a wide build any surrogate code point is invalid
            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            # on a narrow build only *unpaired* surrogates are invalid
            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        # Current decoded chunk and the read offset within it
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        # accumulated "invalid-codepoint" parse errors
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream

    def _position(self, offset):
        # Translate an offset within the current chunk into an absolute
        # (line, column) pair, using the totals carried over from chunks
        # already consumed
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            # no newline in this chunk before offset: continue previous line
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        # externally lines are 1-based, columns 0-based
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        # Pull in the next chunk of data, normalising line endings and
        # replacing invalid characters.  Returns False at EOF.
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # remember how far the consumed chunks got us, for _position()
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            # hold back a trailing CR or lead surrogate so it can be joined
            # with the start of the next chunk
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        # One parse error per invalid code point found
        for i in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # second half of a surrogate pair already handled
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # lone surrogate at the very end of the data
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
|
||||||
|
|
||||||
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)

        # codecName() normalises the transport-level encoding name
        # (None when no/unknown encoding was supplied)
        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Call superclass
        self.reset()

    def reset(self):
        # Re-wrap the raw byte stream in a decoder for the current encoding;
        # undecodable byte sequences are replaced rather than raising
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            # Stream is not seekable: wrap it in a buffering layer that
            # emulates seek()/tell().  (This was a bare "except:", which
            # also swallowed KeyboardInterrupt/SystemExit.)
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Determine (encoding, confidence) for the raw byte stream.

        Tries, in order: BOM sniffing, <meta> pre-scan, the optional
        charade/chardet detectors, then the windows-1252 default.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                # rewind so parsing starts from the beginning again
                self.rawStream.seek(0)
            except ImportError:
                # neither detector library is installed - that is fine
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to an encoding discovered mid-parse (e.g. via <meta>).

        Only legal while the current encoding is still tentative; raises
        ReparseException when the document must be reparsed from scratch.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            # a meta-declared utf-16 is self-contradictory (the meta itself
            # was read as ASCII-compatible), so fall back to utf-8
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # BUG FIX: capture the old encoding *before* reassigning
            # self.charEncoding; the previous code formatted the message
            # after the update, so it always reported the new encoding as
            # both the "from" and the "to" value.
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            self.reset()
            self.charEncoding = (newEncoding, "certain")
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            # as in changeEncoding(), a self-declared utf-16 cannot be right
            encoding = "utf-8"

        return encoding
||||||
|
|
||||||
|
|
||||||
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""

    def __new__(cls, value):
        # First parameter renamed from "self" to the conventional "cls":
        # __new__ is an implicit classmethod (PEP 8).  The data is
        # lower-cased once so all subsequent matching is case-insensitive.
        assert isinstance(value, bytes)
        return bytes.__new__(cls, value.lower())

    def __init__(self, value):
        # -1 means "before the first byte"; the first next() call advances
        # onto index 0
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        # Yield the next single byte as a bytes object (slicing avoids
        # py3's bytes-indexing-returns-int behaviour)
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        # Step back one byte, returning the byte at the *current* position
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        # Moving after having run off the end is an error
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            # not yet started iterating
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                # stop on the first byte NOT in chars and return it
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        # Mirror image of skip(): advance until a byte in chars is found
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            raise StopIteration
||||||
|
|
||||||
|
|
||||||
class EncodingParser(object):
|
|
||||||
"""Mini parser for detecting character encoding from meta elements"""
|
|
||||||
|
|
||||||
def __init__(self, data):
|
|
||||||
"""string - the data to work on for encoding detection"""
|
|
||||||
self.data = EncodingBytes(data)
|
|
||||||
self.encoding = None
|
|
||||||
|
|
||||||
def getEncoding(self):
|
|
||||||
methodDispatch = (
|
|
||||||
(b"<!--", self.handleComment),
|
|
||||||
(b"<meta", self.handleMeta),
|
|
||||||
(b"</", self.handlePossibleEndTag),
|
|
||||||
(b"<!", self.handleOther),
|
|
||||||
(b"<?", self.handleOther),
|
|
||||||
(b"<", self.handlePossibleStartTag))
|
|
||||||
for byte in self.data:
|
|
||||||
keepParsing = True
|
|
||||||
for key, method in methodDispatch:
|
|
||||||
if self.data.matchBytes(key):
|
|
||||||
try:
|
|
||||||
keepParsing = method()
|
|
||||||
break
|
|
||||||
except StopIteration:
|
|
||||||
keepParsing = False
|
|
||||||
break
|
|
||||||
if not keepParsing:
|
|
||||||
break
|
|
||||||
|
|
||||||
return self.encoding
|
|
||||||
|
|
||||||
def handleComment(self):
|
|
||||||
"""Skip over comments"""
|
|
||||||
return self.data.jumpTo(b"-->")
|
|
||||||
|
|
||||||
def handleMeta(self):
|
|
||||||
if self.data.currentByte not in spaceCharactersBytes:
|
|
||||||
# if we have <meta not followed by a space so just keep going
|
|
||||||
return True
|
|
||||||
# We have a valid meta element we want to search for attributes
|
|
||||||
hasPragma = False
|
|
||||||
pendingEncoding = None
|
|
||||||
while True:
|
|
||||||
# Try to find the next attribute after the current position
|
|
||||||
attr = self.getAttribute()
|
|
||||||
if attr is None:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
if attr[0] == b"http-equiv":
|
|
||||||
hasPragma = attr[1] == b"content-type"
|
|
||||||
if hasPragma and pendingEncoding is not None:
|
|
||||||
self.encoding = pendingEncoding
|
|
||||||
return False
|
|
||||||
elif attr[0] == b"charset":
|
|
||||||
tentativeEncoding = attr[1]
|
|
||||||
codec = codecName(tentativeEncoding)
|
|
||||||
if codec is not None:
|
|
||||||
self.encoding = codec
|
|
||||||
return False
|
|
||||||
elif attr[0] == b"content":
|
|
||||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
|
||||||
tentativeEncoding = contentParser.parse()
|
|
||||||
if tentativeEncoding is not None:
|
|
||||||
codec = codecName(tentativeEncoding)
|
|
||||||
if codec is not None:
|
|
||||||
if hasPragma:
|
|
||||||
self.encoding = codec
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
pendingEncoding = codec
|
|
||||||
|
|
||||||
def handlePossibleStartTag(self):
|
|
||||||
return self.handlePossibleTag(False)
|
|
||||||
|
|
||||||
def handlePossibleEndTag(self):
|
|
||||||
next(self.data)
|
|
||||||
return self.handlePossibleTag(True)
|
|
||||||
|
|
||||||
def handlePossibleTag(self, endTag):
    """Skip over a start or end tag, consuming its attributes.

    Attribute values are read (via getAttribute) purely to advance the
    stream position past the tag; only <meta>, handled elsewhere,
    contributes to encoding detection.
    """
    data = self.data
    if data.currentByte not in asciiLettersBytes:
        # If the next byte is not an ascii letter either ignore this
        # fragment (possible start tag case) or treat it according to
        # handleOther
        if endTag:
            data.previous()
            self.handleOther()
        return True

    c = data.skipUntil(spacesAngleBrackets)
    if c == b"<":
        # return to the first step in the overall "two step" algorithm
        # reprocessing the < byte
        data.previous()
    else:
        # Read all attributes
        attr = self.getAttribute()
        while attr is not None:
            attr = self.getAttribute()
    return True
def handleOther(self):
    # Not a construct we care about: skip to the end of the markup.
    return self.data.jumpTo(b">")
def getAttribute(self):
    """Return a name,value pair for the next attribute in the stream,
    if one is found, or None

    The numbered "Step N" comments follow the HTML5 encoding-sniffing
    attribute-parsing algorithm.  Names and values are lowercased as
    they are accumulated.
    """
    data = self.data
    # Step 1 (skip chars)
    c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
    assert c is None or len(c) == 1
    # Step 2
    if c in (b">", None):
        return None
    # Step 3
    attrName = []
    attrValue = []
    # Step 4 attribute name
    while True:
        if c == b"=" and attrName:
            break
        elif c in spaceCharactersBytes:
            # Step 6!
            c = data.skip()
            break
        elif c in (b"/", b">"):
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrName.append(c.lower())
        elif c is None:
            return None
        else:
            attrName.append(c)
        # Step 5
        c = next(data)
    # Step 7
    if c != b"=":
        data.previous()
        return b"".join(attrName), b""
    # Step 8
    next(data)
    # Step 9
    c = data.skip()
    # Step 10
    if c in (b"'", b'"'):
        # 10.1 -- quoted attribute value
        quoteChar = c
        while True:
            # 10.2
            c = next(data)
            # 10.3
            if c == quoteChar:
                next(data)
                return b"".join(attrName), b"".join(attrValue)
            # 10.4
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            # 10.5
            else:
                attrValue.append(c)
    elif c == b">":
        return b"".join(attrName), b""
    elif c in asciiUppercaseBytes:
        attrValue.append(c.lower())
    elif c is None:
        return None
    else:
        attrValue.append(c)
    # Step 11 -- unquoted attribute value, runs until whitespace or <>
    while True:
        c = next(data)
        if c in spacesAngleBrackets:
            return b"".join(attrName), b"".join(attrValue)
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
|
|
||||||
|
|
||||||
class ContentAttrParser(object):
    """Parse the value of a meta "content" attribute and extract the bytes
    of the declared charset (e.g. b"text/html; charset=utf-8" yields
    b"utf-8"), or None when no usable charset declaration is present."""

    def __init__(self, data):
        # NOTE(review): callers wrap the value in EncodingBytes, which is
        # presumably a bytes subclass -- the assert relies on that.
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        """Return the raw charset bytes, or None."""
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    # Unterminated quote: no usable value.
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # Ran off the end of the attribute value.
            return None
|
|
||||||
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    name = encoding
    if isinstance(name, bytes):
        try:
            name = name.decode("ascii")
        except UnicodeDecodeError:
            # Non-ASCII bytes cannot name a valid encoding.
            return None
    if not name:
        return None
    # Strip ASCII punctuation and lowercase before the table lookup.
    canonical = ascii_punctuation_re.sub("", name).lower()
    return encodings.get(canonical, None)
|
|
@ -1,304 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from xml.sax.saxutils import escape, unescape
|
|
||||||
if sys.version_info[0] == 2:
|
|
||||||
from urlparse import urlparse
|
|
||||||
else:
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from .tokenizer import HTMLTokenizer
|
|
||||||
from .constants import tokenTypes
|
|
||||||
|
|
||||||
|
|
||||||
# Validates the payload of a data: URI: a type/subtype pair, an optional
# charset and/or base64 marker (in either order), then a comma and data.
content_type_rgx = re.compile(r'''
                              ^
                              # Match a content type <application>/<type>
                              (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                              # Match any character set and encoding
                              (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                               |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                              # Assume the rest is data
                              ,.*
                              $
                              ''',
                              re.VERBOSE)
|
|
||||||
|
|
||||||
class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
        'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
        'background', 'balance', 'bgcolor', 'bgproperties', 'border',
        'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
        'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
        'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
        'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
        'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
        'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
        'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
        'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
        'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
        'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
        'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
        'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
        'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
        'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
        'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
        'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
        'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
        'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
        'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
        'width', 'wrap', 'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
        'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
        'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
        'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
        'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
        'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
        'fill-opacity', 'fill-rule', 'font-family', 'font-size',
        'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
        'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
        'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
        'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
        'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
        'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
        'opacity', 'orient', 'origin', 'overline-position',
        'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
        'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
        'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
        'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
        'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
        'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
        'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
        'y1', 'y2', 'zoomAndPan']

    # Attributes whose value is a URI and therefore must pass the
    # protocol / content-type checks in allowed_token().
    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
                       'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']

    # SVG presentation attributes that may contain url(...) references,
    # which are stripped unless they are local fragment references.
    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
        'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
        'mask', 'stroke']

    # SVG elements on which xlink:href is only allowed when it points at
    # a local fragment (starts with '#').
    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
        'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
        'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
        'set', 'use']

    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
                            'ssh', 'sftp', 'rtsp', 'afs', 'data']

    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
    allowed_content_types = acceptable_content_types

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    # sanitize_html('<script> do_nasty_stuff() </script>')
    #  => &lt;script> do_nasty_stuff() &lt;/script>
    # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #  => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Filter one tokenizer token.

        Allowed tags are cleaned via allowed_token(), disallowed tags
        are converted to literal text via disallowed_token(), comments
        are dropped (implicit None return) and every other token type
        passes through untouched.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in list(tokenTypes.keys()):
            token_type = tokenTypes[token_type]

        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                          tokenTypes["EmptyTag"]):
            if token["name"] in self.allowed_elements:
                return self.allowed_token(token, token_type)
            else:
                return self.disallowed_token(token, token_type)
        elif token_type == tokenTypes["Comment"]:
            pass
        else:
            return token

    def allowed_token(self, token, token_type):
        """Strip disallowed attributes, URI schemes and CSS from an
        allowed tag token and return the cleaned token."""
        if "data" in token:
            # Iterate reversed so that for duplicate attributes the FIRST
            # occurrence wins (later dict insertions overwrite earlier ones).
            attrs = dict([(name, val) for name, val in
                          token["data"][::-1]
                          if name in self.allowed_attributes])
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                # Strip control chars/whitespace that browsers ignore when
                # resolving URIs, to defeat e.g. "java\0script:" smuggling.
                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse(val_unescaped)
                except ValueError:
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    # BUGFIX: only inspect the payload when the attribute
                    # survived the protocol check -- previously a data: URI
                    # with 'data' removed from allowed_protocols was deleted
                    # twice, raising KeyError.
                    elif uri.scheme == 'data':
                        m = content_type_rgx.match(uri.path)
                        if not m:
                            del attrs[attr]
                        elif m.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    # Drop non-local url(...) references.
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and re.search('^\s*[^#\s].*',
                                                        attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token

    def disallowed_token(self, token, token_type):
        """Convert a disallowed tag token into an escaped Characters token
        so the markup appears as literal text in the output."""
        if token_type == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
            token["data"] = "<%s%s>" % (token["name"], attrs)
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        # Mirror whichever token_type convention the input used.
        if token["type"] in list(tokenTypes.keys()):
            token["type"] = "Characters"
        else:
            token["type"] = tokenTypes["Characters"]

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return *style* with disallowed properties/values removed, or ''
        if the declaration block looks at all suspicious."""
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet: reject anything containing characters outside a very
        # conservative whitelist before attempting to parse it.
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand properties: every keyword must be whitelisted
                # or look like a color/length literal.
                for keyword in value.split():
                    if keyword not in self.acceptable_css_keywords and \
                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
|
|
||||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer that pipes every token through HTMLSanitizerMixin."""

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False, parser=None, track_positions=False):
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName, parser=parser, track_positions=track_positions)

    def __iter__(self):
        # sanitize_token() returns a falsy value for tokens that are
        # dropped entirely (e.g. comments); skip those.
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
|
|
@ -1,16 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from .. import treewalkers
|
|
||||||
|
|
||||||
from .htmlserializer import HTMLSerializer
|
|
||||||
|
|
||||||
|
|
||||||
def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    """Serialize a parse tree to (X)HTML text.

    ``tree`` names the treewalker used to traverse ``input``;
    ``serializer_opts`` are forwarded to HTMLSerializer.  Only the
    "html" output format is supported.
    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format != "html":
        raise ValueError("type must be html")
    serializer = HTMLSerializer(**serializer_opts)
    return serializer.render(walker(input), encoding)
|
|
@ -1,320 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
try:
|
|
||||||
from functools import reduce
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
from ..constants import voidElements, booleanAttributes, spaceCharacters
|
|
||||||
from ..constants import rcdataElements, entities, xmlEntities
|
|
||||||
from .. import utils
|
|
||||||
from xml.sax.saxutils import escape
|
|
||||||
|
|
||||||
# spaceCharacters arrives as a frozenset of characters; the serializer
# wants a single string of them.
spaceCharacters = "".join(spaceCharacters)

try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # No custom error handlers available: fall back to strict encoding.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    # Map code point -> entity name, skipping entities that expand to
    # more than one character.
    encode_entity_map = {}
    is_ucs4 = len("\U0010FFFF") == 1
    for k, v in list(entities.items()):
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
            continue
        if v != "&":
            if len(v) == 2:
                # On a narrow (UCS-2) build an astral character is stored
                # as a surrogate pair.
                v = utils.surrogatePairToCodepoint(v)
            else:
                v = ord(v)
            if not v in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """codecs error handler: replace unencodable characters with a
        named entity where one exists, otherwise with a hexadecimal
        numeric character reference."""
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            codepoints = []
            skip = False
            for i, c in enumerate(exc.object[exc.start:exc.end]):
                if skip:
                    # Second half of a surrogate pair was already consumed.
                    skip = False
                    continue
                index = i + exc.start
                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                    skip = True
                else:
                    codepoint = ord(c)
                codepoints.append(codepoint)
            for cp in codepoints:
                e = encode_entity_map.get(cp)
                if e:
                    res.append("&")
                    res.append(e)
                    if not e.endswith(";"):
                        res.append(";")
                else:
                    res.append("&#x%s;" % (hex(cp)[2:]))
            return ("".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    # The registration function is not needed after module import.
    del register_error
||||||
|
|
||||||
|
|
||||||
class HTMLSerializer(object):
|
|
||||||
|
|
||||||
# attribute quoting options
|
|
||||||
quote_attr_values = False
|
|
||||||
quote_char = '"'
|
|
||||||
use_best_quote_char = True
|
|
||||||
|
|
||||||
# tag syntax options
|
|
||||||
omit_optional_tags = True
|
|
||||||
minimize_boolean_attributes = True
|
|
||||||
use_trailing_solidus = False
|
|
||||||
space_before_trailing_solidus = True
|
|
||||||
|
|
||||||
# escaping options
|
|
||||||
escape_lt_in_attrs = False
|
|
||||||
escape_rcdata = False
|
|
||||||
resolve_entities = True
|
|
||||||
|
|
||||||
# miscellaneous options
|
|
||||||
alphabetical_attributes = False
|
|
||||||
inject_meta_charset = True
|
|
||||||
strip_whitespace = False
|
|
||||||
sanitize = False
|
|
||||||
|
|
||||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
|
||||||
"omit_optional_tags", "minimize_boolean_attributes",
|
|
||||||
"use_trailing_solidus", "space_before_trailing_solidus",
|
|
||||||
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
|
|
||||||
"alphabetical_attributes", "inject_meta_charset",
|
|
||||||
"strip_whitespace", "sanitize")
|
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
"""Initialize HTMLSerializer.
|
|
||||||
|
|
||||||
Keyword options (default given first unless specified) include:
|
|
||||||
|
|
||||||
inject_meta_charset=True|False
|
|
||||||
Whether it insert a meta element to define the character set of the
|
|
||||||
document.
|
|
||||||
quote_attr_values=True|False
|
|
||||||
Whether to quote attribute values that don't require quoting
|
|
||||||
per HTML5 parsing rules.
|
|
||||||
quote_char=u'"'|u"'"
|
|
||||||
Use given quote character for attribute quoting. Default is to
|
|
||||||
use double quote unless attribute value contains a double quote,
|
|
||||||
in which case single quotes are used instead.
|
|
||||||
escape_lt_in_attrs=False|True
|
|
||||||
Whether to escape < in attribute values.
|
|
||||||
escape_rcdata=False|True
|
|
||||||
Whether to escape characters that need to be escaped within normal
|
|
||||||
elements within rcdata elements such as style.
|
|
||||||
resolve_entities=True|False
|
|
||||||
Whether to resolve named character entities that appear in the
|
|
||||||
source tree. The XML predefined entities < > & " '
|
|
||||||
are unaffected by this setting.
|
|
||||||
strip_whitespace=False|True
|
|
||||||
Whether to remove semantically meaningless whitespace. (This
|
|
||||||
compresses all whitespace to a single space except within pre.)
|
|
||||||
minimize_boolean_attributes=True|False
|
|
||||||
Shortens boolean attributes to give just the attribute value,
|
|
||||||
for example <input disabled="disabled"> becomes <input disabled>.
|
|
||||||
use_trailing_solidus=False|True
|
|
||||||
Includes a close-tag slash at the end of the start tag of void
|
|
||||||
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
|
|
||||||
space_before_trailing_solidus=True|False
|
|
||||||
Places a space immediately before the closing slash in a tag
|
|
||||||
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
|
|
||||||
sanitize=False|True
|
|
||||||
Strip all unsafe or unknown constructs from output.
|
|
||||||
See `html5lib user documentation`_
|
|
||||||
omit_optional_tags=True|False
|
|
||||||
Omit start/end tags that are optional.
|
|
||||||
alphabetical_attributes=False|True
|
|
||||||
Reorder attributes to be in alphabetical order.
|
|
||||||
|
|
||||||
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
|
||||||
"""
|
|
||||||
if 'quote_char' in kwargs:
|
|
||||||
self.use_best_quote_char = False
|
|
||||||
for attr in self.options:
|
|
||||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
|
||||||
self.errors = []
|
|
||||||
self.strict = False
|
|
||||||
|
|
||||||
def encode(self, string):
|
|
||||||
assert(isinstance(string, text_type))
|
|
||||||
if self.encoding:
|
|
||||||
return string.encode(self.encoding, unicode_encode_errors)
|
|
||||||
else:
|
|
||||||
return string
|
|
||||||
|
|
||||||
def encodeStrict(self, string):
|
|
||||||
assert(isinstance(string, text_type))
|
|
||||||
if self.encoding:
|
|
||||||
return string.encode(self.encoding, "strict")
|
|
||||||
else:
|
|
||||||
return string
|
|
||||||
|
|
||||||
def serialize(self, treewalker, encoding=None):
|
|
||||||
self.encoding = encoding
|
|
||||||
in_cdata = False
|
|
||||||
self.errors = []
|
|
||||||
|
|
||||||
if encoding and self.inject_meta_charset:
|
|
||||||
from ..filters.inject_meta_charset import Filter
|
|
||||||
treewalker = Filter(treewalker, encoding)
|
|
||||||
# WhitespaceFilter should be used before OptionalTagFilter
|
|
||||||
# for maximum efficiently of this latter filter
|
|
||||||
if self.strip_whitespace:
|
|
||||||
from ..filters.whitespace import Filter
|
|
||||||
treewalker = Filter(treewalker)
|
|
||||||
if self.sanitize:
|
|
||||||
from ..filters.sanitizer import Filter
|
|
||||||
treewalker = Filter(treewalker)
|
|
||||||
if self.omit_optional_tags:
|
|
||||||
from ..filters.optionaltags import Filter
|
|
||||||
treewalker = Filter(treewalker)
|
|
||||||
# Alphabetical attributes must be last, as other filters
|
|
||||||
# could add attributes and alter the order
|
|
||||||
if self.alphabetical_attributes:
|
|
||||||
from ..filters.alphabeticalattributes import Filter
|
|
||||||
treewalker = Filter(treewalker)
|
|
||||||
|
|
||||||
for token in treewalker:
|
|
||||||
type = token["type"]
|
|
||||||
if type == "Doctype":
|
|
||||||
doctype = "<!DOCTYPE %s" % token["name"]
|
|
||||||
|
|
||||||
if token["publicId"]:
|
|
||||||
doctype += ' PUBLIC "%s"' % token["publicId"]
|
|
||||||
elif token["systemId"]:
|
|
||||||
doctype += " SYSTEM"
|
|
||||||
if token["systemId"]:
|
|
||||||
if token["systemId"].find('"') >= 0:
|
|
||||||
if token["systemId"].find("'") >= 0:
|
|
||||||
self.serializeError("System identifer contains both single and double quote characters")
|
|
||||||
quote_char = "'"
|
|
||||||
else:
|
|
||||||
quote_char = '"'
|
|
||||||
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
|
|
||||||
|
|
||||||
doctype += ">"
|
|
||||||
yield self.encodeStrict(doctype)
|
|
||||||
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
|
||||||
if type == "SpaceCharacters" or in_cdata:
|
|
||||||
if in_cdata and token["data"].find("</") >= 0:
|
|
||||||
self.serializeError("Unexpected </ in CDATA")
|
|
||||||
yield self.encode(token["data"])
|
|
||||||
else:
|
|
||||||
yield self.encode(escape(token["data"]))
|
|
||||||
|
|
||||||
elif type in ("StartTag", "EmptyTag"):
|
|
||||||
name = token["name"]
|
|
||||||
yield self.encodeStrict("<%s" % name)
|
|
||||||
if name in rcdataElements and not self.escape_rcdata:
|
|
||||||
in_cdata = True
|
|
||||||
elif in_cdata:
|
|
||||||
self.serializeError("Unexpected child element of a CDATA element")
|
|
||||||
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
|
||||||
# TODO: Add namespace support here
|
|
||||||
k = attr_name
|
|
||||||
v = attr_value
|
|
||||||
yield self.encodeStrict(' ')
|
|
||||||
|
|
||||||
yield self.encodeStrict(k)
|
|
||||||
if not self.minimize_boolean_attributes or \
|
|
||||||
(k not in booleanAttributes.get(name, tuple())
|
|
||||||
and k not in booleanAttributes.get("", tuple())):
|
|
||||||
yield self.encodeStrict("=")
|
|
||||||
if self.quote_attr_values or not v:
|
|
||||||
quote_attr = True
|
|
||||||
else:
|
|
||||||
quote_attr = reduce(lambda x, y: x or (y in v),
|
|
||||||
spaceCharacters + ">\"'=", False)
|
|
||||||
v = v.replace("&", "&")
|
|
||||||
if self.escape_lt_in_attrs:
|
|
||||||
v = v.replace("<", "<")
|
|
||||||
if quote_attr:
|
|
||||||
quote_char = self.quote_char
|
|
||||||
if self.use_best_quote_char:
|
|
||||||
if "'" in v and '"' not in v:
|
|
||||||
quote_char = '"'
|
|
||||||
elif '"' in v and "'" not in v:
|
|
||||||
quote_char = "'"
|
|
||||||
if quote_char == "'":
|
|
||||||
v = v.replace("'", "'")
|
|
||||||
else:
|
|
||||||
v = v.replace('"', """)
|
|
||||||
yield self.encodeStrict(quote_char)
|
|
||||||
yield self.encode(v)
|
|
||||||
yield self.encodeStrict(quote_char)
|
|
||||||
else:
|
|
||||||
yield self.encode(v)
|
|
||||||
if name in voidElements and self.use_trailing_solidus:
|
|
||||||
if self.space_before_trailing_solidus:
|
|
||||||
yield self.encodeStrict(" /")
|
|
||||||
else:
|
|
||||||
yield self.encodeStrict("/")
|
|
||||||
yield self.encode(">")
|
|
||||||
|
|
||||||
elif type == "EndTag":
|
|
||||||
name = token["name"]
|
|
||||||
if name in rcdataElements:
|
|
||||||
in_cdata = False
|
|
||||||
elif in_cdata:
|
|
||||||
self.serializeError("Unexpected child element of a CDATA element")
|
|
||||||
yield self.encodeStrict("</%s>" % name)
|
|
||||||
|
|
||||||
elif type == "Comment":
|
|
||||||
data = token["data"]
|
|
||||||
if data.find("--") >= 0:
|
|
||||||
self.serializeError("Comment contains --")
|
|
||||||
yield self.encodeStrict("<!--%s-->" % token["data"])
|
|
||||||
|
|
||||||
elif type == "Entity":
|
|
||||||
name = token["name"]
|
|
||||||
key = name + ";"
|
|
||||||
if not key in entities:
|
|
||||||
self.serializeError("Entity %s not recognized" % name)
|
|
||||||
if self.resolve_entities and key not in xmlEntities:
|
|
||||||
data = entities[key]
|
|
||||||
else:
|
|
||||||
data = "&%s;" % name
|
|
||||||
yield self.encodeStrict(data)
|
|
||||||
|
|
||||||
else:
|
|
||||||
self.serializeError(token["data"])
|
|
||||||
|
|
||||||
def render(self, treewalker, encoding=None):
|
|
||||||
if encoding:
|
|
||||||
return b"".join(list(self.serialize(treewalker, encoding)))
|
|
||||||
else:
|
|
||||||
return "".join(list(self.serialize(treewalker)))
|
|
||||||
|
|
||||||
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
|
||||||
# XXX The idea is to make data mandatory.
|
|
||||||
self.errors.append(data)
|
|
||||||
if self.strict:
|
|
||||||
raise SerializeError
|
|
||||||
|
|
||||||
|
|
||||||
def SerializeError(Exception):
|
|
||||||
"""Error in serialized tree"""
|
|
||||||
pass
|
|
File diff suppressed because it is too large
Load Diff
@ -1,44 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from xml.sax.xmlreader import AttributesNSImpl
|
|
||||||
|
|
||||||
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
|
|
||||||
|
|
||||||
prefix_mapping = {}
|
|
||||||
for prefix, localName, namespace in adjustForeignAttributes.values():
|
|
||||||
if prefix is not None:
|
|
||||||
prefix_mapping[prefix] = namespace
|
|
||||||
|
|
||||||
|
|
||||||
def to_sax(walker, handler):
|
|
||||||
"""Call SAX-like content handler based on treewalker walker"""
|
|
||||||
handler.startDocument()
|
|
||||||
for prefix, namespace in prefix_mapping.items():
|
|
||||||
handler.startPrefixMapping(prefix, namespace)
|
|
||||||
|
|
||||||
for token in walker:
|
|
||||||
type = token["type"]
|
|
||||||
if type == "Doctype":
|
|
||||||
continue
|
|
||||||
elif type in ("StartTag", "EmptyTag"):
|
|
||||||
attrs = AttributesNSImpl(token["data"],
|
|
||||||
unadjustForeignAttributes)
|
|
||||||
handler.startElementNS((token["namespace"], token["name"]),
|
|
||||||
token["name"],
|
|
||||||
attrs)
|
|
||||||
if type == "EmptyTag":
|
|
||||||
handler.endElementNS((token["namespace"], token["name"]),
|
|
||||||
token["name"])
|
|
||||||
elif type == "EndTag":
|
|
||||||
handler.endElementNS((token["namespace"], token["name"]),
|
|
||||||
token["name"])
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
|
||||||
handler.characters(token["data"])
|
|
||||||
elif type == "Comment":
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
assert False, "Unknown token type"
|
|
||||||
|
|
||||||
for prefix, namespace in prefix_mapping.items():
|
|
||||||
handler.endPrefixMapping(prefix)
|
|
||||||
handler.endDocument()
|
|
@ -1,76 +0,0 @@
|
|||||||
"""A collection of modules for building different kinds of tree from
|
|
||||||
HTML documents.
|
|
||||||
|
|
||||||
To create a treebuilder for a new type of tree, you need to do
|
|
||||||
implement several things:
|
|
||||||
|
|
||||||
1) A set of classes for various types of elements: Document, Doctype,
|
|
||||||
Comment, Element. These must implement the interface of
|
|
||||||
_base.treebuilders.Node (although comment nodes have a different
|
|
||||||
signature for their constructor, see treebuilders.etree.Comment)
|
|
||||||
Textual content may also be implemented as another node type, or not, as
|
|
||||||
your tree implementation requires.
|
|
||||||
|
|
||||||
2) A treebuilder object (called TreeBuilder by convention) that
|
|
||||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
|
||||||
documentClass - the class to use for the bottommost node of a document
|
|
||||||
elementClass - the class to use for HTML Elements
|
|
||||||
commentClass - the class to use for comments
|
|
||||||
doctypeClass - the class to use for doctypes
|
|
||||||
It also has one required method:
|
|
||||||
getDocument - Returns the root node of the complete document tree
|
|
||||||
|
|
||||||
3) If you wish to run the unit tests, you must also create a
|
|
||||||
testSerializer method on your treebuilder which accepts a node and
|
|
||||||
returns a string containing Node and its children serialized according
|
|
||||||
to the format used in the unittests
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from ..utils import default_etree
|
|
||||||
|
|
||||||
treeBuilderCache = {}
|
|
||||||
|
|
||||||
|
|
||||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
|
||||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
|
||||||
values are:
|
|
||||||
|
|
||||||
"dom" - A generic builder for DOM implementations, defaulting to
|
|
||||||
a xml.dom.minidom based implementation.
|
|
||||||
"etree" - A generic builder for tree implementations exposing an
|
|
||||||
ElementTree-like interface, defaulting to
|
|
||||||
xml.etree.cElementTree if available and
|
|
||||||
xml.etree.ElementTree if not.
|
|
||||||
"lxml" - A etree-based builder for lxml.etree, handling
|
|
||||||
limitations of lxml's implementation.
|
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
|
||||||
module implementing the tree type e.g.
|
|
||||||
xml.etree.ElementTree or xml.etree.cElementTree."""
|
|
||||||
|
|
||||||
treeType = treeType.lower()
|
|
||||||
if treeType not in treeBuilderCache:
|
|
||||||
if treeType == "dom":
|
|
||||||
from . import dom
|
|
||||||
# Come up with a sane default (pref. from the stdlib)
|
|
||||||
if implementation is None:
|
|
||||||
from xml.dom import minidom
|
|
||||||
implementation = minidom
|
|
||||||
# NEVER cache here, caching is done in the dom submodule
|
|
||||||
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
|
||||||
elif treeType == "lxml":
|
|
||||||
from . import etree_lxml
|
|
||||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
|
||||||
elif treeType == "etree":
|
|
||||||
from . import etree
|
|
||||||
if implementation is None:
|
|
||||||
implementation = default_etree
|
|
||||||
# NEVER cache here, caching is done in the etree submodule
|
|
||||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
|
||||||
else:
|
|
||||||
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
|
|
||||||
return treeBuilderCache.get(treeType)
|
|
@ -1,390 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
from ..constants import scopingElements, tableInsertModeElements, namespaces
|
|
||||||
|
|
||||||
# The scope markers are inserted when entering object elements,
|
|
||||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
|
||||||
# from "leaking" into tables, object elements, and marquees.
|
|
||||||
Marker = None
|
|
||||||
|
|
||||||
listElementsMap = {
|
|
||||||
None: (frozenset(scopingElements), False),
|
|
||||||
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
|
|
||||||
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
|
|
||||||
(namespaces["html"], "ul")])), False),
|
|
||||||
"table": (frozenset([(namespaces["html"], "html"),
|
|
||||||
(namespaces["html"], "table")]), False),
|
|
||||||
"select": (frozenset([(namespaces["html"], "optgroup"),
|
|
||||||
(namespaces["html"], "option")]), True)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class Node(object):
|
|
||||||
def __init__(self, name):
|
|
||||||
"""Node representing an item in the tree.
|
|
||||||
name - The tag name associated with the node
|
|
||||||
parent - The parent of the current node (or None for the document node)
|
|
||||||
value - The value of the current node (applies to text nodes and
|
|
||||||
comments
|
|
||||||
attributes - a dict holding name, value pairs for attributes of the node
|
|
||||||
childNodes - a list of child nodes of the current node. This must
|
|
||||||
include all elements but not necessarily other node types
|
|
||||||
_flags - A list of miscellaneous flags that can be set on the node
|
|
||||||
"""
|
|
||||||
self.name = name
|
|
||||||
self.parent = None
|
|
||||||
self.value = None
|
|
||||||
self.attributes = {}
|
|
||||||
self.childNodes = []
|
|
||||||
self._flags = []
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
|
|
||||||
for name, value in
|
|
||||||
self.attributes.items()])
|
|
||||||
if attributesStr:
|
|
||||||
return "<%s %s>" % (self.name, attributesStr)
|
|
||||||
else:
|
|
||||||
return "<%s>" % (self.name)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return "<%s>" % (self.name)
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
"""Insert node as a child of the current node
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
"""Insert data as text in the current node, positioned before the
|
|
||||||
start of node insertBefore or to the end of the node's text.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
"""Insert node as a child of the current node, before refNode in the
|
|
||||||
list of child nodes. Raises ValueError if refNode is not a child of
|
|
||||||
the current node"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
"""Remove node from the children of the current node
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
"""Move all the children of the current node to newParent.
|
|
||||||
This is needed so that trees that don't store text as nodes move the
|
|
||||||
text in the correct way
|
|
||||||
"""
|
|
||||||
# XXX - should this method be made more general?
|
|
||||||
for child in self.childNodes:
|
|
||||||
newParent.appendChild(child)
|
|
||||||
self.childNodes = []
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
"""Return a shallow copy of the current node i.e. a node with the same
|
|
||||||
name and attributes but with no parent or child nodes
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
"""Return true if the node has children or text, false otherwise
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
|
|
||||||
class ActiveFormattingElements(list):
|
|
||||||
def append(self, node):
|
|
||||||
equalCount = 0
|
|
||||||
if node != Marker:
|
|
||||||
for element in self[::-1]:
|
|
||||||
if element == Marker:
|
|
||||||
break
|
|
||||||
if self.nodesEqual(element, node):
|
|
||||||
equalCount += 1
|
|
||||||
if equalCount == 3:
|
|
||||||
self.remove(element)
|
|
||||||
break
|
|
||||||
list.append(self, node)
|
|
||||||
|
|
||||||
def nodesEqual(self, node1, node2):
|
|
||||||
if not node1.nameTuple == node2.nameTuple:
|
|
||||||
return False
|
|
||||||
|
|
||||||
if not node1.attributes == node2.attributes:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilder(object):
|
|
||||||
"""Base treebuilder implementation
|
|
||||||
documentClass - the class to use for the bottommost node of a document
|
|
||||||
elementClass - the class to use for HTML Elements
|
|
||||||
commentClass - the class to use for comments
|
|
||||||
doctypeClass - the class to use for doctypes
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Document class
|
|
||||||
documentClass = None
|
|
||||||
|
|
||||||
# The class to use for creating a node
|
|
||||||
elementClass = None
|
|
||||||
|
|
||||||
# The class to use for creating comments
|
|
||||||
commentClass = None
|
|
||||||
|
|
||||||
# The class to use for creating doctypes
|
|
||||||
doctypeClass = None
|
|
||||||
|
|
||||||
# Fragment class
|
|
||||||
fragmentClass = None
|
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements):
|
|
||||||
if namespaceHTMLElements:
|
|
||||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
|
||||||
else:
|
|
||||||
self.defaultNamespace = None
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.openElements = []
|
|
||||||
self.activeFormattingElements = ActiveFormattingElements()
|
|
||||||
|
|
||||||
# XXX - rename these to headElement, formElement
|
|
||||||
self.headPointer = None
|
|
||||||
self.formPointer = None
|
|
||||||
|
|
||||||
self.insertFromTable = False
|
|
||||||
|
|
||||||
self.document = self.documentClass()
|
|
||||||
|
|
||||||
def elementInScope(self, target, variant=None):
|
|
||||||
|
|
||||||
# If we pass a node in we match that. if we pass a string
|
|
||||||
# match any node with that name
|
|
||||||
exactNode = hasattr(target, "nameTuple")
|
|
||||||
|
|
||||||
listElements, invert = listElementsMap[variant]
|
|
||||||
|
|
||||||
for node in reversed(self.openElements):
|
|
||||||
if (node.name == target and not exactNode or
|
|
||||||
node == target and exactNode):
|
|
||||||
return True
|
|
||||||
elif (invert ^ (node.nameTuple in listElements)):
|
|
||||||
return False
|
|
||||||
|
|
||||||
assert False # We should never reach this point
|
|
||||||
|
|
||||||
def reconstructActiveFormattingElements(self):
|
|
||||||
# Within this algorithm the order of steps described in the
|
|
||||||
# specification is not quite the same as the order of steps in the
|
|
||||||
# code. It should still do the same though.
|
|
||||||
|
|
||||||
# Step 1: stop the algorithm when there's nothing to do.
|
|
||||||
if not self.activeFormattingElements:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
|
||||||
i = len(self.activeFormattingElements) - 1
|
|
||||||
entry = self.activeFormattingElements[i]
|
|
||||||
if entry == Marker or entry in self.openElements:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Step 6
|
|
||||||
while entry != Marker and entry not in self.openElements:
|
|
||||||
if i == 0:
|
|
||||||
# This will be reset to 0 below
|
|
||||||
i = -1
|
|
||||||
break
|
|
||||||
i -= 1
|
|
||||||
# Step 5: let entry be one earlier in the list.
|
|
||||||
entry = self.activeFormattingElements[i]
|
|
||||||
|
|
||||||
while True:
|
|
||||||
# Step 7
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# Step 8
|
|
||||||
entry = self.activeFormattingElements[i]
|
|
||||||
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
|
|
||||||
|
|
||||||
# Step 9
|
|
||||||
element = self.insertElement({"type": "StartTag",
|
|
||||||
"name": clone.name,
|
|
||||||
"namespace": clone.namespace,
|
|
||||||
"data": clone.attributes})
|
|
||||||
|
|
||||||
# Step 10
|
|
||||||
self.activeFormattingElements[i] = element
|
|
||||||
|
|
||||||
# Step 11
|
|
||||||
if element == self.activeFormattingElements[-1]:
|
|
||||||
break
|
|
||||||
|
|
||||||
def clearActiveFormattingElements(self):
|
|
||||||
entry = self.activeFormattingElements.pop()
|
|
||||||
while self.activeFormattingElements and entry != Marker:
|
|
||||||
entry = self.activeFormattingElements.pop()
|
|
||||||
|
|
||||||
def elementInActiveFormattingElements(self, name):
|
|
||||||
"""Check if an element exists between the end of the active
|
|
||||||
formatting elements and the last marker. If it does, return it, else
|
|
||||||
return false"""
|
|
||||||
|
|
||||||
for item in self.activeFormattingElements[::-1]:
|
|
||||||
# Check for Marker first because if it's a Marker it doesn't have a
|
|
||||||
# name attribute.
|
|
||||||
if item == Marker:
|
|
||||||
break
|
|
||||||
elif item.name == name:
|
|
||||||
return item
|
|
||||||
return False
|
|
||||||
|
|
||||||
def insertRoot(self, token):
|
|
||||||
element = self.createElement(token)
|
|
||||||
self.openElements.append(element)
|
|
||||||
self.document.appendChild(element)
|
|
||||||
|
|
||||||
def insertDoctype(self, token):
|
|
||||||
name = token["name"]
|
|
||||||
publicId = token["publicId"]
|
|
||||||
systemId = token["systemId"]
|
|
||||||
|
|
||||||
doctype = self.doctypeClass(name, publicId, systemId)
|
|
||||||
self.document.appendChild(doctype)
|
|
||||||
|
|
||||||
def insertComment(self, token, parent=None):
|
|
||||||
if parent is None:
|
|
||||||
parent = self.openElements[-1]
|
|
||||||
parent.appendChild(self.commentClass(token["data"]))
|
|
||||||
|
|
||||||
def createElement(self, token):
|
|
||||||
"""Create an element but don't insert it anywhere"""
|
|
||||||
name = token["name"]
|
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
|
||||||
element = self.elementClass(name, namespace)
|
|
||||||
element.attributes = token["data"]
|
|
||||||
return element
|
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
|
||||||
for attr, value in attrs.items():
|
|
||||||
if attr not in self.openElements[0].attributes:
|
|
||||||
self.openElements[0].attributes[attr] = value
|
|
||||||
|
|
||||||
def apply_body_attributes(self, attrs):
|
|
||||||
for attr, value in attrs.items():
|
|
||||||
if attr not in self.openElements[1].attributes:
|
|
||||||
self.openElements[1].attributes[attr] = value
|
|
||||||
|
|
||||||
def _getInsertFromTable(self):
|
|
||||||
return self._insertFromTable
|
|
||||||
|
|
||||||
def _setInsertFromTable(self, value):
|
|
||||||
"""Switch the function used to insert an element from the
|
|
||||||
normal one to the misnested table one and back again"""
|
|
||||||
self._insertFromTable = value
|
|
||||||
if value:
|
|
||||||
self.insertElement = self.insertElementTable
|
|
||||||
else:
|
|
||||||
self.insertElement = self.insertElementNormal
|
|
||||||
|
|
||||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
|
||||||
|
|
||||||
def insertElementNormal(self, token):
|
|
||||||
name = token["name"]
|
|
||||||
assert isinstance(name, text_type), "Element %s not unicode" % name
|
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
|
||||||
element = self.elementClass(name, namespace)
|
|
||||||
element.attributes = token["data"]
|
|
||||||
self.openElements[-1].appendChild(element)
|
|
||||||
self.openElements.append(element)
|
|
||||||
return element
|
|
||||||
|
|
||||||
def insertElementTable(self, token):
|
|
||||||
"""Create an element and insert it into the tree"""
|
|
||||||
element = self.createElement(token)
|
|
||||||
if self.openElements[-1].name not in tableInsertModeElements:
|
|
||||||
return self.insertElementNormal(token)
|
|
||||||
else:
|
|
||||||
# We should be in the InTable mode. This means we want to do
|
|
||||||
# special magic element rearranging
|
|
||||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
|
||||||
if insertBefore is None:
|
|
||||||
parent.appendChild(element)
|
|
||||||
else:
|
|
||||||
parent.insertBefore(element, insertBefore)
|
|
||||||
self.openElements.append(element)
|
|
||||||
return element
|
|
||||||
|
|
||||||
def insertText(self, data, parent=None):
|
|
||||||
"""Insert text data."""
|
|
||||||
if parent is None:
|
|
||||||
parent = self.openElements[-1]
|
|
||||||
|
|
||||||
if (not self.insertFromTable or (self.insertFromTable and
|
|
||||||
self.openElements[-1].name
|
|
||||||
not in tableInsertModeElements)):
|
|
||||||
parent.insertText(data)
|
|
||||||
else:
|
|
||||||
# We should be in the InTable mode. This means we want to do
|
|
||||||
# special magic element rearranging
|
|
||||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
|
||||||
parent.insertText(data, insertBefore)
|
|
||||||
|
|
||||||
def getTableMisnestedNodePosition(self):
|
|
||||||
"""Get the foster parent element, and sibling to insert before
|
|
||||||
(or None) when inserting a misnested table node"""
|
|
||||||
# The foster parent element is the one which comes before the most
|
|
||||||
# recently opened table element
|
|
||||||
# XXX - this is really inelegant
|
|
||||||
lastTable = None
|
|
||||||
fosterParent = None
|
|
||||||
insertBefore = None
|
|
||||||
for elm in self.openElements[::-1]:
|
|
||||||
if elm.name == "table":
|
|
||||||
lastTable = elm
|
|
||||||
break
|
|
||||||
if lastTable:
|
|
||||||
# XXX - we should really check that this parent is actually a
|
|
||||||
# node here
|
|
||||||
if lastTable.parent:
|
|
||||||
fosterParent = lastTable.parent
|
|
||||||
insertBefore = lastTable
|
|
||||||
else:
|
|
||||||
fosterParent = self.openElements[
|
|
||||||
self.openElements.index(lastTable) - 1]
|
|
||||||
else:
|
|
||||||
fosterParent = self.openElements[0]
|
|
||||||
return fosterParent, insertBefore
|
|
||||||
|
|
||||||
def generateImpliedEndTags(self, exclude=None):
|
|
||||||
name = self.openElements[-1].name
|
|
||||||
# XXX td, th and tr are not actually needed
|
|
||||||
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
|
|
||||||
and name != exclude):
|
|
||||||
self.openElements.pop()
|
|
||||||
# XXX This is not entirely what the specification says. We should
|
|
||||||
# investigate it more closely.
|
|
||||||
self.generateImpliedEndTags(exclude)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
"Return the final tree"
|
|
||||||
return self.document
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
"Return the final fragment"
|
|
||||||
# assert self.innerHTML
|
|
||||||
fragment = self.fragmentClass()
|
|
||||||
self.openElements[0].reparentChildren(fragment)
|
|
||||||
return fragment
|
|
||||||
|
|
||||||
def testSerializer(self, node):
|
|
||||||
"""Serialize the subtree of node in the format required by unit tests
|
|
||||||
node - the node from which to start serializing"""
|
|
||||||
raise NotImplementedError
|
|
@ -1,227 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
from xml.dom import minidom, Node
|
|
||||||
import weakref
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from .. import constants
|
|
||||||
from ..constants import namespaces
|
|
||||||
from ..utils import moduleFactoryFactory
|
|
||||||
|
|
||||||
|
|
||||||
def getDomBuilder(DomImplementation):
|
|
||||||
Dom = DomImplementation
|
|
||||||
|
|
||||||
class AttrList(object):
|
|
||||||
def __init__(self, element):
|
|
||||||
self.element = element
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return list(self.element.attributes.items()).__iter__()
|
|
||||||
|
|
||||||
def __setitem__(self, name, value):
|
|
||||||
self.element.setAttribute(name, value)
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(list(self.element.attributes.items()))
|
|
||||||
|
|
||||||
def items(self):
|
|
||||||
return [(item[0], item[1]) for item in
|
|
||||||
list(self.element.attributes.items())]
|
|
||||||
|
|
||||||
def keys(self):
|
|
||||||
return list(self.element.attributes.keys())
|
|
||||||
|
|
||||||
def __getitem__(self, name):
|
|
||||||
return self.element.getAttribute(name)
|
|
||||||
|
|
||||||
def __contains__(self, name):
|
|
||||||
if isinstance(name, tuple):
|
|
||||||
raise NotImplementedError
|
|
||||||
else:
|
|
||||||
return self.element.hasAttribute(name)
|
|
||||||
|
|
||||||
class NodeBuilder(_base.Node):
|
|
||||||
def __init__(self, element):
|
|
||||||
_base.Node.__init__(self, element.nodeName)
|
|
||||||
self.element = element
|
|
||||||
|
|
||||||
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
|
|
||||||
and self.element.namespaceURI or None)
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
node.parent = self
|
|
||||||
self.element.appendChild(node.element)
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
text = self.element.ownerDocument.createTextNode(data)
|
|
||||||
if insertBefore:
|
|
||||||
self.element.insertBefore(text, insertBefore.element)
|
|
||||||
else:
|
|
||||||
self.element.appendChild(text)
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
self.element.insertBefore(node.element, refNode.element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
if node.element.parentNode == self.element:
|
|
||||||
self.element.removeChild(node.element)
|
|
||||||
node.parent = None
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
while self.element.hasChildNodes():
|
|
||||||
child = self.element.firstChild
|
|
||||||
self.element.removeChild(child)
|
|
||||||
newParent.element.appendChild(child)
|
|
||||||
self.childNodes = []
|
|
||||||
|
|
||||||
def getAttributes(self):
|
|
||||||
return AttrList(self.element)
|
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
|
||||||
if attributes:
|
|
||||||
for name, value in list(attributes.items()):
|
|
||||||
if isinstance(name, tuple):
|
|
||||||
if name[0] is not None:
|
|
||||||
qualifiedName = (name[0] + ":" + name[1])
|
|
||||||
else:
|
|
||||||
qualifiedName = name[1]
|
|
||||||
self.element.setAttributeNS(name[2], qualifiedName,
|
|
||||||
value)
|
|
||||||
else:
|
|
||||||
self.element.setAttribute(
|
|
||||||
name, value)
|
|
||||||
attributes = property(getAttributes, setAttributes)
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return NodeBuilder(self.element.cloneNode(False))
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
return self.element.hasChildNodes()
|
|
||||||
|
|
||||||
def getNameTuple(self):
|
|
||||||
if self.namespace is None:
|
|
||||||
return namespaces["html"], self.name
|
|
||||||
else:
|
|
||||||
return self.namespace, self.name
|
|
||||||
|
|
||||||
nameTuple = property(getNameTuple)
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
def documentClass(self):
|
|
||||||
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
|
|
||||||
return weakref.proxy(self)
|
|
||||||
|
|
||||||
def insertDoctype(self, token):
|
|
||||||
name = token["name"]
|
|
||||||
publicId = token["publicId"]
|
|
||||||
systemId = token["systemId"]
|
|
||||||
|
|
||||||
domimpl = Dom.getDOMImplementation()
|
|
||||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
|
||||||
self.document.appendChild(NodeBuilder(doctype))
|
|
||||||
if Dom == minidom:
|
|
||||||
doctype.ownerDocument = self.dom
|
|
||||||
|
|
||||||
def elementClass(self, name, namespace=None):
|
|
||||||
if namespace is None and self.defaultNamespace is None:
|
|
||||||
node = self.dom.createElement(name)
|
|
||||||
else:
|
|
||||||
node = self.dom.createElementNS(namespace, name)
|
|
||||||
|
|
||||||
return NodeBuilder(node)
|
|
||||||
|
|
||||||
def commentClass(self, data):
|
|
||||||
return NodeBuilder(self.dom.createComment(data))
|
|
||||||
|
|
||||||
def fragmentClass(self):
|
|
||||||
return NodeBuilder(self.dom.createDocumentFragment())
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
self.dom.appendChild(node.element)
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
return self.dom
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
return _base.TreeBuilder.getFragment(self).element
|
|
||||||
|
|
||||||
def insertText(self, data, parent=None):
|
|
||||||
data = data
|
|
||||||
if parent != self:
|
|
||||||
_base.TreeBuilder.insertText(self, data, parent)
|
|
||||||
else:
|
|
||||||
# HACK: allow text nodes as children of the document node
|
|
||||||
if hasattr(self.dom, '_child_node_types'):
|
|
||||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
|
||||||
self.dom._child_node_types = list(self.dom._child_node_types)
|
|
||||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
|
||||||
self.dom.appendChild(self.dom.createTextNode(data))
|
|
||||||
|
|
||||||
implementation = DomImplementation
|
|
||||||
name = None
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
element.normalize()
|
|
||||||
rv = []
|
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
|
||||||
if element.name:
|
|
||||||
if element.publicId or element.systemId:
|
|
||||||
publicId = element.publicId or ""
|
|
||||||
systemId = element.systemId or ""
|
|
||||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
|
||||||
(' ' * indent, element.name, publicId, systemId))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
|
||||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
|
||||||
rv.append("#document")
|
|
||||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
|
||||||
rv.append("#document-fragment")
|
|
||||||
elif element.nodeType == Node.COMMENT_NODE:
|
|
||||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
|
|
||||||
elif element.nodeType == Node.TEXT_NODE:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
|
|
||||||
else:
|
|
||||||
if (hasattr(element, "namespaceURI") and
|
|
||||||
element.namespaceURI is not None):
|
|
||||||
name = "%s %s" % (constants.prefixes[element.namespaceURI],
|
|
||||||
element.nodeName)
|
|
||||||
else:
|
|
||||||
name = element.nodeName
|
|
||||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
|
||||||
if element.hasAttributes():
|
|
||||||
attributes = []
|
|
||||||
for i in range(len(element.attributes)):
|
|
||||||
attr = element.attributes.item(i)
|
|
||||||
name = attr.nodeName
|
|
||||||
value = attr.value
|
|
||||||
ns = attr.namespaceURI
|
|
||||||
if ns:
|
|
||||||
name = "%s %s" % (constants.prefixes[ns], attr.localName)
|
|
||||||
else:
|
|
||||||
name = attr.nodeName
|
|
||||||
attributes.append((name, value))
|
|
||||||
|
|
||||||
for name, value in sorted(attributes):
|
|
||||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
|
||||||
indent += 2
|
|
||||||
for child in element.childNodes:
|
|
||||||
serializeElement(child, indent)
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
||||||
|
|
||||||
return locals()
|
|
||||||
|
|
||||||
|
|
||||||
# The actual means to get a module!
|
|
||||||
getDomModule = moduleFactoryFactory(getDomBuilder)
|
|
@ -1,340 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from .. import ihatexml
|
|
||||||
from .. import constants
|
|
||||||
from ..constants import namespaces
|
|
||||||
from ..utils import moduleFactoryFactory
|
|
||||||
|
|
||||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
|
||||||
|
|
||||||
|
|
||||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|
||||||
ElementTree = ElementTreeImplementation
|
|
||||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
|
||||||
|
|
||||||
class Element(_base.Node):
|
|
||||||
def __init__(self, name, namespace=None):
|
|
||||||
self._name = name
|
|
||||||
self._namespace = namespace
|
|
||||||
self._element = ElementTree.Element(self._getETreeTag(name,
|
|
||||||
namespace))
|
|
||||||
if namespace is None:
|
|
||||||
self.nameTuple = namespaces["html"], self._name
|
|
||||||
else:
|
|
||||||
self.nameTuple = self._namespace, self._name
|
|
||||||
self.parent = None
|
|
||||||
self._childNodes = []
|
|
||||||
self._flags = []
|
|
||||||
|
|
||||||
def _getETreeTag(self, name, namespace):
|
|
||||||
if namespace is None:
|
|
||||||
etree_tag = name
|
|
||||||
else:
|
|
||||||
etree_tag = "{%s}%s" % (namespace, name)
|
|
||||||
return etree_tag
|
|
||||||
|
|
||||||
def _setName(self, name):
|
|
||||||
self._name = name
|
|
||||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
|
||||||
|
|
||||||
def _getName(self):
|
|
||||||
return self._name
|
|
||||||
|
|
||||||
name = property(_getName, _setName)
|
|
||||||
|
|
||||||
def _setNamespace(self, namespace):
|
|
||||||
self._namespace = namespace
|
|
||||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
|
||||||
|
|
||||||
def _getNamespace(self):
|
|
||||||
return self._namespace
|
|
||||||
|
|
||||||
namespace = property(_getNamespace, _setNamespace)
|
|
||||||
|
|
||||||
def _getAttributes(self):
|
|
||||||
return self._element.attrib
|
|
||||||
|
|
||||||
def _setAttributes(self, attributes):
|
|
||||||
# Delete existing attributes first
|
|
||||||
# XXX - there may be a better way to do this...
|
|
||||||
for key in list(self._element.attrib.keys()):
|
|
||||||
del self._element.attrib[key]
|
|
||||||
for key, value in attributes.items():
|
|
||||||
if isinstance(key, tuple):
|
|
||||||
name = "{%s}%s" % (key[2], key[1])
|
|
||||||
else:
|
|
||||||
name = key
|
|
||||||
self._element.set(name, value)
|
|
||||||
|
|
||||||
attributes = property(_getAttributes, _setAttributes)
|
|
||||||
|
|
||||||
def _getChildNodes(self):
|
|
||||||
return self._childNodes
|
|
||||||
|
|
||||||
def _setChildNodes(self, value):
|
|
||||||
del self._element[:]
|
|
||||||
self._childNodes = []
|
|
||||||
for element in value:
|
|
||||||
self.insertChild(element)
|
|
||||||
|
|
||||||
childNodes = property(_getChildNodes, _setChildNodes)
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
"""Return true if the node has children or text"""
|
|
||||||
return bool(self._element.text or len(self._element))
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
self._childNodes.append(node)
|
|
||||||
self._element.append(node._element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
index = list(self._element).index(refNode._element)
|
|
||||||
self._element.insert(index, node._element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
self._element.remove(node._element)
|
|
||||||
node.parent = None
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
if not(len(self._element)):
|
|
||||||
if not self._element.text:
|
|
||||||
self._element.text = ""
|
|
||||||
self._element.text += data
|
|
||||||
elif insertBefore is None:
|
|
||||||
# Insert the text as the tail of the last child element
|
|
||||||
if not self._element[-1].tail:
|
|
||||||
self._element[-1].tail = ""
|
|
||||||
self._element[-1].tail += data
|
|
||||||
else:
|
|
||||||
# Insert the text before the specified node
|
|
||||||
children = list(self._element)
|
|
||||||
index = children.index(insertBefore._element)
|
|
||||||
if index > 0:
|
|
||||||
if not self._element[index - 1].tail:
|
|
||||||
self._element[index - 1].tail = ""
|
|
||||||
self._element[index - 1].tail += data
|
|
||||||
else:
|
|
||||||
if not self._element.text:
|
|
||||||
self._element.text = ""
|
|
||||||
self._element.text += data
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
element = type(self)(self.name, self.namespace)
|
|
||||||
for name, value in self.attributes.items():
|
|
||||||
element.attributes[name] = value
|
|
||||||
return element
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
if newParent.childNodes:
|
|
||||||
newParent.childNodes[-1]._element.tail += self._element.text
|
|
||||||
else:
|
|
||||||
if not newParent._element.text:
|
|
||||||
newParent._element.text = ""
|
|
||||||
if self._element.text is not None:
|
|
||||||
newParent._element.text += self._element.text
|
|
||||||
self._element.text = ""
|
|
||||||
_base.Node.reparentChildren(self, newParent)
|
|
||||||
|
|
||||||
class Comment(Element):
|
|
||||||
def __init__(self, data):
|
|
||||||
# Use the superclass constructor to set all properties on the
|
|
||||||
# wrapper element
|
|
||||||
self._element = ElementTree.Comment(data)
|
|
||||||
self.parent = None
|
|
||||||
self._childNodes = []
|
|
||||||
self._flags = []
|
|
||||||
|
|
||||||
def _getData(self):
|
|
||||||
return self._element.text
|
|
||||||
|
|
||||||
def _setData(self, value):
|
|
||||||
self._element.text = value
|
|
||||||
|
|
||||||
data = property(_getData, _setData)
|
|
||||||
|
|
||||||
class DocumentType(Element):
|
|
||||||
def __init__(self, name, publicId, systemId):
|
|
||||||
Element.__init__(self, "<!DOCTYPE>")
|
|
||||||
self._element.text = name
|
|
||||||
self.publicId = publicId
|
|
||||||
self.systemId = systemId
|
|
||||||
|
|
||||||
def _getPublicId(self):
|
|
||||||
return self._element.get("publicId", "")
|
|
||||||
|
|
||||||
def _setPublicId(self, value):
|
|
||||||
if value is not None:
|
|
||||||
self._element.set("publicId", value)
|
|
||||||
|
|
||||||
publicId = property(_getPublicId, _setPublicId)
|
|
||||||
|
|
||||||
def _getSystemId(self):
|
|
||||||
return self._element.get("systemId", "")
|
|
||||||
|
|
||||||
def _setSystemId(self, value):
|
|
||||||
if value is not None:
|
|
||||||
self._element.set("systemId", value)
|
|
||||||
|
|
||||||
systemId = property(_getSystemId, _setSystemId)
|
|
||||||
|
|
||||||
class Document(Element):
|
|
||||||
def __init__(self):
|
|
||||||
Element.__init__(self, "DOCUMENT_ROOT")
|
|
||||||
|
|
||||||
class DocumentFragment(Element):
|
|
||||||
def __init__(self):
|
|
||||||
Element.__init__(self, "DOCUMENT_FRAGMENT")
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
rv = []
|
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if not(hasattr(element, "tag")):
|
|
||||||
element = element.getroot()
|
|
||||||
if element.tag == "<!DOCTYPE>":
|
|
||||||
if element.get("publicId") or element.get("systemId"):
|
|
||||||
publicId = element.get("publicId") or ""
|
|
||||||
systemId = element.get("systemId") or ""
|
|
||||||
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
|
|
||||||
(element.text, publicId, systemId))
|
|
||||||
else:
|
|
||||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
|
||||||
elif element.tag == "DOCUMENT_ROOT":
|
|
||||||
rv.append("#document")
|
|
||||||
if element.text is not None:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
|
||||||
if element.tail is not None:
|
|
||||||
raise TypeError("Document node cannot have tail")
|
|
||||||
if hasattr(element, "attrib") and len(element.attrib):
|
|
||||||
raise TypeError("Document node cannot have attributes")
|
|
||||||
elif element.tag == ElementTreeCommentType:
|
|
||||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
|
||||||
else:
|
|
||||||
assert isinstance(element.tag, text_type), \
|
|
||||||
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
|
|
||||||
nsmatch = tag_regexp.match(element.tag)
|
|
||||||
|
|
||||||
if nsmatch is None:
|
|
||||||
name = element.tag
|
|
||||||
else:
|
|
||||||
ns, name = nsmatch.groups()
|
|
||||||
prefix = constants.prefixes[ns]
|
|
||||||
name = "%s %s" % (prefix, name)
|
|
||||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
|
||||||
|
|
||||||
if hasattr(element, "attrib"):
|
|
||||||
attributes = []
|
|
||||||
for name, value in element.attrib.items():
|
|
||||||
nsmatch = tag_regexp.match(name)
|
|
||||||
if nsmatch is not None:
|
|
||||||
ns, name = nsmatch.groups()
|
|
||||||
prefix = constants.prefixes[ns]
|
|
||||||
attr_string = "%s %s" % (prefix, name)
|
|
||||||
else:
|
|
||||||
attr_string = name
|
|
||||||
attributes.append((attr_string, value))
|
|
||||||
|
|
||||||
for name, value in sorted(attributes):
|
|
||||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
|
||||||
if element.text:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
|
||||||
indent += 2
|
|
||||||
for child in element:
|
|
||||||
serializeElement(child, indent)
|
|
||||||
if element.tail:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
||||||
|
|
||||||
def tostring(element):
|
|
||||||
"""Serialize an element and its child nodes to a string"""
|
|
||||||
rv = []
|
|
||||||
filter = ihatexml.InfosetFilter()
|
|
||||||
|
|
||||||
def serializeElement(element):
|
|
||||||
if isinstance(element, ElementTree.ElementTree):
|
|
||||||
element = element.getroot()
|
|
||||||
|
|
||||||
if element.tag == "<!DOCTYPE>":
|
|
||||||
if element.get("publicId") or element.get("systemId"):
|
|
||||||
publicId = element.get("publicId") or ""
|
|
||||||
systemId = element.get("systemId") or ""
|
|
||||||
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
|
|
||||||
(element.text, publicId, systemId))
|
|
||||||
else:
|
|
||||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
|
||||||
elif element.tag == "DOCUMENT_ROOT":
|
|
||||||
if element.text is not None:
|
|
||||||
rv.append(element.text)
|
|
||||||
if element.tail is not None:
|
|
||||||
raise TypeError("Document node cannot have tail")
|
|
||||||
if hasattr(element, "attrib") and len(element.attrib):
|
|
||||||
raise TypeError("Document node cannot have attributes")
|
|
||||||
|
|
||||||
for child in element:
|
|
||||||
serializeElement(child)
|
|
||||||
|
|
||||||
elif element.tag == ElementTreeCommentType:
|
|
||||||
rv.append("<!--%s-->" % (element.text,))
|
|
||||||
else:
|
|
||||||
# This is assumed to be an ordinary element
|
|
||||||
if not element.attrib:
|
|
||||||
rv.append("<%s>" % (filter.fromXmlName(element.tag),))
|
|
||||||
else:
|
|
||||||
attr = " ".join(["%s=\"%s\"" % (
|
|
||||||
filter.fromXmlName(name), value)
|
|
||||||
for name, value in element.attrib.items()])
|
|
||||||
rv.append("<%s %s>" % (element.tag, attr))
|
|
||||||
if element.text:
|
|
||||||
rv.append(element.text)
|
|
||||||
|
|
||||||
for child in element:
|
|
||||||
serializeElement(child)
|
|
||||||
|
|
||||||
rv.append("</%s>" % (element.tag,))
|
|
||||||
|
|
||||||
if element.tail:
|
|
||||||
rv.append(element.tail)
|
|
||||||
|
|
||||||
serializeElement(element)
|
|
||||||
|
|
||||||
return "".join(rv)
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
documentClass = Document
|
|
||||||
doctypeClass = DocumentType
|
|
||||||
elementClass = Element
|
|
||||||
commentClass = Comment
|
|
||||||
fragmentClass = DocumentFragment
|
|
||||||
implementation = ElementTreeImplementation
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
if fullTree:
|
|
||||||
return self.document._element
|
|
||||||
else:
|
|
||||||
if self.defaultNamespace is not None:
|
|
||||||
return self.document._element.find(
|
|
||||||
"{%s}html" % self.defaultNamespace)
|
|
||||||
else:
|
|
||||||
return self.document._element.find("html")
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
return _base.TreeBuilder.getFragment(self)._element
|
|
||||||
|
|
||||||
return locals()
|
|
||||||
|
|
||||||
|
|
||||||
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
|
@ -1,374 +0,0 @@
|
|||||||
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
|
||||||
of the native library as possible, without using fragile hacks like custom element
|
|
||||||
names that break between releases. The downside of this is that we cannot represent
|
|
||||||
all possible trees; specifically the following are known to cause problems:
|
|
||||||
|
|
||||||
Text or comments as siblings of the root element
|
|
||||||
Docypes with no name
|
|
||||||
|
|
||||||
When any of these things occur, we emit a DataLossWarning
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
import warnings
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from ..constants import DataLossWarning
|
|
||||||
from .. import constants
|
|
||||||
from . import etree as etree_builders
|
|
||||||
from .. import ihatexml
|
|
||||||
|
|
||||||
import lxml.etree as etree
|
|
||||||
|
|
||||||
|
|
||||||
fullTree = True
|
|
||||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
|
||||||
|
|
||||||
comment_type = etree.Comment("asd").tag
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentType(object):
|
|
||||||
def __init__(self, name, publicId, systemId):
|
|
||||||
self.name = name
|
|
||||||
self.publicId = publicId
|
|
||||||
self.systemId = systemId
|
|
||||||
|
|
||||||
|
|
||||||
class Document(object):
|
|
||||||
def __init__(self):
|
|
||||||
self._elementTree = None
|
|
||||||
self._childNodes = []
|
|
||||||
|
|
||||||
def appendChild(self, element):
|
|
||||||
self._elementTree.getroot().addnext(element._element)
|
|
||||||
|
|
||||||
def _getChildNodes(self):
|
|
||||||
return self._childNodes
|
|
||||||
|
|
||||||
childNodes = property(_getChildNodes)
|
|
||||||
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
rv = []
|
|
||||||
finalText = None
|
|
||||||
infosetFilter = ihatexml.InfosetFilter()
|
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if not hasattr(element, "tag"):
|
|
||||||
if hasattr(element, "getroot"):
|
|
||||||
# Full tree case
|
|
||||||
rv.append("#document")
|
|
||||||
if element.docinfo.internalDTD:
|
|
||||||
if not (element.docinfo.public_id or
|
|
||||||
element.docinfo.system_url):
|
|
||||||
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
|
|
||||||
else:
|
|
||||||
dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
|
|
||||||
element.docinfo.root_name,
|
|
||||||
element.docinfo.public_id,
|
|
||||||
element.docinfo.system_url)
|
|
||||||
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
|
|
||||||
next_element = element.getroot()
|
|
||||||
while next_element.getprevious() is not None:
|
|
||||||
next_element = next_element.getprevious()
|
|
||||||
while next_element is not None:
|
|
||||||
serializeElement(next_element, indent + 2)
|
|
||||||
next_element = next_element.getnext()
|
|
||||||
elif isinstance(element, str) or isinstance(element, bytes):
|
|
||||||
# Text in a fragment
|
|
||||||
assert isinstance(element, str) or sys.version_info.major == 2
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
|
||||||
else:
|
|
||||||
# Fragment case
|
|
||||||
rv.append("#document-fragment")
|
|
||||||
for next_element in element:
|
|
||||||
serializeElement(next_element, indent + 2)
|
|
||||||
elif element.tag == comment_type:
|
|
||||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
|
||||||
if hasattr(element, "tail") and element.tail:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
|
|
||||||
else:
|
|
||||||
assert isinstance(element, etree._Element)
|
|
||||||
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
|
||||||
if nsmatch is not None:
|
|
||||||
ns = nsmatch.group(1)
|
|
||||||
tag = nsmatch.group(2)
|
|
||||||
prefix = constants.prefixes[ns]
|
|
||||||
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
|
|
||||||
infosetFilter.fromXmlName(tag)))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<%s>" % (' ' * indent,
|
|
||||||
infosetFilter.fromXmlName(element.tag)))
|
|
||||||
|
|
||||||
if hasattr(element, "attrib"):
|
|
||||||
attributes = []
|
|
||||||
for name, value in element.attrib.items():
|
|
||||||
nsmatch = tag_regexp.match(name)
|
|
||||||
if nsmatch is not None:
|
|
||||||
ns, name = nsmatch.groups()
|
|
||||||
name = infosetFilter.fromXmlName(name)
|
|
||||||
prefix = constants.prefixes[ns]
|
|
||||||
attr_string = "%s %s" % (prefix, name)
|
|
||||||
else:
|
|
||||||
attr_string = infosetFilter.fromXmlName(name)
|
|
||||||
attributes.append((attr_string, value))
|
|
||||||
|
|
||||||
for name, value in sorted(attributes):
|
|
||||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
|
||||||
|
|
||||||
if element.text:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
|
||||||
indent += 2
|
|
||||||
for child in element:
|
|
||||||
serializeElement(child, indent)
|
|
||||||
if hasattr(element, "tail") and element.tail:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
||||||
|
|
||||||
|
|
||||||
def tostring(element):
|
|
||||||
"""Serialize an element and its child nodes to a string"""
|
|
||||||
rv = []
|
|
||||||
finalText = None
|
|
||||||
|
|
||||||
def serializeElement(element):
|
|
||||||
if not hasattr(element, "tag"):
|
|
||||||
if element.docinfo.internalDTD:
|
|
||||||
if element.docinfo.doctype:
|
|
||||||
dtd_str = element.docinfo.doctype
|
|
||||||
else:
|
|
||||||
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
|
|
||||||
rv.append(dtd_str)
|
|
||||||
serializeElement(element.getroot())
|
|
||||||
|
|
||||||
elif element.tag == comment_type:
|
|
||||||
rv.append("<!--%s-->" % (element.text,))
|
|
||||||
|
|
||||||
else:
|
|
||||||
# This is assumed to be an ordinary element
|
|
||||||
if not element.attrib:
|
|
||||||
rv.append("<%s>" % (element.tag,))
|
|
||||||
else:
|
|
||||||
attr = " ".join(["%s=\"%s\"" % (name, value)
|
|
||||||
for name, value in element.attrib.items()])
|
|
||||||
rv.append("<%s %s>" % (element.tag, attr))
|
|
||||||
if element.text:
|
|
||||||
rv.append(element.text)
|
|
||||||
|
|
||||||
for child in element:
|
|
||||||
serializeElement(child)
|
|
||||||
|
|
||||||
rv.append("</%s>" % (element.tag,))
|
|
||||||
|
|
||||||
if hasattr(element, "tail") and element.tail:
|
|
||||||
rv.append(element.tail)
|
|
||||||
|
|
||||||
serializeElement(element)
|
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("%s\"" % (' ' * 2, finalText))
|
|
||||||
|
|
||||||
return "".join(rv)
|
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
documentClass = Document
|
|
||||||
doctypeClass = DocumentType
|
|
||||||
elementClass = None
|
|
||||||
commentClass = None
|
|
||||||
fragmentClass = Document
|
|
||||||
implementation = etree
|
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements, fullTree=False):
|
|
||||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
|
||||||
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
|
|
||||||
self.namespaceHTMLElements = namespaceHTMLElements
|
|
||||||
|
|
||||||
class Attributes(dict):
|
|
||||||
def __init__(self, element, value={}):
|
|
||||||
self._element = element
|
|
||||||
dict.__init__(self, value)
|
|
||||||
for key, value in self.items():
|
|
||||||
if isinstance(key, tuple):
|
|
||||||
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
|
||||||
else:
|
|
||||||
name = infosetFilter.coerceAttribute(key)
|
|
||||||
self._element._element.attrib[name] = value
|
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
|
||||||
dict.__setitem__(self, key, value)
|
|
||||||
if isinstance(key, tuple):
|
|
||||||
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
|
||||||
else:
|
|
||||||
name = infosetFilter.coerceAttribute(key)
|
|
||||||
self._element._element.attrib[name] = value
|
|
||||||
|
|
||||||
class Element(builder.Element):
|
|
||||||
def __init__(self, name, namespace):
|
|
||||||
name = infosetFilter.coerceElement(name)
|
|
||||||
builder.Element.__init__(self, name, namespace=namespace)
|
|
||||||
self._attributes = Attributes(self)
|
|
||||||
|
|
||||||
def _setName(self, name):
|
|
||||||
self._name = infosetFilter.coerceElement(name)
|
|
||||||
self._element.tag = self._getETreeTag(
|
|
||||||
self._name, self._namespace)
|
|
||||||
|
|
||||||
def _getName(self):
|
|
||||||
return infosetFilter.fromXmlName(self._name)
|
|
||||||
|
|
||||||
name = property(_getName, _setName)
|
|
||||||
|
|
||||||
def _getAttributes(self):
|
|
||||||
return self._attributes
|
|
||||||
|
|
||||||
def _setAttributes(self, attributes):
|
|
||||||
self._attributes = Attributes(self, attributes)
|
|
||||||
|
|
||||||
attributes = property(_getAttributes, _setAttributes)
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
data = infosetFilter.coerceCharacters(data)
|
|
||||||
builder.Element.insertText(self, data, insertBefore)
|
|
||||||
|
|
||||||
def appendChild(self, child):
|
|
||||||
builder.Element.appendChild(self, child)
|
|
||||||
|
|
||||||
class Comment(builder.Comment):
|
|
||||||
def __init__(self, data):
|
|
||||||
data = infosetFilter.coerceComment(data)
|
|
||||||
builder.Comment.__init__(self, data)
|
|
||||||
|
|
||||||
def _setData(self, data):
|
|
||||||
data = infosetFilter.coerceComment(data)
|
|
||||||
self._element.text = data
|
|
||||||
|
|
||||||
def _getData(self):
|
|
||||||
return self._element.text
|
|
||||||
|
|
||||||
data = property(_getData, _setData)
|
|
||||||
|
|
||||||
self.elementClass = Element
|
|
||||||
self.commentClass = builder.Comment
|
|
||||||
# self.fragmentClass = builder.DocumentFragment
|
|
||||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
_base.TreeBuilder.reset(self)
|
|
||||||
self.insertComment = self.insertCommentInitial
|
|
||||||
self.initial_comments = []
|
|
||||||
self.doctype = None
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
if fullTree:
|
|
||||||
return self.document._elementTree
|
|
||||||
else:
|
|
||||||
return self.document._elementTree.getroot()
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
fragment = []
|
|
||||||
element = self.openElements[0]._element
|
|
||||||
if element.text:
|
|
||||||
fragment.append(element.text)
|
|
||||||
fragment.extend(list(element))
|
|
||||||
if element.tail:
|
|
||||||
fragment.append(element.tail)
|
|
||||||
return fragment
|
|
||||||
|
|
||||||
def insertDoctype(self, token):
|
|
||||||
name = token["name"]
|
|
||||||
publicId = token["publicId"]
|
|
||||||
systemId = token["systemId"]
|
|
||||||
|
|
||||||
if not name:
|
|
||||||
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
|
|
||||||
self.doctype = None
|
|
||||||
else:
|
|
||||||
coercedName = self.infosetFilter.coerceElement(name)
|
|
||||||
if coercedName != name:
|
|
||||||
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
|
|
||||||
|
|
||||||
doctype = self.doctypeClass(coercedName, publicId, systemId)
|
|
||||||
self.doctype = doctype
|
|
||||||
|
|
||||||
def insertCommentInitial(self, data, parent=None):
|
|
||||||
self.initial_comments.append(data)
|
|
||||||
|
|
||||||
def insertCommentMain(self, data, parent=None):
|
|
||||||
if (parent == self.document and
|
|
||||||
self.document._elementTree.getroot()[-1].tag == comment_type):
|
|
||||||
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
|
|
||||||
if data['data']:
|
|
||||||
# lxml cannot handle comment text that contains -- or endswith -
|
|
||||||
# Should really check if changes happened and issue a data loss
|
|
||||||
# warning, but that's a fairly big performance hit.
|
|
||||||
data['data'] = data['data'].replace('--', '\u2010\u2010').rstrip('-')
|
|
||||||
super(TreeBuilder, self).insertComment(data, parent)
|
|
||||||
|
|
||||||
def insertRoot(self, token):
|
|
||||||
"""Create the document root"""
|
|
||||||
# Because of the way libxml2 works, it doesn't seem to be possible to
|
|
||||||
# alter information like the doctype after the tree has been parsed.
|
|
||||||
# Therefore we need to use the built-in parser to create our iniial
|
|
||||||
# tree, after which we can add elements like normal
|
|
||||||
docStr = ""
|
|
||||||
if self.doctype:
|
|
||||||
assert self.doctype.name
|
|
||||||
docStr += "<!DOCTYPE %s" % self.doctype.name
|
|
||||||
if (self.doctype.publicId is not None or
|
|
||||||
self.doctype.systemId is not None):
|
|
||||||
docStr += (' PUBLIC "%s" ' %
|
|
||||||
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
|
|
||||||
if self.doctype.systemId:
|
|
||||||
sysid = self.doctype.systemId
|
|
||||||
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
|
|
||||||
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
|
|
||||||
sysid = sysid.replace("'", 'U00027')
|
|
||||||
if sysid.find("'") >= 0:
|
|
||||||
docStr += '"%s"' % sysid
|
|
||||||
else:
|
|
||||||
docStr += "'%s'" % sysid
|
|
||||||
else:
|
|
||||||
docStr += "''"
|
|
||||||
docStr += ">"
|
|
||||||
if self.doctype.name != token["name"]:
|
|
||||||
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
|
|
||||||
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
|
||||||
root = etree.fromstring(docStr)
|
|
||||||
|
|
||||||
# Append the initial comments:
|
|
||||||
for comment_token in self.initial_comments:
|
|
||||||
root.addprevious(etree.Comment(comment_token["data"]))
|
|
||||||
|
|
||||||
# Create the root document and add the ElementTree to it
|
|
||||||
self.document = self.documentClass()
|
|
||||||
self.document._elementTree = root.getroottree()
|
|
||||||
|
|
||||||
# Give the root element the right name
|
|
||||||
name = token["name"]
|
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
|
||||||
if namespace is None:
|
|
||||||
etree_tag = name
|
|
||||||
else:
|
|
||||||
etree_tag = "{%s}%s" % (namespace, name)
|
|
||||||
root.tag = etree_tag
|
|
||||||
|
|
||||||
# Add the root element to the internal child/open data structures
|
|
||||||
root_element = self.elementClass(name, namespace)
|
|
||||||
root_element._element = root
|
|
||||||
self.document._childNodes.append(root_element)
|
|
||||||
self.openElements.append(root_element)
|
|
||||||
|
|
||||||
# Reset to the default insert comment function
|
|
||||||
self.insertComment = self.insertCommentMain
|
|
@ -1,147 +0,0 @@
|
|||||||
"""A collection of modules for iterating through different kinds of
|
|
||||||
tree, generating tokens identical to those produced by the tokenizer
|
|
||||||
module.
|
|
||||||
|
|
||||||
To create a tree walker for a new type of tree, you need to do
|
|
||||||
implement a tree walker object (called TreeWalker by convention) that
|
|
||||||
implements a 'serialize' method taking a tree as sole argument and
|
|
||||||
returning an iterator generating tokens.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
|
|
||||||
"pulldom"]
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from .. import constants
|
|
||||||
from ..utils import default_etree
|
|
||||||
|
|
||||||
treeWalkerCache = {}
|
|
||||||
|
|
||||||
|
|
||||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
|
||||||
"""Get a TreeWalker class for various types of tree with built-in support
|
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
|
||||||
values are:
|
|
||||||
|
|
||||||
"dom" - The xml.dom.minidom DOM implementation
|
|
||||||
"pulldom" - The xml.dom.pulldom event stream
|
|
||||||
"etree" - A generic walker for tree implementations exposing an
|
|
||||||
elementtree-like interface (known to work with
|
|
||||||
ElementTree, cElementTree and lxml.etree).
|
|
||||||
"lxml" - Optimized walker for lxml.etree
|
|
||||||
"genshi" - a Genshi stream
|
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" tree type only). A module
|
|
||||||
implementing the tree type e.g. xml.etree.ElementTree or
|
|
||||||
cElementTree."""
|
|
||||||
|
|
||||||
treeType = treeType.lower()
|
|
||||||
if treeType not in treeWalkerCache:
|
|
||||||
if treeType in ("dom", "pulldom"):
|
|
||||||
name = "%s.%s" % (__name__, treeType)
|
|
||||||
__import__(name)
|
|
||||||
mod = sys.modules[name]
|
|
||||||
treeWalkerCache[treeType] = mod.TreeWalker
|
|
||||||
elif treeType == "genshi":
|
|
||||||
from . import genshistream
|
|
||||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
|
||||||
elif treeType == "lxml":
|
|
||||||
from . import lxmletree
|
|
||||||
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
|
||||||
elif treeType == "etree":
|
|
||||||
from . import etree
|
|
||||||
if implementation is None:
|
|
||||||
implementation = default_etree
|
|
||||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
|
||||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
|
||||||
return treeWalkerCache.get(treeType)
|
|
||||||
|
|
||||||
|
|
||||||
def concatenateCharacterTokens(tokens):
|
|
||||||
pendingCharacters = []
|
|
||||||
for token in tokens:
|
|
||||||
type = token["type"]
|
|
||||||
if type in ("Characters", "SpaceCharacters"):
|
|
||||||
pendingCharacters.append(token["data"])
|
|
||||||
else:
|
|
||||||
if pendingCharacters:
|
|
||||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
|
||||||
pendingCharacters = []
|
|
||||||
yield token
|
|
||||||
if pendingCharacters:
|
|
||||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
|
||||||
|
|
||||||
|
|
||||||
def pprint(walker):
|
|
||||||
"""Pretty printer for tree walkers"""
|
|
||||||
output = []
|
|
||||||
indent = 0
|
|
||||||
for token in concatenateCharacterTokens(walker):
|
|
||||||
type = token["type"]
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
# tag name
|
|
||||||
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
|
|
||||||
if token["namespace"] in constants.prefixes:
|
|
||||||
ns = constants.prefixes[token["namespace"]]
|
|
||||||
else:
|
|
||||||
ns = token["namespace"]
|
|
||||||
name = "%s %s" % (ns, token["name"])
|
|
||||||
else:
|
|
||||||
name = token["name"]
|
|
||||||
output.append("%s<%s>" % (" " * indent, name))
|
|
||||||
indent += 2
|
|
||||||
# attributes (sorted for consistent ordering)
|
|
||||||
attrs = token["data"]
|
|
||||||
for (namespace, localname), value in sorted(attrs.items()):
|
|
||||||
if namespace:
|
|
||||||
if namespace in constants.prefixes:
|
|
||||||
ns = constants.prefixes[namespace]
|
|
||||||
else:
|
|
||||||
ns = namespace
|
|
||||||
name = "%s %s" % (ns, localname)
|
|
||||||
else:
|
|
||||||
name = localname
|
|
||||||
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
|
|
||||||
# self-closing
|
|
||||||
if type == "EmptyTag":
|
|
||||||
indent -= 2
|
|
||||||
|
|
||||||
elif type == "EndTag":
|
|
||||||
indent -= 2
|
|
||||||
|
|
||||||
elif type == "Comment":
|
|
||||||
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
|
|
||||||
|
|
||||||
elif type == "Doctype":
|
|
||||||
if token["name"]:
|
|
||||||
if token["publicId"]:
|
|
||||||
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
|
|
||||||
(" " * indent,
|
|
||||||
token["name"],
|
|
||||||
token["publicId"],
|
|
||||||
token["systemId"] if token["systemId"] else ""))
|
|
||||||
elif token["systemId"]:
|
|
||||||
output.append("""%s<!DOCTYPE %s "" "%s">""" %
|
|
||||||
(" " * indent,
|
|
||||||
token["name"],
|
|
||||||
token["systemId"]))
|
|
||||||
else:
|
|
||||||
output.append("%s<!DOCTYPE %s>" % (" " * indent,
|
|
||||||
token["name"]))
|
|
||||||
else:
|
|
||||||
output.append("%s<!DOCTYPE >" % (" " * indent,))
|
|
||||||
|
|
||||||
elif type == "Characters":
|
|
||||||
output.append("%s\"%s\"" % (" " * indent, token["data"]))
|
|
||||||
|
|
||||||
elif type == "SpaceCharacters":
|
|
||||||
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown token type, %s" % type)
|
|
||||||
|
|
||||||
return "\n".join(output)
|
|
@ -1,205 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
string_types = basestring,
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
string_types = str,
|
|
||||||
|
|
||||||
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
|
|
||||||
"TreeWalker", "NonRecursiveTreeWalker"]
|
|
||||||
|
|
||||||
from xml.dom import Node
|
|
||||||
|
|
||||||
DOCUMENT = Node.DOCUMENT_NODE
|
|
||||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
|
||||||
TEXT = Node.TEXT_NODE
|
|
||||||
ELEMENT = Node.ELEMENT_NODE
|
|
||||||
COMMENT = Node.COMMENT_NODE
|
|
||||||
ENTITY = Node.ENTITY_NODE
|
|
||||||
UNKNOWN = "<#UNKNOWN#>"
|
|
||||||
|
|
||||||
from ..constants import voidElements, spaceCharacters
|
|
||||||
spaceCharacters = "".join(spaceCharacters)
|
|
||||||
|
|
||||||
|
|
||||||
def to_text(s, blank_if_none=True):
|
|
||||||
"""Wrapper around six.text_type to convert None to empty string"""
|
|
||||||
if s is None:
|
|
||||||
if blank_if_none:
|
|
||||||
return ""
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
elif isinstance(s, text_type):
|
|
||||||
return s
|
|
||||||
else:
|
|
||||||
return text_type(s)
|
|
||||||
|
|
||||||
|
|
||||||
def is_text_or_none(string):
|
|
||||||
"""Wrapper around isinstance(string_types) or is None"""
|
|
||||||
return string is None or isinstance(string, string_types)
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(object):
|
|
||||||
def __init__(self, tree):
|
|
||||||
self.tree = tree
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def error(self, msg):
|
|
||||||
return {"type": "SerializeError", "data": msg}
|
|
||||||
|
|
||||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
|
||||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
|
||||||
assert isinstance(name, string_types), type(name)
|
|
||||||
assert all((namespace is None or isinstance(namespace, string_types)) and
|
|
||||||
isinstance(name, string_types) and
|
|
||||||
isinstance(value, string_types)
|
|
||||||
for (namespace, name), value in attrs.items())
|
|
||||||
|
|
||||||
yield {"type": "EmptyTag", "name": to_text(name, False),
|
|
||||||
"namespace": to_text(namespace),
|
|
||||||
"data": attrs}
|
|
||||||
if hasChildren:
|
|
||||||
yield self.error("Void element has children")
|
|
||||||
|
|
||||||
def startTag(self, namespace, name, attrs):
|
|
||||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
|
||||||
assert isinstance(name, string_types), type(name)
|
|
||||||
assert all((namespace is None or isinstance(namespace, string_types)) and
|
|
||||||
isinstance(name, string_types) and
|
|
||||||
isinstance(value, string_types)
|
|
||||||
for (namespace, name), value in attrs.items())
|
|
||||||
|
|
||||||
return {"type": "StartTag",
|
|
||||||
"name": text_type(name),
|
|
||||||
"namespace": to_text(namespace),
|
|
||||||
"data": dict(((to_text(namespace, False), to_text(name)),
|
|
||||||
to_text(value, False))
|
|
||||||
for (namespace, name), value in attrs.items())}
|
|
||||||
|
|
||||||
def endTag(self, namespace, name):
|
|
||||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
|
||||||
assert isinstance(name, string_types), type(namespace)
|
|
||||||
|
|
||||||
return {"type": "EndTag",
|
|
||||||
"name": to_text(name, False),
|
|
||||||
"namespace": to_text(namespace),
|
|
||||||
"data": {}}
|
|
||||||
|
|
||||||
def text(self, data):
|
|
||||||
assert isinstance(data, string_types), type(data)
|
|
||||||
|
|
||||||
data = to_text(data)
|
|
||||||
middle = data.lstrip(spaceCharacters)
|
|
||||||
left = data[:len(data) - len(middle)]
|
|
||||||
if left:
|
|
||||||
yield {"type": "SpaceCharacters", "data": left}
|
|
||||||
data = middle
|
|
||||||
middle = data.rstrip(spaceCharacters)
|
|
||||||
right = data[len(middle):]
|
|
||||||
if middle:
|
|
||||||
yield {"type": "Characters", "data": middle}
|
|
||||||
if right:
|
|
||||||
yield {"type": "SpaceCharacters", "data": right}
|
|
||||||
|
|
||||||
def comment(self, data):
|
|
||||||
assert isinstance(data, string_types), type(data)
|
|
||||||
|
|
||||||
return {"type": "Comment", "data": text_type(data)}
|
|
||||||
|
|
||||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
|
||||||
assert is_text_or_none(name), type(name)
|
|
||||||
assert is_text_or_none(publicId), type(publicId)
|
|
||||||
assert is_text_or_none(systemId), type(systemId)
|
|
||||||
|
|
||||||
return {"type": "Doctype",
|
|
||||||
"name": to_text(name),
|
|
||||||
"publicId": to_text(publicId),
|
|
||||||
"systemId": to_text(systemId),
|
|
||||||
"correct": to_text(correct)}
|
|
||||||
|
|
||||||
def entity(self, name):
|
|
||||||
assert isinstance(name, string_types), type(name)
|
|
||||||
|
|
||||||
return {"type": "Entity", "name": text_type(name)}
|
|
||||||
|
|
||||||
def unknown(self, nodeType):
|
|
||||||
return self.error("Unknown node type: " + nodeType)
|
|
||||||
|
|
||||||
|
|
||||||
class NonRecursiveTreeWalker(TreeWalker):
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
currentNode = self.tree
|
|
||||||
while currentNode is not None:
|
|
||||||
details = self.getNodeDetails(currentNode)
|
|
||||||
type, details = details[0], details[1:]
|
|
||||||
hasChildren = False
|
|
||||||
|
|
||||||
if type == DOCTYPE:
|
|
||||||
yield self.doctype(*details)
|
|
||||||
|
|
||||||
elif type == TEXT:
|
|
||||||
for token in self.text(*details):
|
|
||||||
yield token
|
|
||||||
|
|
||||||
elif type == ELEMENT:
|
|
||||||
namespace, name, attributes, hasChildren = details
|
|
||||||
if name in voidElements:
|
|
||||||
for token in self.emptyTag(namespace, name, attributes,
|
|
||||||
hasChildren):
|
|
||||||
yield token
|
|
||||||
hasChildren = False
|
|
||||||
else:
|
|
||||||
yield self.startTag(namespace, name, attributes)
|
|
||||||
|
|
||||||
elif type == COMMENT:
|
|
||||||
yield self.comment(details[0])
|
|
||||||
|
|
||||||
elif type == ENTITY:
|
|
||||||
yield self.entity(details[0])
|
|
||||||
|
|
||||||
elif type == DOCUMENT:
|
|
||||||
hasChildren = True
|
|
||||||
|
|
||||||
else:
|
|
||||||
yield self.unknown(details[0])
|
|
||||||
|
|
||||||
if hasChildren:
|
|
||||||
firstChild = self.getFirstChild(currentNode)
|
|
||||||
else:
|
|
||||||
firstChild = None
|
|
||||||
|
|
||||||
if firstChild is not None:
|
|
||||||
currentNode = firstChild
|
|
||||||
else:
|
|
||||||
while currentNode is not None:
|
|
||||||
details = self.getNodeDetails(currentNode)
|
|
||||||
type, details = details[0], details[1:]
|
|
||||||
if type == ELEMENT:
|
|
||||||
namespace, name, attributes, hasChildren = details
|
|
||||||
if name not in voidElements:
|
|
||||||
yield self.endTag(namespace, name)
|
|
||||||
if self.tree is currentNode:
|
|
||||||
currentNode = None
|
|
||||||
break
|
|
||||||
nextSibling = self.getNextSibling(currentNode)
|
|
||||||
if nextSibling is not None:
|
|
||||||
currentNode = nextSibling
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
currentNode = self.getParentNode(currentNode)
|
|
@ -1,43 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from xml.dom import Node
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
|
||||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
|
||||||
|
|
||||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
|
||||||
return _base.TEXT, node.nodeValue
|
|
||||||
|
|
||||||
elif node.nodeType == Node.ELEMENT_NODE:
|
|
||||||
attrs = {}
|
|
||||||
for attr in list(node.attributes.keys()):
|
|
||||||
attr = node.getAttributeNode(attr)
|
|
||||||
if attr.namespaceURI:
|
|
||||||
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
|
||||||
else:
|
|
||||||
attrs[(None, attr.name)] = attr.value
|
|
||||||
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
|
||||||
attrs, node.hasChildNodes())
|
|
||||||
|
|
||||||
elif node.nodeType == Node.COMMENT_NODE:
|
|
||||||
return _base.COMMENT, node.nodeValue
|
|
||||||
|
|
||||||
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
|
||||||
return (_base.DOCUMENT,)
|
|
||||||
|
|
||||||
else:
|
|
||||||
return _base.UNKNOWN, node.nodeType
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
return node.firstChild
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
return node.nextSibling
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
return node.parentNode
|
|
@ -1,140 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
try:
|
|
||||||
from collections import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
try:
|
|
||||||
from ordereddict import OrderedDict
|
|
||||||
except ImportError:
|
|
||||||
OrderedDict = dict
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
try:
|
|
||||||
unicode
|
|
||||||
string_types = basestring,
|
|
||||||
except NameError:
|
|
||||||
string_types = str,
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
from ..utils import moduleFactoryFactory
|
|
||||||
|
|
||||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
|
||||||
|
|
||||||
|
|
||||||
def getETreeBuilder(ElementTreeImplementation):
|
|
||||||
ElementTree = ElementTreeImplementation
|
|
||||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
|
||||||
"""Given the particular ElementTree representation, this implementation,
|
|
||||||
to avoid using recursion, returns "nodes" as tuples with the following
|
|
||||||
content:
|
|
||||||
|
|
||||||
1. The current element
|
|
||||||
|
|
||||||
2. The index of the element relative to its parent
|
|
||||||
|
|
||||||
3. A stack of ancestor elements
|
|
||||||
|
|
||||||
4. A flag "text", "tail" or None to indicate if the current node is a
|
|
||||||
text node; either the text or tail of the current element (1)
|
|
||||||
"""
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
if isinstance(node, tuple): # It might be the root Element
|
|
||||||
elt, key, parents, flag = node
|
|
||||||
if flag in ("text", "tail"):
|
|
||||||
return _base.TEXT, getattr(elt, flag)
|
|
||||||
else:
|
|
||||||
node = elt
|
|
||||||
|
|
||||||
if not(hasattr(node, "tag")):
|
|
||||||
node = node.getroot()
|
|
||||||
|
|
||||||
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
|
|
||||||
return (_base.DOCUMENT,)
|
|
||||||
|
|
||||||
elif node.tag == "<!DOCTYPE>":
|
|
||||||
return (_base.DOCTYPE, node.text,
|
|
||||||
node.get("publicId"), node.get("systemId"))
|
|
||||||
|
|
||||||
elif node.tag == ElementTreeCommentType:
|
|
||||||
return _base.COMMENT, node.text
|
|
||||||
|
|
||||||
else:
|
|
||||||
assert isinstance(node.tag, string_types), type(node.tag)
|
|
||||||
# This is assumed to be an ordinary element
|
|
||||||
match = tag_regexp.match(node.tag)
|
|
||||||
if match:
|
|
||||||
namespace, tag = match.groups()
|
|
||||||
else:
|
|
||||||
namespace = None
|
|
||||||
tag = node.tag
|
|
||||||
attrs = OrderedDict()
|
|
||||||
for name, value in list(node.attrib.items()):
|
|
||||||
match = tag_regexp.match(name)
|
|
||||||
if match:
|
|
||||||
attrs[(match.group(1), match.group(2))] = value
|
|
||||||
else:
|
|
||||||
attrs[(None, name)] = value
|
|
||||||
return (_base.ELEMENT, namespace, tag,
|
|
||||||
attrs, len(node) or node.text)
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
if isinstance(node, tuple):
|
|
||||||
element, key, parents, flag = node
|
|
||||||
else:
|
|
||||||
element, key, parents, flag = node, None, [], None
|
|
||||||
|
|
||||||
if flag in ("text", "tail"):
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
if element.text:
|
|
||||||
return element, key, parents, "text"
|
|
||||||
elif len(element):
|
|
||||||
parents.append(element)
|
|
||||||
return element[0], 0, parents, None
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
if isinstance(node, tuple):
|
|
||||||
element, key, parents, flag = node
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if flag == "text":
|
|
||||||
if len(element):
|
|
||||||
parents.append(element)
|
|
||||||
return element[0], 0, parents, None
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
if element.tail and flag != "tail":
|
|
||||||
return element, key, parents, "tail"
|
|
||||||
elif key < len(parents[-1]) - 1:
|
|
||||||
return parents[-1][key + 1], key + 1, parents, None
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
if isinstance(node, tuple):
|
|
||||||
element, key, parents, flag = node
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if flag == "text":
|
|
||||||
if not parents:
|
|
||||||
return element
|
|
||||||
else:
|
|
||||||
return element, key, parents, None
|
|
||||||
else:
|
|
||||||
parent = parents.pop()
|
|
||||||
if not parents:
|
|
||||||
return parent
|
|
||||||
else:
|
|
||||||
return parent, list(parents[-1]).index(parent), parents, None
|
|
||||||
|
|
||||||
return locals()
|
|
||||||
|
|
||||||
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
|
@ -1,69 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from genshi.core import QName
|
|
||||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
|
||||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
from ..constants import voidElements, namespaces
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(_base.TreeWalker):
|
|
||||||
def __iter__(self):
|
|
||||||
# Buffer the events so we can pass in the following one
|
|
||||||
previous = None
|
|
||||||
for event in self.tree:
|
|
||||||
if previous is not None:
|
|
||||||
for token in self.tokens(previous, event):
|
|
||||||
yield token
|
|
||||||
previous = event
|
|
||||||
|
|
||||||
# Don't forget the final event!
|
|
||||||
if previous is not None:
|
|
||||||
for token in self.tokens(previous, None):
|
|
||||||
yield token
|
|
||||||
|
|
||||||
def tokens(self, event, next):
|
|
||||||
kind, data, pos = event
|
|
||||||
if kind == START:
|
|
||||||
tag, attribs = data
|
|
||||||
name = tag.localname
|
|
||||||
namespace = tag.namespace
|
|
||||||
converted_attribs = {}
|
|
||||||
for k, v in attribs:
|
|
||||||
if isinstance(k, QName):
|
|
||||||
converted_attribs[(k.namespace, k.localname)] = v
|
|
||||||
else:
|
|
||||||
converted_attribs[(None, k)] = v
|
|
||||||
|
|
||||||
if namespace == namespaces["html"] and name in voidElements:
|
|
||||||
for token in self.emptyTag(namespace, name, converted_attribs,
|
|
||||||
not next or next[0] != END
|
|
||||||
or next[1] != tag):
|
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
yield self.startTag(namespace, name, converted_attribs)
|
|
||||||
|
|
||||||
elif kind == END:
|
|
||||||
name = data.localname
|
|
||||||
namespace = data.namespace
|
|
||||||
if name not in voidElements:
|
|
||||||
yield self.endTag(namespace, name)
|
|
||||||
|
|
||||||
elif kind == COMMENT:
|
|
||||||
yield self.comment(data)
|
|
||||||
|
|
||||||
elif kind == TEXT:
|
|
||||||
for token in self.text(data):
|
|
||||||
yield token
|
|
||||||
|
|
||||||
elif kind == DOCTYPE:
|
|
||||||
yield self.doctype(*data)
|
|
||||||
|
|
||||||
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
|
|
||||||
START_CDATA, END_CDATA, PI):
|
|
||||||
pass
|
|
||||||
|
|
||||||
else:
|
|
||||||
yield self.unknown(kind)
|
|
@ -1,204 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from ..treebuilders.etree import tag_regexp
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
from .. import ihatexml
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_str(s):
|
|
||||||
if s is None:
|
|
||||||
return None
|
|
||||||
elif isinstance(s, text_type):
|
|
||||||
return s
|
|
||||||
else:
|
|
||||||
return s.decode("utf-8", "strict")
|
|
||||||
|
|
||||||
|
|
||||||
class Root(object):
|
|
||||||
def __init__(self, et):
|
|
||||||
self.elementtree = et
|
|
||||||
self.children = []
|
|
||||||
if et.docinfo.internalDTD:
|
|
||||||
self.children.append(Doctype(self,
|
|
||||||
ensure_str(et.docinfo.root_name),
|
|
||||||
ensure_str(et.docinfo.public_id),
|
|
||||||
ensure_str(et.docinfo.system_url)))
|
|
||||||
root = et.getroot()
|
|
||||||
node = root
|
|
||||||
|
|
||||||
while node.getprevious() is not None:
|
|
||||||
node = node.getprevious()
|
|
||||||
while node is not None:
|
|
||||||
self.children.append(node)
|
|
||||||
node = node.getnext()
|
|
||||||
|
|
||||||
self.text = None
|
|
||||||
self.tail = None
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
return self.children[key]
|
|
||||||
|
|
||||||
def getnext(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
class Doctype(object):
|
|
||||||
def __init__(self, root_node, name, public_id, system_id):
|
|
||||||
self.root_node = root_node
|
|
||||||
self.name = name
|
|
||||||
self.public_id = public_id
|
|
||||||
self.system_id = system_id
|
|
||||||
|
|
||||||
self.text = None
|
|
||||||
self.tail = None
|
|
||||||
|
|
||||||
def getnext(self):
|
|
||||||
return self.root_node.children[1]
|
|
||||||
|
|
||||||
|
|
||||||
class FragmentRoot(Root):
|
|
||||||
def __init__(self, children):
|
|
||||||
self.children = [FragmentWrapper(self, child) for child in children]
|
|
||||||
self.text = self.tail = None
|
|
||||||
|
|
||||||
def getnext(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
class FragmentWrapper(object):
|
|
||||||
def __init__(self, fragment_root, obj):
|
|
||||||
self.root_node = fragment_root
|
|
||||||
self.obj = obj
|
|
||||||
if hasattr(self.obj, 'text'):
|
|
||||||
self.text = ensure_str(self.obj.text)
|
|
||||||
else:
|
|
||||||
self.text = None
|
|
||||||
if hasattr(self.obj, 'tail'):
|
|
||||||
self.tail = ensure_str(self.obj.tail)
|
|
||||||
else:
|
|
||||||
self.tail = None
|
|
||||||
|
|
||||||
def __getattr__(self, name):
|
|
||||||
return getattr(self.obj, name)
|
|
||||||
|
|
||||||
def getnext(self):
|
|
||||||
siblings = self.root_node.children
|
|
||||||
idx = siblings.index(self)
|
|
||||||
if idx < len(siblings) - 1:
|
|
||||||
return siblings[idx + 1]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
return self.obj[key]
|
|
||||||
|
|
||||||
def __bool__(self):
|
|
||||||
return bool(self.obj)
|
|
||||||
|
|
||||||
def getparent(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return str(self.obj)
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return str(self.obj)
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self.obj)
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
|
||||||
def __init__(self, tree):
|
|
||||||
if hasattr(tree, "getroot"):
|
|
||||||
tree = Root(tree)
|
|
||||||
elif isinstance(tree, list):
|
|
||||||
tree = FragmentRoot(tree)
|
|
||||||
_base.NonRecursiveTreeWalker.__init__(self, tree)
|
|
||||||
self.filter = ihatexml.InfosetFilter()
|
|
||||||
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
if isinstance(node, tuple): # Text node
|
|
||||||
node, key = node
|
|
||||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
|
||||||
return _base.TEXT, ensure_str(getattr(node, key))
|
|
||||||
|
|
||||||
elif isinstance(node, Root):
|
|
||||||
return (_base.DOCUMENT,)
|
|
||||||
|
|
||||||
elif isinstance(node, Doctype):
|
|
||||||
return _base.DOCTYPE, node.name, node.public_id, node.system_id
|
|
||||||
|
|
||||||
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
|
|
||||||
return _base.TEXT, node.obj
|
|
||||||
|
|
||||||
elif node.tag == etree.Comment:
|
|
||||||
return _base.COMMENT, ensure_str(node.text)
|
|
||||||
|
|
||||||
elif node.tag == etree.Entity:
|
|
||||||
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
|
|
||||||
|
|
||||||
else:
|
|
||||||
# This is assumed to be an ordinary element
|
|
||||||
match = tag_regexp.match(ensure_str(node.tag))
|
|
||||||
if match:
|
|
||||||
namespace, tag = match.groups()
|
|
||||||
else:
|
|
||||||
namespace = None
|
|
||||||
tag = ensure_str(node.tag)
|
|
||||||
attrs = {}
|
|
||||||
for name, value in list(node.attrib.items()):
|
|
||||||
name = ensure_str(name)
|
|
||||||
value = ensure_str(value)
|
|
||||||
match = tag_regexp.match(name)
|
|
||||||
if match:
|
|
||||||
attrs[(match.group(1), match.group(2))] = value
|
|
||||||
else:
|
|
||||||
attrs[(None, name)] = value
|
|
||||||
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
|
||||||
attrs, len(node) > 0 or node.text)
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
assert not isinstance(node, tuple), "Text nodes have no children"
|
|
||||||
|
|
||||||
assert len(node) or node.text, "Node has no children"
|
|
||||||
if node.text:
|
|
||||||
return (node, "text")
|
|
||||||
else:
|
|
||||||
return node[0]
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
if isinstance(node, tuple): # Text node
|
|
||||||
node, key = node
|
|
||||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
|
||||||
if key == "text":
|
|
||||||
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
|
||||||
# because node[0] might evaluate to False if it has no child element
|
|
||||||
if len(node):
|
|
||||||
return node[0]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
else: # tail
|
|
||||||
return node.getnext()
|
|
||||||
|
|
||||||
return (node, "tail") if node.tail else node.getnext()
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
if isinstance(node, tuple): # Text node
|
|
||||||
node, key = node
|
|
||||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
|
||||||
if key == "text":
|
|
||||||
return node
|
|
||||||
# else: fallback to "normal" processing
|
|
||||||
|
|
||||||
return node.getparent()
|
|
@ -1,63 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
|
||||||
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
|
||||||
|
|
||||||
from . import _base
|
|
||||||
|
|
||||||
from ..constants import voidElements
|
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(_base.TreeWalker):
|
|
||||||
def __iter__(self):
|
|
||||||
ignore_until = None
|
|
||||||
previous = None
|
|
||||||
for event in self.tree:
|
|
||||||
if previous is not None and \
|
|
||||||
(ignore_until is None or previous[1] is ignore_until):
|
|
||||||
if previous[1] is ignore_until:
|
|
||||||
ignore_until = None
|
|
||||||
for token in self.tokens(previous, event):
|
|
||||||
yield token
|
|
||||||
if token["type"] == "EmptyTag":
|
|
||||||
ignore_until = previous[1]
|
|
||||||
previous = event
|
|
||||||
if ignore_until is None or previous[1] is ignore_until:
|
|
||||||
for token in self.tokens(previous, None):
|
|
||||||
yield token
|
|
||||||
elif ignore_until is not None:
|
|
||||||
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
|
|
||||||
|
|
||||||
def tokens(self, event, next):
|
|
||||||
type, node = event
|
|
||||||
if type == START_ELEMENT:
|
|
||||||
name = node.nodeName
|
|
||||||
namespace = node.namespaceURI
|
|
||||||
attrs = {}
|
|
||||||
for attr in list(node.attributes.keys()):
|
|
||||||
attr = node.getAttributeNode(attr)
|
|
||||||
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
|
||||||
if name in voidElements:
|
|
||||||
for token in self.emptyTag(namespace,
|
|
||||||
name,
|
|
||||||
attrs,
|
|
||||||
not next or next[1] is not node):
|
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
yield self.startTag(namespace, name, attrs)
|
|
||||||
|
|
||||||
elif type == END_ELEMENT:
|
|
||||||
name = node.nodeName
|
|
||||||
namespace = node.namespaceURI
|
|
||||||
if name not in voidElements:
|
|
||||||
yield self.endTag(namespace, name)
|
|
||||||
|
|
||||||
elif type == COMMENT:
|
|
||||||
yield self.comment(node.nodeValue)
|
|
||||||
|
|
||||||
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
|
|
||||||
for token in self.text(node.nodeValue):
|
|
||||||
yield token
|
|
||||||
|
|
||||||
else:
|
|
||||||
yield self.unknown(type)
|
|
@ -1,12 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from .py import Trie as PyTrie
|
|
||||||
|
|
||||||
Trie = PyTrie
|
|
||||||
|
|
||||||
try:
|
|
||||||
from .datrie import Trie as DATrie
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
Trie = DATrie
|
|
@ -1,37 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from collections import Mapping
|
|
||||||
|
|
||||||
|
|
||||||
class Trie(Mapping):
|
|
||||||
"""Abstract base class for tries"""
|
|
||||||
|
|
||||||
def keys(self, prefix=None):
|
|
||||||
keys = super().keys()
|
|
||||||
|
|
||||||
if prefix is None:
|
|
||||||
return set(keys)
|
|
||||||
|
|
||||||
# Python 2.6: no set comprehensions
|
|
||||||
return set([x for x in keys if x.startswith(prefix)])
|
|
||||||
|
|
||||||
def has_keys_with_prefix(self, prefix):
|
|
||||||
for key in self.keys():
|
|
||||||
if key.startswith(prefix):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def longest_prefix(self, prefix):
|
|
||||||
if prefix in self:
|
|
||||||
return prefix
|
|
||||||
|
|
||||||
for i in range(1, len(prefix) + 1):
|
|
||||||
if prefix[:-i] in self:
|
|
||||||
return prefix[:-i]
|
|
||||||
|
|
||||||
raise KeyError(prefix)
|
|
||||||
|
|
||||||
def longest_prefix_item(self, prefix):
|
|
||||||
lprefix = self.longest_prefix(prefix)
|
|
||||||
return (lprefix, self[lprefix])
|
|
@ -1,47 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from datrie import Trie as DATrie
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
from ._base import Trie as ABCTrie
|
|
||||||
|
|
||||||
|
|
||||||
class Trie(ABCTrie):
|
|
||||||
def __init__(self, data):
|
|
||||||
chars = set()
|
|
||||||
for key in data.keys():
|
|
||||||
if not isinstance(key, text_type):
|
|
||||||
raise TypeError("All keys must be strings")
|
|
||||||
for char in key:
|
|
||||||
chars.add(char)
|
|
||||||
|
|
||||||
self._data = DATrie("".join(chars))
|
|
||||||
for key, value in data.items():
|
|
||||||
self._data[key] = value
|
|
||||||
|
|
||||||
def __contains__(self, key):
|
|
||||||
return key in self._data
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self._data)
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
return self._data[key]
|
|
||||||
|
|
||||||
def keys(self, prefix=None):
|
|
||||||
return self._data.keys(prefix)
|
|
||||||
|
|
||||||
def has_keys_with_prefix(self, prefix):
|
|
||||||
return self._data.has_keys_with_prefix(prefix)
|
|
||||||
|
|
||||||
def longest_prefix(self, prefix):
|
|
||||||
return self._data.longest_prefix(prefix)
|
|
||||||
|
|
||||||
def longest_prefix_item(self, prefix):
|
|
||||||
return self._data.longest_prefix_item(prefix)
|
|
@ -1,70 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
try:
|
|
||||||
text_type = unicode
|
|
||||||
except NameError:
|
|
||||||
text_type = str
|
|
||||||
|
|
||||||
from bisect import bisect_left
|
|
||||||
|
|
||||||
from ._base import Trie as ABCTrie
|
|
||||||
|
|
||||||
|
|
||||||
class Trie(ABCTrie):
|
|
||||||
def __init__(self, data):
|
|
||||||
if not all(isinstance(x, text_type) for x in data.keys()):
|
|
||||||
raise TypeError("All keys must be strings")
|
|
||||||
|
|
||||||
self._data = data
|
|
||||||
self._keys = sorted(data.keys())
|
|
||||||
self._cachestr = ""
|
|
||||||
self._cachepoints = (0, len(data))
|
|
||||||
|
|
||||||
def __contains__(self, key):
|
|
||||||
return key in self._data
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self._data)
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return iter(self._data)
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
return self._data[key]
|
|
||||||
|
|
||||||
def keys(self, prefix=None):
|
|
||||||
if prefix is None or prefix == "" or not self._keys:
|
|
||||||
return set(self._keys)
|
|
||||||
|
|
||||||
if prefix.startswith(self._cachestr):
|
|
||||||
lo, hi = self._cachepoints
|
|
||||||
start = i = bisect_left(self._keys, prefix, lo, hi)
|
|
||||||
else:
|
|
||||||
start = i = bisect_left(self._keys, prefix)
|
|
||||||
|
|
||||||
keys = set()
|
|
||||||
if start == len(self._keys):
|
|
||||||
return keys
|
|
||||||
|
|
||||||
while self._keys[i].startswith(prefix):
|
|
||||||
keys.add(self._keys[i])
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
self._cachestr = prefix
|
|
||||||
self._cachepoints = (start, i)
|
|
||||||
|
|
||||||
return keys
|
|
||||||
|
|
||||||
def has_keys_with_prefix(self, prefix):
|
|
||||||
if prefix in self._data:
|
|
||||||
return True
|
|
||||||
|
|
||||||
if prefix.startswith(self._cachestr):
|
|
||||||
lo, hi = self._cachepoints
|
|
||||||
i = bisect_left(self._keys, prefix, lo, hi)
|
|
||||||
else:
|
|
||||||
i = bisect_left(self._keys, prefix)
|
|
||||||
|
|
||||||
if i == len(self._keys):
|
|
||||||
return False
|
|
||||||
|
|
||||||
return self._keys[i].startswith(prefix)
|
|
@ -1,82 +0,0 @@
|
|||||||
from __future__ import absolute_import, division, unicode_literals
|
|
||||||
|
|
||||||
from types import ModuleType
|
|
||||||
|
|
||||||
try:
|
|
||||||
import xml.etree.cElementTree as default_etree
|
|
||||||
except ImportError:
|
|
||||||
import xml.etree.ElementTree as default_etree
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
|
||||||
"surrogatePairToCodepoint", "moduleFactoryFactory"]
|
|
||||||
|
|
||||||
|
|
||||||
class MethodDispatcher(dict):
|
|
||||||
"""Dict with 2 special properties:
|
|
||||||
|
|
||||||
On initiation, keys that are lists, sets or tuples are converted to
|
|
||||||
multiple keys so accessing any one of the items in the original
|
|
||||||
list-like object returns the matching value
|
|
||||||
|
|
||||||
md = MethodDispatcher({("foo", "bar"):"baz"})
|
|
||||||
md["foo"] == "baz"
|
|
||||||
|
|
||||||
A default value which can be set through the default attribute.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, items=()):
|
|
||||||
# Using _dictEntries instead of directly assigning to self is about
|
|
||||||
# twice as fast. Please do careful performance testing before changing
|
|
||||||
# anything here.
|
|
||||||
_dictEntries = []
|
|
||||||
for name, value in items:
|
|
||||||
if type(name) in (list, tuple, frozenset, set):
|
|
||||||
for item in name:
|
|
||||||
_dictEntries.append((item, value))
|
|
||||||
else:
|
|
||||||
_dictEntries.append((name, value))
|
|
||||||
dict.__init__(self, _dictEntries)
|
|
||||||
self.default = None
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
return dict.get(self, key, self.default)
|
|
||||||
|
|
||||||
|
|
||||||
# Some utility functions to dal with weirdness around UCS2 vs UCS4
|
|
||||||
# python builds
|
|
||||||
|
|
||||||
def isSurrogatePair(data):
|
|
||||||
return (len(data) == 2 and
|
|
||||||
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
|
|
||||||
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
|
|
||||||
|
|
||||||
|
|
||||||
def surrogatePairToCodepoint(data):
|
|
||||||
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
|
|
||||||
(ord(data[1]) - 0xDC00))
|
|
||||||
return char_val
|
|
||||||
|
|
||||||
# Module Factory Factory (no, this isn't Java, I know)
|
|
||||||
# Here to stop this being duplicated all over the place.
|
|
||||||
|
|
||||||
|
|
||||||
def moduleFactoryFactory(factory):
|
|
||||||
moduleCache = {}
|
|
||||||
|
|
||||||
def moduleFactory(baseModule, *args, **kwargs):
|
|
||||||
if isinstance(ModuleType.__name__, type("")):
|
|
||||||
name = "_%s_factory" % baseModule.__name__
|
|
||||||
else:
|
|
||||||
name = b"_%s_factory" % baseModule.__name__
|
|
||||||
|
|
||||||
if name in moduleCache:
|
|
||||||
return moduleCache[name]
|
|
||||||
else:
|
|
||||||
mod = ModuleType(name)
|
|
||||||
objs = factory(baseModule, *args, **kwargs)
|
|
||||||
mod.__dict__.update(objs)
|
|
||||||
moduleCache[name] = mod
|
|
||||||
return mod
|
|
||||||
|
|
||||||
return moduleFactory
|
|
Loading…
x
Reference in New Issue
Block a user