Update HTML 5 parser used in calibre (html5lib-python)

This commit is contained in:
Kovid Goyal 2013-10-23 11:04:05 +05:30
parent b4bf871077
commit b9421065f9
46 changed files with 7609 additions and 8932 deletions

View File

@ -562,9 +562,9 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
return check(chr(num).decode(encoding))
except UnicodeDecodeError:
return check(my_unichr(num))
from calibre.utils.html5_entities import entity_map
from html5lib.constants import entities
try:
return check(entity_map[ent])
return check(entities[ent])
except KeyError:
pass
from htmlentitydefs import name2codepoint

View File

@ -81,11 +81,14 @@ def node_depth(node):
return ans
def html5_parse(data, max_nesting_depth=100):
import html5lib
# html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
import html5lib, warnings
from html5lib.constants import cdataElements, rcdataElements
# HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
data = html5lib.parse(data, treebuilder='lxml').getroot()
with warnings.catch_warnings():
warnings.simplefilter('ignore')
data = html5lib.parse(data, treebuilder='lxml').getroot()
# Check that the asinine HTML 5 algorithm did not result in a tree with
# insane nesting depths

View File

@ -7,6 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml import etree
from html5lib.constants import cdataElements, rcdataElements
from calibre.ebooks.oeb.polish.tests.base import BaseTest
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
@ -18,7 +19,7 @@ def nonvoid_cdata_elements(test, parse_function):
markup = '''
<html> <head><{0}/></head> <body id="test"> </html>
'''
for tag in ('title', 'style', 'script', 'textarea'):
for tag in cdataElements | rcdataElements:
for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
root = parse_function(markup.format(x))
test.assertEqual(

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
"""
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
@ -8,10 +8,16 @@ Example usage:
import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
tree = html5lib.parse(f)
"""
__version__ = "0.90"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize
from __future__ import absolute_import, division, unicode_literals
from .html5parser import HTMLParser, parse, parseFragment
from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
__version__ = "0.999-dev"

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
class Filter(object):
def __init__(self, source):

View File

@ -0,0 +1,20 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
class Filter(_base.Filter):
    """Token filter that re-emits tag attributes sorted by attribute key."""

    def __iter__(self):
        """Yield every upstream token, reordering attributes on tag tokens.

        For ``StartTag`` and ``EmptyTag`` tokens the ``data`` mapping is
        replaced by an ``OrderedDict`` whose entries are sorted by key; all
        other tokens pass through unchanged.
        """
        tag_types = ("StartTag", "EmptyTag")
        for tok in _base.Filter.__iter__(self):
            if tok["type"] in tag_types:
                # Building the OrderedDict from the sorted pairs preserves
                # the sorted insertion order.
                tok["data"] = OrderedDict(
                    sorted(tok["data"].items(), key=lambda pair: pair[0]))
            yield tok

View File

@ -1,127 +0,0 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token

View File

@ -1,4 +1,7 @@
import _base
from __future__ import absolute_import, division, unicode_literals
from . import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
@ -18,29 +21,28 @@ class Filter(_base.Filter):
elif type == "EmptyTag":
if token["name"].lower() == "meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
content_index = -1
for i,(name,value) in enumerate(token["data"]):
if name.lower() == 'charset':
token["data"][i] = (u'charset', self.encoding)
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
elif name == 'content':
content_index = i
else:
if has_http_equiv_content_type and content_index >= 0:
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
meta_found = True
# replace charset with actual encoding
has_http_equiv_content_type = False
for (namespace, name), value in token["data"].items():
if namespace is not None:
continue
elif name.lower() == 'charset':
token["data"][(namespace, name)] = self.encoding
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
else:
if has_http_equiv_content_type and (None, "content") in token["data"]:
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": "head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
"data": {(None, "charset"): self.encoding}}
yield {"type": "EndTag", "name": "head"}
meta_found = True
continue
@ -51,7 +53,7 @@ class Filter(_base.Filter):
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
"data": {(None, "charset"): self.encoding}}
while pending:
yield pending.pop(0)
meta_found = True

View File

@ -1,13 +1,18 @@
from __future__ import absolute_import, division, unicode_literals
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class LintError(Exception):
pass
class LintError(Exception): pass
class Filter(_base.Filter):
def __iter__(self):
@ -18,24 +23,24 @@ class Filter(_base.Filter):
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
if not name:
raise LintError(_(u"Empty tag name"))
raise LintError(_("Empty tag name"))
if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, unicode):
raise LintError(_("Attribute name is not a string: %r") % name)
if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
if not name:
raise LintError(_(u"Empty attribute name"))
if not isinstance(value, unicode):
raise LintError(_("Attribute value is not a string: %r") % value)
raise LintError(_("Empty attribute name"))
if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
@ -45,15 +50,15 @@ class Filter(_base.Filter):
elif type == "EndTag":
name = token["name"]
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
if not name:
raise LintError(_(u"Empty tag name"))
raise LintError(_("Empty tag name"))
if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
start_name = open_elements.pop()
if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
contentModelFlag = "PCDATA"
elif type == "Comment":
@ -62,27 +67,27 @@ class Filter(_base.Filter):
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, unicode):
raise LintError(_("Attribute name is not a string: %r") % data)
if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
if not data:
raise LintError(_(u"%s token with empty data") % type)
raise LintError(_("%(type)s token with empty data") % {"type": type})
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_(u"Unknown token type: %s") % type)
raise LintError(_("Unknown token type: %(type)s") % {"type": type})
yield token

View File

@ -1,4 +1,7 @@
import _base
from __future__ import absolute_import, division, unicode_literals
from . import _base
class Filter(_base.Filter):
def slider(self):
@ -14,8 +17,8 @@ class Filter(_base.Filter):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
@ -73,7 +76,7 @@ class Filter(_base.Filter):
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'):
previous['name'] in ('tbody', 'thead', 'tfoot'):
return False
return next["name"] == 'tr'
else:
@ -121,10 +124,10 @@ class Filter(_base.Filter):
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None

View File

@ -1,8 +1,12 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
def __iter__(self):
for token in _base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token: yield token
if token:
yield token

View File

@ -1,16 +1,13 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
from __future__ import absolute_import, division, unicode_literals
import re
import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
from . import _base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
class Filter(_base.Filter):
@ -21,7 +18,7 @@ class Filter(_base.Filter):
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1
elif type == "EndTag" and preserve:
@ -29,13 +26,13 @@ class Filter(_base.Filter):
elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test on token["data"] above to not introduce spaces where there were not
token["data"] = u" "
token["data"] = " "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
yield token
def collapse_spaces(text):
return SPACES_REGEX.sub(' ', text)

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +1,105 @@
import re
from __future__ import absolute_import, division, unicode_literals
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
import re
import warnings
from .constants import DataLossWarning
baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
# Character ranges of the XML 1.0 "Extender" production, written in the same
# "#xNNNN | [#xNNNN-#xNNNN]" notation as the other tables in this module.
# Every entry must be either a single code point or a bracketed range so that
# reChar / reCharRange can match it (no stray "#" before a range).
extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
#Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
# Without the ":" that the XML NameChar production also permits
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
@ -30,16 +110,17 @@ def charStringToList(chars):
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2
rv[-1] = rv[-1] * 2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
@ -49,61 +130,69 @@ def normaliseCharList(charList):
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1]
while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i + j][1]
j += 1
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    """Return the code point ranges in 0..max_unicode not covered by charList.

    ``charList`` is a list of ``[start, end]`` pairs (inclusive). It is
    assumed to be sorted and already merged so that consecutive ranges
    neither overlap nor touch; the gaps are computed pairwise on that
    assumption.

    Returns a list of ``[start, end]`` pairs (inclusive) describing the gaps
    before the first range, between consecutive ranges, and after the last
    range up to ``max_unicode``. An empty ``charList`` yields the whole
    ``[0, max_unicode]`` span.
    """
    if not charList:
        return [[0, max_unicode]]

    rv = []
    # Compare the *start* of the first range with 0; comparing the range
    # itself (a list) with 0 is always unequal and produced a bogus [0, -1].
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    for current, following in zip(charList, charList[1:]):
        rv.append([current[1] + 1, following[0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(unichr(item[0])))
rv.append(escapeRegexp(chr(item[0])))
else:
rv.append(escapeRegexp(unichr(item[0])) + "-" +
escapeRegexp(unichr(item[1])))
return "[%s]"%"".join(rv)
rv.append(escapeRegexp(chr(item[0])) + "-" +
escapeRegexp(chr(item[1])))
return "[%s]" % "".join(rv)
def hexToInt(hex_str):
    """Interpret ``hex_str`` as a hexadecimal number and return its integer value."""
    radix = 16
    return int(hex_str, radix)
def escapeRegexp(string):
    """Return ``string`` with regular-expression metacharacters backslash-escaped.

    Each occurrence of one of the characters listed in ``specialCharacters``
    is prefixed with a backslash. The backslash itself is not in that list,
    so the escapes added for one character are never re-escaped while
    processing the next one.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)
    # The former debug "print string" was dropped: it wrote to stdout on
    # every escaped character and is not valid Python 3 syntax.
    return string
#output from the above
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
# Simpler things
# Matches any single character outside the set accepted in a pubid here:
# space, CR, LF, ASCII letters and digits, and -'()+,./:=?;!*#@$_%
# Raw string so that "\-" and the "\x.." escapes reach the regex engine
# untouched instead of relying on Python passing unknown escapes through.
nonPubidCharRegexp = re.compile(r"[^\x20\x0D\x0Aa-zA-Z0-9\-'()+,./:=?;!*#@$_%]")
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
def __init__(self, replaceChars=None,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -113,14 +202,17 @@ class InfosetFilter(object):
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.preventSingleQuotePubid = preventSingleQuotePubid
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
    """Return an XML-safe version of attribute name, or None to drop it.

    The attribute is dropped (with a DataLossWarning) when
    self.dropXmlnsLocalName is set and name starts with "xmlns:", or when
    self.dropXmlnsAttrNs is set and namespace is the xmlns namespace URI.
    Otherwise the name is passed through self.toXmlName.
    """
    if self.dropXmlnsLocalName and name.startswith("xmlns:"):
        warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
        return None
    if (self.dropXmlnsAttrNs and
            namespace == "http://www.w3.org/2000/xmlns/"):
        warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
        return None
    return self.toXmlName(name)
@ -131,20 +223,35 @@ class InfosetFilter(object):
def coerceComment(self, data):
    """Return comment text with adjacent dashes separated, if enabled.

    Only acts when self.preventDoubleDashComments is set; each pass that
    still finds "--" emits a DataLossWarning.
    """
    if self.preventDoubleDashComments:
        # A single replace() can leave new "--" pairs behind (for example
        # "---" becomes "- --"), hence the loop until none remain.
        while "--" in data:
            warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
            data = data.replace("--", "- -")
    return data
def coerceCharacters(self, data):
    """Return text with form feed (U+000C) characters replaced by spaces.

    Only acts when self.replaceFormFeedCharacters is set.  One
    DataLossWarning is issued per form feed found.
    """
    if self.replaceFormFeedCharacters:
        for _ in range(data.count("\x0C")):
            warnings.warn("Text cannot contain U+000C", DataLossWarning)
        data = data.replace("\x0C", " ")
    # Other non-xml characters
    return data
def coercePubid(self, data):
    """Return pubid text with disallowed characters replaced.

    Every character matched by nonPubidCharRegexp is swapped for the
    value of self.getReplacementCharacter, with a DataLossWarning per
    match.  When self.preventSingleQuotePubid is set, single quotes are
    replaced the same way.
    """
    dataOutput = data
    for char in nonPubidCharRegexp.findall(data):
        warnings.warn("Coercing non-XML pubid", DataLossWarning)
        dataOutput = dataOutput.replace(char, self.getReplacementCharacter(char))
    if self.preventSingleQuotePubid and "'" in dataOutput:
        warnings.warn("Pubid cannot contain single quote", DataLossWarning)
        dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
    return dataOutput
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
@ -152,10 +259,11 @@ class InfosetFilter(object):
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
@ -169,9 +277,9 @@ class InfosetFilter(object):
return name
def escapeChar(self, char):
    """Return the escape token for char and record it in self.replaceCache.

    The token is "U" followed by the code point as upper-case hex,
    zero-padded to at least five digits (for example " " gives "U00020").
    """
    replacement = "U%05X" % ord(char)
    self.replaceCache[char] = replacement
    return replacement
def unescapeChar(self, charcode):
    """Return the character for an escape token such as "U00020".

    The first character of charcode is discarded and the remainder is
    parsed as a hexadecimal code point.
    """
    return chr(int(charcode[1:], 16))

File diff suppressed because it is too large Load Diff

View File

@ -1,125 +1,145 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
from constants import tokenTypes
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
'ul', 'var']
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
'xml:lang']
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
'origin', 'overline-position', 'overline-thickness', 'panose-1',
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
'stemh', 'stemv', 'stop-color', 'stop-opacity',
'strikethrough-position', 'strikethrough-thickness', 'stroke',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs' ]
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
@ -140,88 +160,109 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
    """Sanitize one token and return it, or None for comment tokens.

    Start, end and empty tags are passed to self.allowed_token when the
    tag name is in self.allowed_elements and to self.disallowed_token
    otherwise.  Comment tokens produce None; every other token type is
    returned unchanged.
    """
    # accommodate filters which use token_type differently: the type may
    # be given either as a key of tokenTypes or as the mapped value.
    token_type = token["type"]
    if token_type in tokenTypes:
        token_type = tokenTypes[token_type]

    if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                      tokenTypes["EmptyTag"]):
        if token["name"] in self.allowed_elements:
            return self.allowed_token(token, token_type)
        return self.disallowed_token(token, token_type)
    if token_type == tokenTypes["Comment"]:
        return None
    return token
def allowed_token(self, token, token_type):
    """Filter the attributes of a whitelisted tag token in place.

    When the token has a "data" entry (a sequence of name/value pairs):
    attributes not in self.allowed_attributes are dropped; URI-valued
    attributes whose scheme is not in self.allowed_protocols are dropped;
    non-local url(...) references are blanked in the attributes listed in
    self.svg_attr_val_allows_ref; a non-local xlink:href is dropped on the
    elements in self.svg_allow_local_href; and "style" is passed through
    self.sanitize_css.  The same token object is returned.
    """
    if "data" in token:
        # Walk the pairs in reverse so that, when an attribute name is
        # repeated, its first occurrence is the one that ends up stored.
        attrs = dict((name, val) for name, val in token["data"][::-1]
                     if name in self.allowed_attributes)
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            # Strip backticks, control characters and whitespace before
            # looking at the scheme, so it cannot be disguised by them.
            val_unescaped = re.sub(r"[`\x00-\x20\x7f-\xa0\s]+", '',
                                   unescape(attrs[attr])).lower()
            # remove replacement characters from unescaped characters
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
                del attrs[attr]
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                # Blank out url(...) unless its target starts with "#".
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))
        if (token["name"] in self.svg_allow_local_href and
                'xlink:href' in attrs and
                re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
            del attrs['xlink:href']
        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])
        token["data"] = [[name, val] for name, val in attrs.items()]
    return token
def disallowed_token(self, token, token_type):
    """Turn a non-whitelisted tag token into a Characters token, in place.

    token["data"] is replaced by the tag written out as markup text
    (attribute values run through escape()), token["type"] becomes the
    Characters type in whichever form (key name or mapped value) the
    token was using, and token["name"] is removed.  The same token object
    is returned.
    """
    # .get() so that a token without a "data" entry is rendered as a bare
    # tag rather than raising KeyError.
    data = token.get("data")
    if token_type == tokenTypes["EndTag"]:
        token["data"] = "</%s>" % token["name"]
    elif data:
        attrs = ''.join(' %s="%s"' % (k, escape(v)) for k, v in data)
        token["data"] = "<%s%s>" % (token["name"], attrs)
    else:
        token["data"] = "<%s>" % token["name"]
    if token.get("selfClosing"):
        # Swap the closing ">" for "/>".
        token["data"] = token["data"][:-1] + "/>"

    if token["type"] in tokenTypes:
        token["type"] = "Characters"
    else:
        token["type"] = tokenTypes["Characters"]

    del token["name"]
    return token
def sanitize_css(self, style):
    """Return the whitelisted declarations of an inline style string.

    url(...) values are removed first.  If the remaining text fails
    either of the two shape checks below, '' is returned.  Otherwise each
    "property: value" pair is kept when the property is in
    self.allowed_css_properties or self.allowed_svg_properties, or when
    it is a background/border/margin/padding property whose value is made
    only of known keywords, colours and short lengths.  Kept pairs are
    re-emitted as "prop: value;" joined by single spaces.
    """
    # disallow urls
    style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

    # gauntlet
    if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
        return ''
    if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
        return ''

    clean = []
    for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
        if not value:
            continue
        if prop.lower() in self.allowed_css_properties:
            clean.append(prop + ': ' + value + ';')
        elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                            'padding']:
            # NOTE(review): this reads acceptable_css_keywords while the
            # other branches read allowed_* attributes -- confirm whether
            # an allowed_css_keywords override was intended here.
            for keyword in value.split():
                if keyword not in self.acceptable_css_keywords and \
                        not re.match(r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                    break
            else:
                # Reached only when no keyword was rejected.
                clean.append(prop + ': ' + value + ';')
        elif prop.lower() in self.allowed_svg_properties:
            clean.append(prop + ': ' + value + ';')

    return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False):
#Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal...
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
# Change case matching defaults as we only output lowercase html anyway
# This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName)
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):

View File

@ -1,17 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib import treewalkers
from .. import treewalkers
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    """Serialize ``input`` with a tree walker and an HTMLSerializer.

    input - the tree handed to the tree walker class
    tree - tree walker type name, passed to treewalkers.getTreeWalker
    format - output format; only "html" is accepted
    encoding - passed through to HTMLSerializer.render
    serializer_opts - keyword options forwarded to HTMLSerializer

    Raises ValueError when ``format`` is not "html".
    """
    # Reject an unsupported format before doing any other work, and name
    # the parameter the caller actually passed.
    if format != "html":
        raise ValueError("format must be html, got %r" % (format,))
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

View File

@ -1,18 +1,20 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements
try:
from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
spaceCharacters = "".join(spaceCharacters)
try:
from codecs import register_error, xmlcharrefreplace_errors
@ -21,27 +23,48 @@ except ImportError:
else:
unicode_encode_errors = "htmlentityreplace"
from html5lib.constants import entities
encode_entity_map = {}
for k, v in entities.items():
if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if not v in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
    """Codec error handler that prefers named HTML entities.

    For UnicodeEncodeError / UnicodeTranslateError the offending slice
    exc.object[exc.start:exc.end] is turned into code points (surrogate
    pairs are combined through the utils helpers) and each code point is
    written as a named reference from encode_entity_map when one exists,
    otherwise as a hexadecimal numeric reference. Returns the tuple
    (replacement, exc.end).

    Any other exception is delegated to xmlcharrefreplace_errors.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    text = exc.object
    codepoints = []
    pos = exc.start
    while pos < exc.end:
        # Never look past exc.end when testing for a surrogate pair.
        if utils.isSurrogatePair(text[pos:min([exc.end, pos + 2])]):
            codepoints.append(utils.surrogatePairToCodepoint(text[pos:pos + 2]))
            pos += 2
        else:
            codepoints.append(ord(text[pos]))
            pos += 1

    res = []
    for cp in codepoints:
        e = encode_entity_map.get(cp)
        if not e:
            res.append("&#x%s;" % (hex(cp)[2:]))
            continue
        res.append("&")
        res.append(e)
        if not e.endswith(";"):
            res.append(";")
    return ("".join(res), exc.end)
@ -49,125 +72,185 @@ else:
del register_error
def encode(text, encoding):
return text.encode(encoding, unicode_encode_errors)
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False
quote_char = '"'
use_best_quote_char = True
minimize_boolean_attributes = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True
strip_whitespace = False
sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", 'use_trailing_solidus', "sanitize")
"omit_optional_tags", "minimize_boolean_attributes",
"use_trailing_solidus", "space_before_trailing_solidus",
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
"""Initialize HTMLSerializer.
Keyword options (default given first unless specified) include:
inject_meta_charset=True|False
Whether to insert a meta element to define the character set of the
document.
quote_attr_values=False|True
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
in which case single quotes are used instead.
escape_lt_in_attrs=False|True
Whether to escape < in attribute values.
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
are unaffected by this setting.
strip_whitespace=False|True
Whether to remove semantically meaningless whitespace. (This
compresses all whitespace to a single space except within pre.)
minimize_boolean_attributes=True|False
Shortens boolean attributes to give just the attribute name,
for example <input disabled="disabled"> becomes <input disabled>.
use_trailing_solidus=False|True
Includes a close-tag slash at the end of the start tag of void
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
space_before_trailing_solidus=True|False
Places a space immediately before the closing slash in a tag
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
sanitize=False|True
Strip all unsafe or unknown constructs from output.
See `html5lib user documentation`_
omit_optional_tags=True|False
Omit start/end tags that are optional.
alphabetical_attributes=False|True
Reorder attributes to be in alphabetical order.
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
    """Encode ``string`` with self.encoding using unicode_encode_errors.

    The text is returned unchanged when self.encoding is falsy.
    """
    assert(isinstance(string, text_type))
    if not self.encoding:
        return string
    return string.encode(self.encoding, unicode_encode_errors)
def encodeStrict(self, string):
    """Encode ``string`` with self.encoding using the "strict" handler.

    The text is returned unchanged when self.encoding is falsy.
    """
    assert(isinstance(string, text_type))
    if not self.encoding:
        return string
    return string.encode(self.encoding, "strict")
def serialize(self, treewalker, encoding=None):
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from html5lib.filters.inject_meta_charset import Filter
from ..filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of this latter filter
if self.strip_whitespace:
from html5lib.filters.whitespace import Filter
from ..filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
from ..filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter
from ..filters.optionaltags import Filter
treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = u"<!DOCTYPE %s" % token["name"]
doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
doctype += " SYSTEM"
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
quote_char = "'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">"
if encoding:
yield doctype.encode(encoding)
else:
yield doctype
quote_char = '"'
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += ">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA"))
if encoding:
yield token["data"].encode(encoding, "strict")
else:
yield token["data"]
elif encoding:
yield encode(escape(token["data"]), encoding)
yield self.encode(token["data"])
else:
yield escape(token["data"])
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
attrs = token["data"]
if hasattr(attrs, "items"):
attrs = attrs.items()
attrs.sort()
attributes = []
for k,v in attrs:
if encoding:
k = k.encode(encoding, "strict")
attributes.append(' ')
for (attr_namespace, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(' ')
attributes.append(k)
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) \
and k not in booleanAttributes.get("", tuple())):
attributes.append("=")
(k not in booleanAttributes.get(name, tuple())
and k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values or not v:
quote_attr = True
else:
quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + ">\"'=", False)
quote_attr = reduce(lambda x, y: x or (y in v),
spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding:
v = encode(v, encoding)
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
@ -179,20 +262,17 @@ class HTMLSerializer(object):
v = v.replace("'", "&#39;")
else:
v = v.replace('"', "&quot;")
attributes.append(quote_char)
attributes.append(v)
attributes.append(quote_char)
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
attributes.append(v)
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
attributes.append(" /")
yield self.encodeStrict(" /")
else:
attributes.append("/")
if encoding:
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
else:
yield u"<%s%s>" % (name, u"".join(attributes))
yield self.encodeStrict("/")
yield self.encode(">")
elif type == "EndTag":
name = token["name"]
@ -200,28 +280,33 @@ class HTMLSerializer(object):
in_cdata = False
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
end_tag = u"</%s>" % name
if encoding:
end_tag = end_tag.encode(encoding, "strict")
yield end_tag
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError(_("Comment contains --"))
comment = u"<!--%s-->" % token["data"]
if encoding:
comment = comment.encode(encoding, unicode_encode_errors)
yield comment
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
    """Join the chunks produced by self.serialize into one value.

    With a truthy ``encoding`` the chunks are joined as bytes; otherwise
    serialize is called without an encoding and the chunks are joined as
    text.
    """
    if not encoding:
        return "".join(self.serialize(treewalker))
    return b"".join(self.serialize(treewalker, encoding))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
@ -229,6 +314,7 @@ class HTMLSerializer(object):
if self.strict:
raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree"""
    # Must be a class, not a function: it is used with ``raise`` and has to
    # be catchable with ``except SerializeError``.
    pass

View File

@ -1,9 +0,0 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
quote_attr_values = True
minimize_boolean_attributes = False
use_trailing_solidus = True
escape_lt_in_attrs = True
omit_optional_tags = False
escape_rcdata = True

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from xml.sax.xmlreader import AttributesNSImpl
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
if prefix is not None:
prefix_mapping[prefix] = namespace
def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    walker - iterable of token dicts (each has a "type" key)
    handler - object receiving the SAX-style callbacks used below

    Doctype and Comment tokens produce no handler calls.
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        kind = token["type"]
        if kind == "Doctype" or kind == "Comment":
            continue
        if kind in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif kind in ("StartTag", "EmptyTag", "EndTag"):
            qname = (token["namespace"], token["name"])
            local_name = token["name"]
            if kind != "EndTag":
                attrs = AttributesNSImpl(token["data"],
                                         unadjustForeignAttributes)
                handler.startElementNS(qname, local_name, attrs)
            if kind != "StartTag":
                # An EmptyTag is reported as a start immediately followed
                # by an end.
                handler.endElementNS(qname, local_name)
        else:
            assert False, "Unknown token type"

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()

View File

@ -7,7 +7,7 @@ implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
@ -24,69 +24,53 @@ getDocument - Returns the root node of the complete document tree
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

               "dom" - A generic builder for DOM implementations, defaulting to
                       a xml.dom.minidom based implementation.
               "etree" - A generic builder for tree implementations exposing an
                         ElementTree-like interface, defaulting to
                         xml.etree.cElementTree if available and
                         xml.etree.ElementTree if not.
               "lxml" - An etree-based builder for lxml.etree, handling
                        limitations of lxml's implementation.

    implementation - (Currently applies to the "etree" and "dom" tree types). A
                      module implementing the tree type e.g.
                      xml.etree.ElementTree or xml.etree.cElementTree.

    Raises ValueError for any other treeType.
    """
    treeType = treeType.lower()
    if treeType in treeBuilderCache:
        return treeBuilderCache.get(treeType)

    if treeType == "dom":
        from . import dom
        # Come up with a sane default (pref. from the stdlib)
        if implementation is None:
            from xml.dom import minidom
            implementation = minidom
        # NEVER cache here, caching is done in the dom submodule
        return dom.getDomModule(implementation, **kwargs).TreeBuilder

    if treeType == "etree":
        from . import etree
        if implementation is None:
            implementation = default_etree
        # NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(implementation, **kwargs).TreeBuilder

    if treeType == "lxml":
        from . import etree_lxml
        treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        return treeBuilderCache.get(treeType)

    raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)

View File

@ -1,25 +1,34 @@
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
from __future__ import absolute_import, division, unicode_literals
from six import text_type
# The scope markers are inserted when entering buttons, object elements,
from ..constants import scopingElements, tableInsertModeElements, namespaces
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
# from "leaking" into tables, object elements, and marquees.
Marker = None
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and
value - The value of the current node (applies to text nodes and
comments
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""
@ -30,14 +39,14 @@ class Node(object):
self.childNodes = []
self._flags = []
def __unicode__(self):
attributesStr = " ".join(["%s=\"%s\""%(name, value)
for name, value in
self.attributes.iteritems()])
def __str__(self):
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in
self.attributes.items()])
if attributesStr:
return "<%s %s>"%(self.name,attributesStr)
return "<%s %s>" % (self.name, attributesStr)
else:
return "<%s>"%(self.name)
return "<%s>" % (self.name)
def __repr__(self):
return "<%s>" % (self.name)
@ -48,14 +57,14 @@ class Node(object):
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node"""
raise NotImplementedError
@ -65,11 +74,11 @@ class Node(object):
raise NotImplementedError
def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
"""
#XXX - should this method be made more general?
# XXX - should this method be made more general?
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
@ -80,12 +89,36 @@ class Node(object):
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
class ActiveFormattingElements(list):
    """List of active formatting elements with a duplicate limit on append."""

    def append(self, node):
        """Append ``node``, first dropping an older equal entry if needed.

        For a non-Marker node the list is scanned from the end back to the
        most recent Marker; when a third entry equal to ``node`` (per
        nodesEqual) is found, that entry is removed before appending.
        """
        if node != Marker:
            matches = 0
            for candidate in self[::-1]:
                if candidate == Marker:
                    break
                if not self.nodesEqual(candidate, node):
                    continue
                matches += 1
                if matches == 3:
                    self.remove(candidate)
                    break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        """Return True when both nodes share nameTuple and attributes."""
        if (node1.nameTuple == node2.nameTuple and
                node1.attributes == node2.attributes):
            return True
        return False
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
@ -94,19 +127,19 @@ class TreeBuilder(object):
doctypeClass - the class to use for doctypes
"""
#Document class
# Document class
documentClass = None
#The class to use for creating a node
# The class to use for creating a node
elementClass = None
#The class to use for creating comments
# The class to use for creating comments
commentClass = None
#The class to use for creating doctypes
# The class to use for creating doctypes
doctypeClass = None
#Fragment class
# Fragment class
fragmentClass = None
def __init__(self, namespaceHTMLElements):
@ -115,12 +148,12 @@ class TreeBuilder(object):
else:
self.defaultNamespace = None
self.reset()
def reset(self):
self.openElements = []
self.activeFormattingElements = []
self.activeFormattingElements = ActiveFormattingElements()
#XXX - rename these to headElement, formElement
# XXX - rename these to headElement, formElement
self.headPointer = None
self.formPointer = None
@ -129,23 +162,21 @@ class TreeBuilder(object):
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
    """Return whether ``target`` is in the scope selected by ``variant``.

    target - a node (anything with a nameTuple attribute), matched by
             equality, or a name matched against node.name
    variant - key into listElementsMap choosing the boundary elements

    The open elements are scanned from the most recent backwards; the
    scan stops with False at the first boundary element.
    """
    # If we pass a node in we match that. If we pass a string
    # match any node with that name
    exactNode = hasattr(target, "nameTuple")

    listElements, invert = listElementsMap[variant]

    for node in reversed(self.openElements):
        if exactNode:
            found = node == target
        else:
            found = node.name == target
        if found:
            return True
        if invert ^ (node.nameTuple in listElements):
            return False

    assert False  # We should never reach this point
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
@ -165,7 +196,7 @@ class TreeBuilder(object):
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
#This will be reset to 0 below
# This will be reset to 0 below
i = -1
break
i -= 1
@ -178,13 +209,13 @@ class TreeBuilder(object):
# Step 8
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
element = self.insertElement({"type": "StartTag",
"name": clone.name,
"namespace": clone.namespace,
"data": clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
@ -229,7 +260,7 @@ class TreeBuilder(object):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
@ -251,9 +282,10 @@ class TreeBuilder(object):
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token):
name = token["name"]
assert isinstance(name, text_type), "Element %s not unicode" % name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
@ -262,13 +294,13 @@ class TreeBuilder(object):
return element
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
"""Create an element and insert it into the tree"""
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
@ -283,7 +315,7 @@ class TreeBuilder(object):
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
@ -291,14 +323,14 @@ class TreeBuilder(object):
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
lastTable=None
lastTable = None
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
@ -321,8 +353,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
and name != exclude):
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
@ -331,10 +363,10 @@ class TreeBuilder(object):
def getDocument(self):
    """Return the final tree (self.document)."""
    document = self.document
    return document
def getFragment(self):
    """Return the final fragment.

    A new fragmentClass instance receives the children of the first open
    element through reparentChildren.
    """
    # assert self.innerHTML
    container = self.fragmentClass()
    root = self.openElements[0]
    root.reparentChildren(container)
    return container

View File

@ -1,40 +1,38 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
from xml.dom import minidom, Node
import weakref
import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces
from . import _base
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
moduleCache = {}
def getDomModule(DomImplementation):
name = "_" + DomImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList:
class AttrList(object):
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
return list(self.element.attributes.items()).__iter__()
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
self.element.attributes.items()]
list(self.element.attributes.items())]
def keys(self):
return self.element.attributes.keys()
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
@ -43,68 +41,68 @@ def getDomBuilder(DomImplementation):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
for name, value in list(attributes.items()):
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace == None:
if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
@ -113,9 +111,9 @@ def getDomBuilder(DomImplementation):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
@ -126,7 +124,7 @@ def getDomBuilder(DomImplementation):
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
@ -134,153 +132,96 @@ def getDomBuilder(DomImplementation):
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=data
if parent <> self:
data = data
if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
implementation = DomImplementation
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
(' ' * indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
element.namespaceURI is not None):
name = "%s %s" % (constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
rv.append("|%s<%s>" % (' ' * indent, name))
if element.hasAttributes():
i = 0
attr = element.attributes.item(i)
while attr:
attributes = []
for i in range(len(element.attributes)):
attr = element.attributes.item(i)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], attr.localName)
name = "%s %s" % (constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
i += 1
attr = element.attributes.item(i)
attributes.append((name, value))
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
    """Replay the DOM subtree rooted at ``node`` as SAX events on ``handler``.

    ``nsmap`` maps namespace prefixes to URIs currently in scope. When it is
    empty (falsy), elements are reported with the non-namespace callbacks
    ``startElement``/``endElement``; otherwise the ``*NS`` callbacks are used
    and namespace declarations found on an element are reported through
    ``startPrefixMapping``/``endPrefixMapping``.

    Text and CDATA nodes produce ``characters``; a document node is wrapped
    in ``startDocument``/``endDocument``; a document fragment only yields its
    children. All other node types are ignored.

    The default ``nsmap`` is never mutated in place: it is copied before any
    prefix is added, so sharing it between calls is safe.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
            handler.startElement(node.nodeName, node.attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElement(node.nodeName)
        else:
            attributes = dict(node.attributes.itemsNS())

            # gather namespace declarations
            prefixes = []
            for attrname in node.attributes.keys():
                attr = node.getAttributeNode(attrname)
                if (attr.namespaceURI == XMLNS_NAMESPACE or
                        (attr.namespaceURI is None and
                         attr.nodeName.startswith('xmlns'))):
                    # NOTE(review): for "xmlns:foo" this reports the whole
                    # node name as the prefix rather than "foo" -- behaviour
                    # kept as-is; confirm whether that is intended.
                    prefix = attr.nodeName if attr.nodeName != 'xmlns' else None
                    handler.startPrefixMapping(prefix, attr.nodeValue)
                    prefixes.append(prefix)
                    nsmap = nsmap.copy()
                    nsmap[prefix] = attr.nodeValue
                    del attributes[(attr.namespaceURI, attr.nodeName)]

            # apply namespace declarations
            for attrname in node.attributes.keys():
                attr = node.getAttributeNode(attrname)
                if attr.namespaceURI is None and ':' in attr.nodeName:
                    prefix = attr.nodeName.split(':')[0]
                    # ``dict.has_key`` is gone on Python 3; ``in`` works on both.
                    if prefix in nsmap:
                        del attributes[(attr.namespaceURI, attr.nodeName)]
                        attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

            # SAX events
            ns = node.namespaceURI or nsmap.get(None, None)
            handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElementNS((ns, node.nodeName), node.nodeName)
            for prefix in prefixes:
                handler.endPrefixMapping(prefix)

    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
        handler.characters(node.nodeValue)

    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)
        handler.endDocument()

    elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)

    else:
        # ATTRIBUTE_NODE
        # ENTITY_NODE
        # PROCESSING_INSTRUCTION_NODE
        # COMMENT_NODE
        # DOCUMENT_TYPE_NODE
        # NOTATION_NODE
        pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value
# The actual means to get a module!
getDomModule = moduleFactoryFactory(getDomBuilder)

View File

@ -1,28 +1,21 @@
import new
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
from . import _base
from .. import ihatexml
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a module object exposing the tree builder for an ElementTree implementation.

    The module is named ``"_<implementation name>builder"`` and is populated
    with everything returned by
    ``getETreeBuilder(ElementTreeImplementation, fullTree)``.

    Results are memoised in ``moduleCache``. The cache key includes
    ``fullTree`` because the generated builder differs depending on it;
    keying on the name alone would hand back a builder created for the other
    setting.
    """
    # Local import: the legacy ``new`` module (``new.module``) was removed in
    # Python 3; ``types.ModuleType`` is the same constructor on Python 2.
    import types

    name = "_" + ElementTreeImplementation.__name__ + "builder"
    cacheKey = (name, bool(fullTree))
    if cacheKey in moduleCache:
        return moduleCache[cacheKey]

    mod = types.ModuleType(name)
    mod.__dict__.update(getETreeBuilder(ElementTreeImplementation, fullTree))
    moduleCache[cacheKey] = mod
    return mod
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node):
def __init__(self, name, namespace=None):
self._name = name
@ -41,16 +34,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
etree_tag = "{%s}%s" % (namespace, name)
return etree_tag
def _setName(self, name):
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
@ -61,81 +54,82 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
# Delete existing attributes first
# XXX - there may be a better way to do this...
for key in list(self._element.attrib.keys()):
del self._element.attrib[key]
for key, value in attributes.iteritems():
for key, value in attributes.items():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
name = "{%s}%s" % (key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
return bool(self._element.text or len(self._element))
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
index = list(self._element).index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
node.parent = None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = Element(self.name, self.namespace)
for name, value in self.attributes.iteritems():
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.items():
element.attributes[name] = value
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
@ -146,60 +140,60 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self.parent = None
self._childNodes = []
self._flags = []
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get(u"publicId", "")
return self._element.get("publicId", "")
def _setPublicId(self, value):
if value is not None:
self._element.set(u"publicId", value)
self._element.set("publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get(u"systemId", "")
return self._element.get("systemId", "")
def _setSystemId(self, value):
if value is not None:
self._element.set(u"systemId", value)
self._element.set("systemId", value)
systemId = property(_getSystemId, _setSystemId)
class Document(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_ROOT>")
Element.__init__(self, "DOCUMENT_ROOT")
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
Element.__init__(self, "DOCUMENT_FRAGMENT")
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
element = element.getroot()
@ -207,19 +201,23 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif type(element.tag) == type(ElementTree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
@ -227,103 +225,113 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
name = "%s %s" % (prefix, name)
rv.append("|%s<%s>" % (' ' * indent, name))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
attr_string = "%s %s" % (prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element.getchildren():
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string

    Accepts either an element or an ``ElementTree`` wrapper (its root is
    used). Doctype pseudo-elements (tag ``"<!DOCTYPE>"``), the document
    pseudo-root (tag ``"DOCUMENT_ROOT"``) and comments get dedicated
    renderings; anything else is written as an ordinary element with its
    attributes, text, children and tail.

    Raises ``TypeError`` if the document pseudo-root carries a tail or
    attributes.
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element):
        if isinstance(element, ElementTree.ElementTree):
            element = element.getroot()

        if element.tag == "<!DOCTYPE>":
            if element.get("publicId") or element.get("systemId"):
                publicId = element.get("publicId") or ""
                systemId = element.get("systemId") or ""
                rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
                          (element.text, publicId, systemId))
            else:
                rv.append("<!DOCTYPE %s>" % (element.text,))
        elif element.tag == "DOCUMENT_ROOT":
            if element.text is not None:
                rv.append(element.text)
            if element.tail is not None:
                raise TypeError("Document node cannot have tail")
            if hasattr(element, "attrib") and len(element.attrib):
                raise TypeError("Document node cannot have attributes")

            for child in element:
                serializeElement(child)

        elif element.tag == ElementTreeCommentType:
            rv.append("<!--%s-->" % (element.text,))
        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (infosetFilter.fromXmlName(element.tag),))
            else:
                attr = " ".join(["%s=\"%s\"" % (
                    infosetFilter.fromXmlName(name), value)
                    for name, value in element.attrib.items()])
                # NOTE(review): the tag name is not passed through
                # fromXmlName here or in the closing tag, unlike the
                # attribute-less branch above -- kept as-is; confirm intent.
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
implementation = ElementTreeImplementation
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._element
else:
return self.document._element.find("html")
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html" % self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)

View File

@ -1,20 +1,3 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
@ -26,12 +9,34 @@ Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
import warnings
import re
import sys
from . import _base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object):
    """Plain record holding the pieces of a doctype token."""

    def __init__(self, name, publicId, systemId):
        # Stored verbatim; no validation or coercion happens here.
        self.name = name
        self.publicId = publicId
        self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
@ -42,117 +47,126 @@ class Document(object):
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
    """Serialize an lxml tree, element or fragment list to the test format.

    Three kinds of input without a ``tag`` attribute are handled:

    * a full tree (has ``getroot``): emits ``#document``, the internal DTD if
      present, then the root and every sibling before and after it;
    * a string: emitted as quoted text (text inside a fragment);
    * any other iterable: treated as a fragment, emitting
      ``#document-fragment`` followed by each item.

    Comments and ordinary elements are rendered one per line, prefixed with
    ``|`` and indentation; attributes are emitted in sorted order. Names are
    mapped back through ``InfosetFilter.fromXmlName``.

    Returns the lines joined with newlines.
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first top-level sibling, then walk forward
                # across all of them.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            # The tail belongs to the parent's level, hence indent - 2.
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment):
rv.append("<!--%s-->"%(element.text,))
elif element.tag == comment_type:
rv.append("<!--%s-->" % (element.text,))
else:
#This is assumed to be an ordinary element
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
rv.append("<%s>" % (element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
for child in element:
serializeElement(child)
rv.append("</%s>"%(element.tag,))
rv.append("</%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
@ -160,56 +174,57 @@ def tostring(element):
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree = False):
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter()
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
self._element = element
dict.__init__(self, value)
for key, value in self.iteritems():
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = filter.coerceElement(name)
name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return filter.fromXmlName(self._name)
return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
@ -217,24 +232,23 @@ class TreeBuilder(_base.TreeBuilder):
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data)
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = filter.coerceComment(data)
data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = filter.coerceComment(data)
data = infosetFilter.coerceComment(data)
self._element.text = data
def _getData(self):
@ -244,9 +258,9 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element
self.commentClass = builder.Comment
#self.fragmentClass = builder.DocumentFragment
# self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
@ -261,13 +275,13 @@ class TreeBuilder(_base.TreeBuilder):
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
    """Return the parsed fragment as a flat list of strings and elements.

    The list is built from the first open element: its leading text (if
    any), each of its child elements in order, and its tail (if any).
    """
    fragment = []
    element = self.openElements[0]._element
    if element.text:
        fragment.append(element.text)
    # Iterating the element yields its children; ``getchildren()`` is
    # deprecated and must not be used in addition to this.
    fragment.extend(list(element))
    if element.tail:
        fragment.append(element.tail)
    return fragment
@ -277,59 +291,79 @@ class TreeBuilder(_base.TreeBuilder):
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
    """Create the document root

    Builds a tiny XML document string (optional doctype plus a placeholder
    root element), parses it with lxml, then renames the placeholder to the
    element described by ``token``. Comments seen before the root are
    inserted ahead of it. Finally the builder switches to
    ``insertCommentMain`` for subsequent comments.

    Emits ``DataLossWarning`` when the doctype system identifier contains
    both quote characters, or when the doctype name differs from the root
    element name.
    """
    # Because of the way libxml2 works, it doesn't seem to be possible to
    # alter information like the doctype after the tree has been parsed.
    # Therefore we need to use the built-in parser to create our initial
    # tree, after which we can add elements like normal
    docStr = ""
    if self.doctype:
        assert self.doctype.name
        docStr += "<!DOCTYPE %s" % self.doctype.name
        if (self.doctype.publicId is not None or
                self.doctype.systemId is not None):
            docStr += (' PUBLIC "%s" ' %
                       (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
            if self.doctype.systemId:
                sysid = self.doctype.systemId
                if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                    warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                    sysid = sysid.replace("'", 'U00027')
                # Quote with whichever delimiter the identifier does not use.
                if sysid.find("'") >= 0:
                    docStr += '"%s"' % sysid
                else:
                    docStr += "'%s'" % sysid
            else:
                docStr += "''"
        docStr += ">"
        if self.doctype.name != token["name"]:
            warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
    docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
    root = etree.fromstring(docStr)

    # Append the initial comments:
    for comment_token in self.initial_comments:
        root.addprevious(etree.Comment(comment_token["data"]))

    # Create the root document and add the ElementTree to it
    self.document = self.documentClass()
    self.document._elementTree = root.getroottree()

    # Give the root element the right name
    name = token["name"]
    namespace = token.get("namespace", self.defaultNamespace)
    if namespace is None:
        etree_tag = name
    else:
        etree_tag = "{%s}%s" % (namespace, name)
    root.tag = etree_tag

    # Add the root element to the internal child/open data structures
    root_element = self.elementClass(name, namespace)
    root_element._element = root
    self.document._childNodes.append(root_element)
    self.openElements.append(root_element)

    # Reset to the default insert comment function
    self.insertComment = self.insertCommentMain

View File

@ -1,248 +0,0 @@
import _base
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
    """Base node of the simple DOM-like tree.

    Holds a name, a parent link, an optional value and an ordered list of
    child nodes. Adjacent text nodes are merged on insertion.
    """
    type = -1

    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.childNodes = []
        self._flags = []

    def __iter__(self):
        """Yield every descendant, each child followed by its own subtree."""
        for node in self.childNodes:
            yield node
            for item in node:
                yield item

    def __unicode__(self):
        return self.name

    def toxml(self):
        raise NotImplementedError

    def printTree(self, indent=0):
        """Return an indented text dump of this node and its subtree."""
        pieces = ['\n|%s%s' % (' ' * indent, unicode(self))]
        pieces.extend(child.printTree(indent + 2) for child in self.childNodes)
        return "".join(pieces)

    def appendChild(self, node):
        """Append ``node``, merging it into a trailing text node if both are text."""
        if (isinstance(node, TextNode) and self.childNodes and
                isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
        else:
            self.childNodes.append(node)
        node.parent = self

    def insertText(self, data, insertBefore=None):
        """Insert ``data`` as text, at the end or before ``insertBefore``."""
        if insertBefore is None:
            self.appendChild(TextNode(data))
        else:
            self.insertBefore(TextNode(data), insertBefore)

    def insertBefore(self, node, refNode):
        """Insert ``node`` before ``refNode``; raises ValueError if refNode is not a child."""
        index = self.childNodes.index(refNode)
        if (isinstance(node, TextNode) and index > 0 and
                isinstance(self.childNodes[index - 1], TextNode)):
            self.childNodes[index - 1].value += node.value
        else:
            self.childNodes.insert(index, node)
        node.parent = self

    def removeChild(self, node):
        """Detach ``node``; raises ValueError if it is not a child."""
        self.childNodes.remove(node)
        node.parent = None

    def cloneNode(self):
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self.childNodes)

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        # Only Element sets ``namespace``; other node types fall back to
        # None here instead of raising AttributeError.
        namespace = getattr(self, "namespace", None)
        if namespace is None:
            return namespaces["html"], self.name
        return namespace, self.name

    nameTuple = property(getNameTuple)
class Document(Node):
    """Root node of a parsed document; it has no name of its own."""
    type = 1

    def __init__(self):
        Node.__init__(self, None)

    def __unicode__(self):
        return "#document"

    def appendChild(self, child):
        Node.appendChild(self, child)

    def toxml(self, encoding="utf-8"):
        """Serialize all children and return the result encoded as ``encoding``."""
        # The default used to be the misspelt "utf=8", which made every
        # call without an explicit encoding fail with LookupError.
        result = "".join(child.toxml() for child in self.childNodes)
        return result.encode(encoding)

    def hilite(self, encoding="utf-8"):
        """Return the highlighted markup of all children inside a ``<pre>``."""
        result = "<pre>" + "".join(child.hilite() for child in self.childNodes)
        return result.encode(encoding) + "</pre>"

    def printTree(self):
        """Return a text dump of the whole document."""
        tree = unicode(self)
        for child in self.childNodes:
            tree += child.printTree(2)
        return tree

    def cloneNode(self):
        return Document()
class DocumentFragment(Document):
    """Container node used as the root when parsing a fragment."""
    type = 2

    def cloneNode(self):
        """Return a new, empty fragment."""
        return DocumentFragment()

    def __unicode__(self):
        return "#document-fragment"
class DocumentType(Node):
    """Doctype node carrying a name plus public and system identifiers."""
    type = 3

    def __init__(self, name, publicId, systemId):
        Node.__init__(self, name)
        self.publicId = publicId
        self.systemId = systemId

    def __unicode__(self):
        # Short form when neither identifier is set (or both are empty).
        if not (self.publicId or self.systemId):
            return u"<!DOCTYPE %s>" % self.name
        public_part = self.publicId or ""
        system_part = self.systemId or ""
        return """<!DOCTYPE %s "%s" "%s">""" % (
            self.name, public_part, system_part)

    # XML serialization is the same text as the unicode form.
    toxml = __unicode__

    def hilite(self):
        """Return the doctype wrapped in highlighting markup."""
        template = '<code class="markup doctype">&lt;!DOCTYPE %s></code>'
        return template % self.name

    def cloneNode(self):
        """Return a copy with the same name and identifiers."""
        return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node):
    """Leaf node holding character data in ``value``."""
    type = 4

    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        return u'"%s"' % self.value

    def toxml(self):
        """Return the text with XML special characters escaped."""
        escaped = escape(self.value)
        return escaped

    # Highlighted output is the escaped text itself.
    hilite = toxml

    def cloneNode(self):
        """Return a new text node with the same value."""
        return TextNode(self.value)
class Element(Node):
    """Element node with an optional namespace and an attribute dict."""
    type = 5

    def __init__(self, name, namespace=None):
        Node.__init__(self, name)
        self.namespace = namespace
        self.attributes = {}

    def __unicode__(self):
        if self.namespace is None:
            return u"<%s>" % self.name
        return u"<%s %s>" % (prefixes[self.namespace], self.name)

    def toxml(self):
        """Serialize the element and its subtree as XML text."""
        result = '<' + self.name
        for name, value in self.attributes.iteritems():
            result += u' %s="%s"' % (name, escape(value, {'"': '&quot;'}))
        if self.childNodes:
            result += '>'
            result += "".join(child.toxml() for child in self.childNodes)
            result += u'</%s>' % self.name
        else:
            # Childless elements are written in self-closing form.
            result += u'/>'
        return result

    def hilite(self):
        """Return the element and its subtree as highlighted HTML markup."""
        result = '&lt;<code class="markup element-name">%s</code>' % self.name
        for name, value in self.attributes.iteritems():
            result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"': '&quot;'}))
        # Always close the start tag; previously a childless non-void
        # element was rendered without the ">" of its start tag.
        result += ">"
        if self.childNodes:
            for child in self.childNodes:
                result += child.hilite()
        elif self.name in voidElements:
            # Void elements have no end tag.
            return result
        return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name

    def printTree(self, indent):
        """Return an indented text dump of the element, attributes and children."""
        tree = '\n|%s%s' % (' ' * indent, unicode(self))
        indent += 2
        for name, value in self.attributes.iteritems():
            if isinstance(name, tuple):
                # Tuple attribute names are printed as their two parts.
                name = "%s %s" % (name[0], name[1])
            tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def cloneNode(self):
        """Return a childless copy with the same name, namespace and attributes."""
        newNode = Element(self.name)
        if hasattr(self, 'namespace'):
            newNode.namespace = self.namespace
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        return newNode
class CommentNode(Node):
    """Comment node; the comment text is kept in ``data``."""
    type = 6

    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data

    def __unicode__(self):
        return "<!-- %s -->" % self.data

    def toxml(self):
        """Return the comment as markup, without added padding."""
        return "<!--%s-->" % self.data

    def hilite(self):
        """Return the comment, escaped, wrapped in highlighting markup."""
        body = escape(self.data)
        return '<code class="markup comment">&lt;!--%s--></code>' % body

    def cloneNode(self):
        """Return a new comment node with the same text."""
        return CommentNode(self.data)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose node classes are the simple-tree types."""
    # Node classes used for each kind of node.
    documentClass = Document
    fragmentClass = DocumentFragment
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode

    def testSerializer(self, node):
        """Return the text dump produced by the node's ``printTree``."""
        dump = node.printTree()
        return dump

View File

@ -1,228 +0,0 @@
import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
    """Dict-like view over the attributes of a wrapped element.

    The element's ``attrs`` are copied into a dict on construction. Writes
    go to the element and to that copy, so later reads through the same
    AttrList see them.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs."""
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        self.element[name] = value
        # Keep the snapshot in step with the element.
        self.attrs[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
class Element(_base.Node):
    """Tree-builder node wrapping a BeautifulSoup object.

    ``element`` is the wrapped soup object, ``soup`` the owning soup and
    ``namespace`` the namespace recorded for the node.
    """

    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        """Return the position of ``refNode`` among the children, or None."""
        # Finds a node by identity rather than equality
        for index, child in enumerate(self.element.contents):
            if child is refNode.element:
                return index
        return None

    def appendChild(self, node):
        """Append ``node``, merging it into a trailing plain-text child."""
        # Exact class comparison is deliberate: only objects whose class is
        # exactly NavigableString are merged.
        if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1] + node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert ``data`` as text, at the end or before ``insertBefore``."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert ``node`` before ``refNode``, merging with a preceding text child."""
        index = self._nodeIndex(node, refNode)
        # ``index > 0`` guards the merge: at index 0 there is no preceding
        # sibling, and ``contents[index - 1]`` would wrap around to the
        # *last* child and merge the text into the wrong node.
        if (node.element.__class__ == NavigableString and index > 0
                and self.element.contents[index - 1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index - 1] + node.element)
            oldNode = self.element.contents[index - 1]
            del self.element.contents[index - 1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index - 1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach ``node`` from its parent's contents."""
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this node to the end of ``newParent``."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Return a childless copy with the same tag name and attributes."""
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers use its truthiness.
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        return self.namespace, self.name

    nameTuple = property(getNameTuple)
class TextNode(Element):
    """Node wrapping a soup string object; it has no name and cannot be cloned."""

    def __init__(self, element, soup):
        # Initialise the base node directly, with no name.
        _base.Node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces a BeautifulSoup tree."""

    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        """Start a fresh soup and return it wrapped as the document node."""
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Insert a Declaration built from the doctype token at the front of the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            declaration = '%s PUBLIC "%s" "%s"' % (name, publicId, systemId or "")
        elif systemId:
            declaration = '%s SYSTEM "%s"' % (name, systemId)
        else:
            declaration = name
        self.soup.insert(0, Declaration(declaration))

    def elementClass(self, name, namespace):
        """Return a wrapped Tag named ``name``, warning if a namespace is given."""
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        tag = Tag(self.soup, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a wrapped Comment holding ``data``."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Start a fresh soup marked as a fragment and return it wrapped."""
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        position = len(self.soup.contents)
        self.soup.insert(position, node.element)

    def testSerializer(self, element):
        # Delegates to the module-level function of the same name.
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        fragment = _base.TreeBuilder.getFragment(self)
        return fragment.element
def testSerializer(element):
    """Return an indented, line-per-node text dump of a soup tree."""
    import re
    lines = []

    def walk(node, indent=0):
        pad = ' ' * indent
        if isinstance(node, Declaration):
            doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
            m = re.compile(doctype_regexp).match(node.string)
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            # The system identifier sits in a different group depending on
            # whether the PUBLIC or the SYSTEM form matched.
            if publicId is None:
                systemId = m.group('systemId2')
            else:
                systemId = m.group('systemId1') or ""

            if publicId is None and systemId is None:
                lines.append("|%s<!DOCTYPE %s>" % (pad, name))
            else:
                lines.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                             (pad, name, publicId or "", systemId or ""))
        elif isinstance(node, BeautifulSoup):
            if node.name == "[document_fragment]":
                lines.append("#document-fragment")
            else:
                lines.append("#document")
        elif isinstance(node, Comment):
            lines.append("|%s<!-- %s -->" % (pad, node.string))
        elif isinstance(node, unicode):
            lines.append("|%s\"%s\"" % (pad, node))
        else:
            lines.append("|%s<%s>" % (pad, node.name))
            if node.attrs:
                attr_pad = ' ' * (indent + 2)
                for name, value in node.attrs:
                    lines.append('|%s%s="%s"' % (attr_pad, name, value))
        # Children are dumped one level deeper.
        if hasattr(node, "contents"):
            for child in node.contents:
                walk(child, indent + 2)

    walk(element, 0)
    return "\n".join(lines)

View File

@ -8,23 +8,27 @@ implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
from __future__ import absolute_import, division, unicode_literals
import sys
from ..utils import default_etree
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree" and "beautifulsoup"
values are:
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream
"etree" - A generic walker for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"lxml" - Optimized walker for lxml.etree
"beautifulsoup" - Beautiful soup (if installed)
"genshi" - a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module
@ -33,20 +37,21 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom", "simpletree"):
mod = __import__(treeType, globals())
if treeType in ("dom", "pulldom"):
name = "%s.%s" % (__name__, treeType)
__import__(name)
mod = sys.modules[name]
treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi":
import genshistream
from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
elif treeType == "beautifulsoup":
import soup
treeWalkerCache[treeType] = soup.TreeWalker
elif treeType == "lxml":
import lxmletree
from . import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
elif treeType == "etree":
import etree
from . import etree
if implementation is None:
implementation = default_etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)

View File

@ -1,8 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
    """Convert ``s`` to text via six.text_type.

    None becomes the empty string, or stays None when ``blank_if_none``
    is false. Values that are already text are returned unchanged.
    """
    if s is not None:
        return s if isinstance(s, text_type) else text_type(s)
    return "" if blank_if_none else None
def is_text_or_none(string):
    """Return True when ``string`` is None or an instance of six.string_types."""
    if string is None:
        return True
    return isinstance(string, string_types)
class TreeWalker(object):
def __init__(self, tree):
@ -14,36 +46,50 @@ class TreeWalker(object):
def error(self, msg):
return {"type": "SerializeError", "data": msg}
def normalizeAttrs(self, attrs):
if not attrs:
attrs = []
elif hasattr(attrs, 'items'):
attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
yield {"type": "EmptyTag", "name": to_text(name, False),
"namespace": to_text(namespace),
"data": attrs}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, namespace, name, attrs):
return {"type": "StartTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
return {"type": "StartTag",
"name": text_type(name),
"namespace": to_text(namespace),
"data": dict(((to_text(namespace, False), to_text(name)),
to_text(value, False))
for (namespace, name), value in attrs.items())}
def endTag(self, namespace, name):
    """Return an "EndTag" token for ``name`` in ``namespace``.

    ``namespace`` may be None; ``name`` must be a string. The token's
    "data" entry is always an empty dict.
    """
    assert namespace is None or isinstance(namespace, string_types), type(namespace)
    # Report the type of the value actually being checked.
    assert isinstance(name, string_types), type(name)

    return {"type": "EndTag",
            "name": to_text(name, False),
            "namespace": to_text(namespace),
            "data": {}}
def text(self, data):
data = unicode(data)
assert isinstance(data, string_types), type(data)
data = to_text(data)
middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)]
left = data[:len(data) - len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
@ -55,52 +101,40 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
return {"type": "Comment", "data": unicode(data)}
assert isinstance(data, string_types), type(data)
return {"type": "Comment", "data": text_type(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
assert is_text_or_none(name), type(name)
assert is_text_or_none(publicId), type(publicId)
assert is_text_or_none(systemId), type(systemId)
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
"publicId": publicId,
"systemId": systemId,
"correct": correct}
"name": to_text(name),
"publicId": to_text(publicId),
"systemId": to_text(systemId),
"correct": to_text(correct)}
def entity(self, name):
assert isinstance(name, string_types), type(name)
return {"type": "Entity", "name": text_type(name)}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
@ -110,7 +144,6 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
endTag = None
if type == DOCTYPE:
yield self.doctype(*details)
@ -122,28 +155,30 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:

View File

@ -1,10 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
from . import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
@ -15,8 +17,15 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
node.attributes.items(), node.hasChildNodes)
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
if attr.namespaceURI:
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue

View File

@ -1,30 +1,28 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections import OrderedDict
except ImportError:
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
import gettext
_ = gettext.gettext
import new
import copy
import re
import _base
from html5lib.constants import voidElements
from six import text_type
from . import _base
from ..utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given the particular ElementTree representation, this implementation,
@ -32,16 +30,16 @@ def getETreeBuilder(ElementTreeImplementation):
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
@ -51,33 +49,41 @@ def getETreeBuilder(ElementTreeImplementation):
if not(hasattr(node, "tag")):
node = node.getroot()
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text,
return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment):
elif node.tag == ElementTreeCommentType:
return _base.COMMENT, node.text
else:
#This is assumed to be an ordinary element
assert type(node.tag) == text_type, type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, tag,
node.attrib.items(), len(node) or node.text)
attrs = OrderedDict()
for name, value in list(node.attrib.items()):
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
@ -88,13 +94,13 @@ def getETreeBuilder(ElementTreeImplementation):
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
@ -105,16 +111,16 @@ def getETreeBuilder(ElementTreeImplementation):
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
return parents[-1][key + 1], key + 1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
@ -128,3 +134,5 @@ def getETreeBuilder(ElementTreeImplementation):
return parent, list(parents[-1]).index(parent), parents, None
return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)

View File

@ -1,50 +1,49 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
import _base
from . import _base
from ..constants import voidElements, namespaces
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
depth = 0
ignore_until = None
# Buffer the events so we can pass in the following one
previous = None
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
if ignore_until <= depth:
ignore_until = None
if ignore_until is None:
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = depth
if previous[0] == END:
depth -= 1
previous = event
if previous is not None:
if ignore_until is None or ignore_until <= depth:
for token in self.tokens(previous, None):
for token in self.tokens(previous, event):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
previous = event
# Don't forget the final event!
if previous is not None:
for token in self.tokens(previous, None):
yield token
def tokens(self, event, next):
kind, data, pos = event
if kind == START:
tag, attrib = data
tag, attribs = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END
converted_attribs = {}
for k, v in attribs:
if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, list(attrib))
yield self.startTag(namespace, name, converted_attribs)
elif kind == END:
name = data.localname
@ -62,8 +61,8 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI):
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
START_CDATA, END_CDATA, PI):
pass
else:

View File

@ -1,22 +1,35 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from ..treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from . import _base
from .. import ihatexml
def ensure_str(s):
    """Return ``s`` as text.

    None and values that are already text pass through unchanged; anything
    else is decoded as strict UTF-8.
    """
    if s is None or isinstance(s, text_type):
        return s
    return s.decode("utf-8", "strict")
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name,
et.docinfo.public_id,
et.docinfo.system_url))
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
root = et.getroot()
node = root
@ -28,7 +41,7 @@ class Root(object):
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
@ -38,19 +51,21 @@ class Root(object):
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
@ -59,23 +74,27 @@ class FragmentRoot(Root):
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = self.obj.text
self.text = ensure_str(self.obj.text)
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
self.isstring = isinstance(obj, basestring)
self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
# Support for bytes here is Py2
if self.isstring:
self.obj = ensure_str(self.obj)
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
@ -87,7 +106,7 @@ class FragmentWrapper(object):
def __getitem__(self, key):
return self.obj[key]
def __nonzero__(self):
def __bool__(self):
return bool(self.obj)
def getparent(self):
@ -96,10 +115,13 @@ class FragmentWrapper(object):
def __str__(self):
return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
@ -108,11 +130,12 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key)
return _base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (_base.DOCUMENT,)
@ -121,23 +144,33 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node
return _base.TEXT, node.obj
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
return _base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(ensure_str(node.tag))
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
[(self.filter.fromXmlName(name), value) for
name,value in node.attrib.iteritems()],
len(node) > 0 or node.text)
tag = ensure_str(node.tag)
attrs = {}
for name, value in list(node.attrib.items()):
name = ensure_str(name)
value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
@ -149,7 +182,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
@ -159,13 +192,13 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return node[0]
else:
return None
else: # tail
else: # tail
return node.getnext()
return node.tail and (node, "tail") or node.getnext()
return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":

View File

@ -1,9 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base
from . import _base
from ..constants import voidElements
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
@ -11,7 +14,7 @@ class TreeWalker(_base.TreeWalker):
previous = None
for event in self.tree:
if previous is not None and \
(ignore_until is None or previous[1] is ignore_until):
(ignore_until is None or previous[1] is ignore_until):
if previous[1] is ignore_until:
ignore_until = None
for token in self.tokens(previous, event):
@ -30,14 +33,18 @@ class TreeWalker(_base.TreeWalker):
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI, attr.localName)] = attr.value
if name in voidElements:
for token in self.emptyTag(namespace,
name,
node.attributes.items(),
attrs,
not next or next[1] is not node):
yield token
else:
yield self.startTag(namespace, name, node.attributes.items())
yield self.startTag(namespace, name, attrs)
elif type == END_ELEMENT:
name = node.nodeName

View File

@ -1,72 +0,0 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker for simpletree documents.

    Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Return the ``_base`` details tuple describing *node*.

        *node* is either a bare node (the root) or a
        ``(parent, idx, parents)`` tuple as described on the class.
        """
        if isinstance(node, tuple):  # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2):  # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3:  # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.type == 4:  # TextNode
            return _base.TEXT, node.value

        elif node.type == 5:  # Element
            return (_base.ELEMENT, node.namespace, node.name,
                    node.attributes.items(), node.hasContent())

        elif node.type == 6:  # CommentNode
            return _base.COMMENT, node.data

        else:
            # This branch used to reference ``_node.UNKNOWN``; no ``_node``
            # name exists in this module, so any unrecognised node type
            # raised NameError instead of being reported as unknown.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        """Return the tuple addressing the first child of *node*.

        When *node* is already a tuple, its ``(parent, idx)`` pair is pushed
        onto the shared ancestor stack before descending.
        """
        if isinstance(node, tuple):  # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        """Return the tuple for the next sibling, or None if *node* is last."""
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        """Return the tuple for the parent, popping it off the ancestor stack.

        Returns None once the ancestor stack is empty.
        """
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None

View File

@ -1,59 +0,0 @@
import re
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker that reports BeautifulSoup nodes as ``_base`` details."""

    doctype_regexp = re.compile(
        r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')

    def getNodeDetails(self, node):
        """Classify *node* and return the matching ``_base`` details tuple."""
        if isinstance(node, BeautifulSoup):  # Document or DocumentFragment
            return (_base.DOCUMENT,)

        if isinstance(node, Declaration):  # DocumentType
            text = unicode(node.string)
            # Some BeautifulSoup/Python versions add the surrounding markup
            # during unicode conversion; strip it when present.
            if text.startswith('<!') and text.endswith('>'):
                text = text[2:-1]

            # Pulling the doctype apart with a regexp is fragile, but
            # BeautifulSoup stores it as a single string and the separate
            # parts are needed here. It should hold as long as the tree was
            # built by html5lib itself and has not been modified since.
            # Feeding the string to an html5lib tokenizer would be an
            # alternative.
            match = self.doctype_regexp.match(text)
            assert match is not None, "DOCTYPE did not match expected format"

            public_id = match.group('publicId')
            system_group = 'systemId2' if public_id is None else 'systemId1'
            system_id = match.group(system_group)
            return (_base.DOCTYPE, match.group('name'),
                    public_id or "", system_id or "")

        if isinstance(node, Comment):
            text = unicode(node.string)
            if text.startswith('<!--') and text.endswith('-->'):
                text = text[4:-3]
            return _base.COMMENT, text

        if isinstance(node, unicode):  # TextNode
            return _base.TEXT, node

        if isinstance(node, Tag):  # Element
            attributes = dict(node.attrs).items()
            return (_base.ELEMENT, namespaces["html"], node.name,
                    attributes, node.contents)

        return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        """Return the first entry of *node*'s contents."""
        children = node.contents
        return children[0]

    def getNextSibling(self, node):
        """Return the sibling that follows *node*."""
        return node.nextSibling

    def getParentNode(self, node):
        """Return the parent of *node*."""
        return node.parent

View File

@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie
# Export the datrie-backed implementation when its module can be imported;
# otherwise export the pure-Python implementation.
try:
    from .datrie import Trie as DATrie
except ImportError:
    Trie = PyTrie
else:
    Trie = DATrie

View File

@ -0,0 +1,37 @@
from __future__ import absolute_import, division, unicode_literals
from collections import Mapping
class Trie(Mapping):
    """Abstract base class for tries.

    Subclasses provide the ``Mapping`` primitives (``__getitem__``,
    ``__iter__`` and ``__len__``); the prefix helpers defined here are
    generic implementations built on top of those.
    """

    def keys(self, prefix=None):
        """Return the set of keys, limited to those starting with *prefix*.

        With ``prefix=None`` every key is returned.
        """
        # Spell out the arguments to super(): the zero-argument form only
        # exists on Python 3 and raises TypeError on Python 2, which this
        # module still supports (see the __future__ import).
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        # Python 2.6: no set comprehensions
        return set([x for x in keys if x.startswith(prefix)])

    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        return any(key.startswith(prefix) for key in self.keys())

    def longest_prefix(self, prefix):
        """Return the longest key that is a prefix of *prefix*.

        Raises KeyError if no key (including the empty string) qualifies.
        """
        if prefix in self:
            return prefix

        # Drop one trailing character at a time, down to the empty string.
        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        """Return ``(key, value)`` for the longest key prefixing *prefix*.

        Raises KeyError under the same conditions as ``longest_prefix``.
        """
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
    """Trie that delegates storage and prefix queries to a ``DATrie``."""

    def __init__(self, data):
        """Build the trie from the mapping *data*.

        Raises TypeError if any key is not a text string.
        """
        # The DATrie constructor is handed every character that occurs in
        # any key, so gather them while validating the key types.
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            alphabet.update(key)

        backing = DATrie("".join(alphabet))
        self._data = backing
        for key, value in data.items():
            backing[key] = value

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        # Iteration is not implemented for this backend.
        raise NotImplementedError()

    def __getitem__(self, key):
        return self._data[key]

    # The prefix queries below are forwarded unchanged to the DATrie.

    def keys(self, prefix=None):
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        return self._data.longest_prefix_item(prefix)

67
src/html5lib/trie/py.py Normal file
View File

@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from bisect import bisect_left
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
    """Pure-Python trie emulation: a dict plus a sorted list of its keys.

    Prefix queries bisect the sorted key list. The result range of the most
    recent ``keys(prefix)`` call is cached so that a later query for a
    longer prefix only has to search inside that range.
    """

    def __init__(self, data):
        """Wrap the mapping *data*.

        Raises TypeError if any key is not a text string.
        """
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        # Last prefix passed to keys() and the [start, end) slice of
        # self._keys holding its matches. The empty prefix matches all keys.
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        """Return the set of keys starting with *prefix* (all keys if None/"")."""
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            # Matches can only lie inside the cached range.
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        if start == len(self._keys):
            return keys

        # Stop at the end of the list. Without the bound, a run of matches
        # that reaches the last key indexed past the end and raised
        # IndexError.
        total = len(self._keys)
        while i < total and self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        # The first key sorting at or after *prefix* is the only candidate.
        return self._keys[i].startswith(prefix)

View File

@ -1,9 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
try:
frozenset
except NameError:
#Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
class MethodDispatcher(dict):
"""Dict with 2 special properties:
@ -23,7 +30,7 @@ class MethodDispatcher(dict):
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name,value in items:
for name, value in items:
if type(name) in (list, tuple, frozenset, set):
for item in name:
_dictEntries.append((item, value))
@ -35,122 +42,41 @@ class MethodDispatcher(dict):
def __getitem__(self, key):
return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def __init__(self, iterable=(), maxsize=-1):
if not hasattr(self, 'data'):
self.left = self.right = 0
self.data = {}
self.maxsize = maxsize
self.extend(iterable)
def isSurrogatePair(data):
    """Return True if *data* is exactly a high surrogate followed by a low one.

    The first item must fall in U+D800..U+DBFF and the second in
    U+DC00..U+DFFF; any other length or content gives False.
    """
    if len(data) != 2:
        return False
    lead, trail = data[0], data[1]
    return (0xD800 <= ord(lead) <= 0xDBFF and
            0xDC00 <= ord(trail) <= 0xDFFF)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self):
self.data.clear()
self.left = self.right = 0
def surrogatePairToCodepoint(data):
    """Combine the surrogate pair in *data* into a single code point value.

    ``data[0]`` is taken as the high surrogate and ``data[1]`` as the low
    surrogate; no range checking is done here.
    """
    high_offset = ord(data[0]) - 0xD800
    low_offset = ord(data[1]) - 0xDC00
    return 0x10000 + high_offset * 0x400 + low_offset
def extend(self, iterable):
for elem in iterable:
self.append(elem)
# Module Factory Factory (no, this isn't Java, I know)
# Here to stop this being duplicated all over the place.
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1):
if self:
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def moduleFactoryFactory(factory):
moduleCache = {}
def __getitem__(self, i):
if i < 0:
i += len(self)
try:
return self.data[i + self.left]
except KeyError:
raise IndexError
def moduleFactory(baseModule, *args, **kwargs):
if isinstance(ModuleType.__name__, type("")):
name = "_%s_factory" % baseModule.__name__
else:
name = b"_%s_factory" % baseModule.__name__
def __setitem__(self, i, value):
if i < 0:
i += len(self)
try:
self.data[i + self.left] = value
except KeyError:
raise IndexError
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType(name)
objs = factory(baseModule, *args, **kwargs)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def __delitem__(self, i):
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
return moduleFactory