Update HTML 5 parser used in calibre (html5lib-python)

This commit is contained in:
Kovid Goyal 2013-10-23 11:04:05 +05:30
parent b4bf871077
commit b9421065f9
46 changed files with 7609 additions and 8932 deletions

View File

@ -562,9 +562,9 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
return check(chr(num).decode(encoding)) return check(chr(num).decode(encoding))
except UnicodeDecodeError: except UnicodeDecodeError:
return check(my_unichr(num)) return check(my_unichr(num))
from calibre.utils.html5_entities import entity_map from html5lib.constants import entities
try: try:
return check(entity_map[ent]) return check(entities[ent])
except KeyError: except KeyError:
pass pass
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint

View File

@ -81,11 +81,14 @@ def node_depth(node):
return ans return ans
def html5_parse(data, max_nesting_depth=100): def html5_parse(data, max_nesting_depth=100):
import html5lib import html5lib, warnings
# html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195 from html5lib.constants import cdataElements, rcdataElements
data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I) # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
data = html5lib.parse(data, treebuilder='lxml').getroot() with warnings.catch_warnings():
warnings.simplefilter('ignore')
data = html5lib.parse(data, treebuilder='lxml').getroot()
# Check that the asinine HTML 5 algorithm did not result in a tree with # Check that the asinine HTML 5 algorithm did not result in a tree with
# insane nesting depths # insane nesting depths

View File

@ -7,6 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml import etree from lxml import etree
from html5lib.constants import cdataElements, rcdataElements
from calibre.ebooks.oeb.polish.tests.base import BaseTest from calibre.ebooks.oeb.polish.tests.base import BaseTest
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
@ -18,7 +19,7 @@ def nonvoid_cdata_elements(test, parse_function):
markup = ''' markup = '''
<html> <head><{0}/></head> <body id="test"> </html> <html> <head><{0}/></head> <body id="test"> </html>
''' '''
for tag in ('title', 'style', 'script', 'textarea'): for tag in cdataElements | rcdataElements:
for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '): for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
root = parse_function(markup.format(x)) root = parse_function(markup.format(x))
test.assertEqual( test.assertEqual(

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
""" """
HTML parsing library based on the WHATWG "HTML5" HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that HTML found in the wild and implements well-defined error recovery that
@ -8,10 +8,16 @@ Example usage:
import html5lib import html5lib
f = open("my_document.html") f = open("my_document.html")
tree = html5lib.parse(f) tree = html5lib.parse(f)
""" """
__version__ = "0.90"
from html5parser import HTMLParser, parse, parseFragment from __future__ import absolute_import, division, unicode_literals
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker from .html5parser import HTMLParser, parse, parseFragment
from serializer import serialize from .treebuilders import getTreeBuilder
from .treewalkers import getTreeWalker
from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
__version__ = "0.999-dev"

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
class Filter(object): class Filter(object):
def __init__(self, source): def __init__(self, source):

View File

@ -0,0 +1,20 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
class Filter(_base.Filter):
    """Token filter that puts every tag's attributes into sorted key order."""

    def __iter__(self):
        """Yield each upstream token, rewriting tag attributes in sorted order.

        For ``StartTag`` and ``EmptyTag`` tokens, ``token["data"]`` is replaced
        by an ``OrderedDict`` holding the same items sorted by key.  Every
        other token is passed through unchanged.
        """
        tag_types = ("StartTag", "EmptyTag")
        for tok in _base.Filter.__iter__(self):
            if tok["type"] in tag_types:
                # Inserting the already-sorted pairs fixes the iteration order.
                sorted_pairs = sorted(tok["data"].items(),
                                      key=lambda pair: pair[0])
                tok["data"] = OrderedDict(sorted_pairs)
            yield tok

View File

@ -1,127 +0,0 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
# Token filter that pre-fills HTML form controls (<input>, <textarea>,
# <select>/<option>) with values obtained from a field-storage object.
# NOTE(review): indentation is not visible in this view, so any nesting
# described in the comments below is inferred from statement order -- confirm
# against the real file before relying on it.
class SimpleFilter(_base.Filter):
# source is handed to _base.Filter unchanged; fieldStorage must provide
# getlist(name) (presumably a cgi.FieldStorage-like object -- TODO confirm).
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
# Yields the upstream tokens with form-control attributes/contents rewritten.
# Attribute data is treated as a list of (name, value) pairs throughout.
def __iter__(self):
# field name -> index of the next unused entry in getlist(field name)
field_indices = {}
# NOTE(review): `state` is never read again in this block.
state = None
field_name = None
# NOTE(review): field_type is not initialised here, yet it is read at the
# `elif field_type == "select"` / `elif field_type is not None` tests below;
# looks like a possible UnboundLocalError if no input/textarea/select start
# tag has been seen first -- confirm.
for token in _base.Filter.__iter__(self):
# NOTE: `type` shadows the builtin for the rest of the loop body.
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
# -1 means "attribute not present"
input_value_index = -1
input_checked_index = -1
# Record the stripped name/type values and the positions of the
# checked/value attributes.
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
# Pick the next unused submitted value for this field name, or "" when
# the list is exhausted.
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
# NOTE(review): input_value_index is still -1 when no value attribute
# was found, so this would index the last attribute instead -- confirm.
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
# Overwrite an existing value attribute, otherwise add one.
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
# Reversing before dict() lets the first of any duplicated attribute win.
# Raises KeyError when there is no "name" attribute (the select branch
# below uses .get instead); the name is not lower-cased here.
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
# Mark a matching option as selected (adding the attribute if absent);
# otherwise drop an existing selected attribute.
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
# Emit the replacement text as a Characters token.
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
# NOTE(review): only field_name is cleared here; field_type keeps its
# value -- confirm that the textarea test further down is meant to keep
# matching afterwards.
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
# Presumably discards the textarea's original content -- TODO confirm nesting.
elif field_type == "textarea":
continue # ignore token
yield token

View File

@ -1,4 +1,7 @@
import _base from __future__ import absolute_import, division, unicode_literals
from . import _base
class Filter(_base.Filter): class Filter(_base.Filter):
def __init__(self, source, encoding): def __init__(self, source, encoding):
@ -18,29 +21,28 @@ class Filter(_base.Filter):
elif type == "EmptyTag": elif type == "EmptyTag":
if token["name"].lower() == "meta": if token["name"].lower() == "meta":
# replace charset with actual encoding # replace charset with actual encoding
has_http_equiv_content_type = False has_http_equiv_content_type = False
content_index = -1 for (namespace, name), value in token["data"].items():
for i,(name,value) in enumerate(token["data"]): if namespace is not None:
if name.lower() == 'charset': continue
token["data"][i] = (u'charset', self.encoding) elif name.lower() == 'charset':
meta_found = True token["data"][(namespace, name)] = self.encoding
break meta_found = True
elif name == 'http-equiv' and value.lower() == 'content-type': break
has_http_equiv_content_type = True elif name == 'http-equiv' and value.lower() == 'content-type':
elif name == 'content': has_http_equiv_content_type = True
content_index = i else:
else: if has_http_equiv_content_type and (None, "content") in token["data"]:
if has_http_equiv_content_type and content_index >= 0: token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding) meta_found = True
meta_found = True
elif token["name"].lower() == "head" and not meta_found: elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head # insert meta into empty head
yield {"type": "StartTag", "name": "head", yield {"type": "StartTag", "name": "head",
"data": token["data"]} "data": token["data"]}
yield {"type": "EmptyTag", "name": "meta", yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]} "data": {(None, "charset"): self.encoding}}
yield {"type": "EndTag", "name": "head"} yield {"type": "EndTag", "name": "head"}
meta_found = True meta_found = True
continue continue
@ -51,7 +53,7 @@ class Filter(_base.Filter):
yield pending.pop(0) yield pending.pop(0)
if not meta_found: if not meta_found:
yield {"type": "EmptyTag", "name": "meta", yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]} "data": {(None, "charset"): self.encoding}}
while pending: while pending:
yield pending.pop(0) yield pending.pop(0)
meta_found = True meta_found = True

View File

@ -1,13 +1,18 @@
from __future__ import absolute_import, division, unicode_literals
from gettext import gettext from gettext import gettext
_ = gettext _ = gettext
import _base from . import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements from ..constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters from ..constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
class LintError(Exception):
pass
class LintError(Exception): pass
class Filter(_base.Filter): class Filter(_base.Filter):
def __iter__(self): def __iter__(self):
@ -18,24 +23,24 @@ class Filter(_base.Filter):
if type in ("StartTag", "EmptyTag"): if type in ("StartTag", "EmptyTag"):
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name) raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
if not isinstance(name, unicode): if not isinstance(name, str):
raise LintError(_(u"Tag name is not a string: %r") % name) raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
if not name: if not name:
raise LintError(_(u"Empty tag name")) raise LintError(_("Empty tag name"))
if type == "StartTag" and name in voidElements: if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name) raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
elif type == "EmptyTag" and name not in voidElements: elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"]) raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
if type == "StartTag": if type == "StartTag":
open_elements.append(name) open_elements.append(name)
for name, value in token["data"]: for name, value in token["data"]:
if not isinstance(name, unicode): if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %r") % name) raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
if not name: if not name:
raise LintError(_(u"Empty attribute name")) raise LintError(_("Empty attribute name"))
if not isinstance(value, unicode): if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %r") % value) raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
if name in cdataElements: if name in cdataElements:
contentModelFlag = "CDATA" contentModelFlag = "CDATA"
elif name in rcdataElements: elif name in rcdataElements:
@ -45,15 +50,15 @@ class Filter(_base.Filter):
elif type == "EndTag": elif type == "EndTag":
name = token["name"] name = token["name"]
if not isinstance(name, unicode): if not isinstance(name, str):
raise LintError(_(u"Tag name is not a string: %r") % name) raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
if not name: if not name:
raise LintError(_(u"Empty tag name")) raise LintError(_("Empty tag name"))
if name in voidElements: if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name) raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
start_name = open_elements.pop() start_name = open_elements.pop()
if start_name != name: if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name)) raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
contentModelFlag = "PCDATA" contentModelFlag = "PCDATA"
elif type == "Comment": elif type == "Comment":
@ -62,27 +67,27 @@ class Filter(_base.Filter):
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
data = token["data"] data = token["data"]
if not isinstance(data, unicode): if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %r") % data) raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
if not data: if not data:
raise LintError(_(u"%s token with empty data") % type) raise LintError(_("%(type)s token with empty data") % {"type": type})
if type == "SpaceCharacters": if type == "SpaceCharacters":
data = data.strip(spaceCharacters) data = data.strip(spaceCharacters)
if data: if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data) raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
elif type == "Doctype": elif type == "Doctype":
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name) raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
if not isinstance(name, unicode): if not isinstance(name, str):
raise LintError(_(u"Tag name is not a string: %r") % name) raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
# XXX: what to do with token["data"] ? # XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"): elif type in ("ParseError", "SerializeError"):
pass pass
else: else:
raise LintError(_(u"Unknown token type: %s") % type) raise LintError(_("Unknown token type: %(type)s") % {"type": type})
yield token yield token

View File

@ -1,4 +1,7 @@
import _base from __future__ import absolute_import, division, unicode_literals
from . import _base
class Filter(_base.Filter): class Filter(_base.Filter):
def slider(self): def slider(self):
@ -14,8 +17,8 @@ class Filter(_base.Filter):
for previous, token, next in self.slider(): for previous, token, next in self.slider():
type = token["type"] type = token["type"]
if type == "StartTag": if type == "StartTag":
if (token["data"] or if (token["data"] or
not self.is_optional_start(token["name"], previous, next)): not self.is_optional_start(token["name"], previous, next)):
yield token yield token
elif type == "EndTag": elif type == "EndTag":
if not self.is_optional_end(token["name"], next): if not self.is_optional_end(token["name"], next):
@ -73,7 +76,7 @@ class Filter(_base.Filter):
# omit the thead and tfoot elements' end tag when they are # omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end. # immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \ if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'): previous['name'] in ('tbody', 'thead', 'tfoot'):
return False return False
return next["name"] == 'tr' return next["name"] == 'tr'
else: else:
@ -121,10 +124,10 @@ class Filter(_base.Filter):
# there is no more content in the parent element. # there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"): if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside', return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog', 'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer', 'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol', 'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul') 'p', 'pre', 'section', 'table', 'ul')
else: else:
return type == "EndTag" or type is None return type == "EndTag" or type is None

View File

@ -1,8 +1,12 @@
import _base from __future__ import absolute_import, division, unicode_literals
from html5lib.sanitizer import HTMLSanitizerMixin
from . import _base
from ..sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Token filter that runs every token through ``sanitize_token``."""

    def __iter__(self):
        """Yield the sanitized form of each upstream token.

        Tokens for which ``sanitize_token`` returns a falsy value are dropped.
        """
        for raw_token in _base.Filter.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned

View File

@ -1,16 +1,13 @@
try: from __future__ import absolute_import, division, unicode_literals
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import re import re
import _base from . import _base
from html5lib.constants import rcdataElements, spaceCharacters from ..constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
class Filter(_base.Filter): class Filter(_base.Filter):
@ -21,7 +18,7 @@ class Filter(_base.Filter):
for token in _base.Filter.__iter__(self): for token in _base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type == "StartTag" \ if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements): and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1 preserve += 1
elif type == "EndTag" and preserve: elif type == "EndTag" and preserve:
@ -29,13 +26,13 @@ class Filter(_base.Filter):
elif not preserve and type == "SpaceCharacters" and token["data"]: elif not preserve and type == "SpaceCharacters" and token["data"]:
# Test on token["data"] above to not introduce spaces where there were not # Test on token["data"] above to not introduce spaces where there were not
token["data"] = u" " token["data"] = " "
elif not preserve and type == "Characters": elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"]) token["data"] = collapse_spaces(token["data"])
yield token yield token
def collapse_spaces(text):
    """Return *text* with each match of ``SPACES_REGEX`` replaced by one space."""
    collapsed = SPACES_REGEX.sub(' ', text)
    return collapsed

File diff suppressed because it is too large Load Diff

View File

@ -1,25 +1,105 @@
import re from __future__ import absolute_import, division, unicode_literals
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]""" import re
import warnings
from .constants import DataLossWarning
baseChar = """
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]""" ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A""" combiningCharacter = """
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
#x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]""" digit = """
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
# "Extender" character class, written in the same
# "#xNNNN | [#xNNNN-#xNNNN]" notation as the other tables in this module
# (matches the Extender production of XML 1.0, Appendix B).
# The reflowed literal previously had a stray "#" in front of
# "[#x3031-#x3035]", which made that token neither a code point nor a range.
extender = """
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic]) letter = " | ".join([baseChar, ideographic])
#Without the # Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter, name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender]) extender])
nameFirst = " | ".join([letter, "_"]) nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})") reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]") reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars): def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")] charRanges = [item.strip() for item in chars.split(" | ")]
rv = [] rv = []
@ -30,16 +110,17 @@ def charStringToList(chars):
if match is not None: if match is not None:
rv.append([hexToInt(item) for item in match.groups()]) rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1: if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2 rv[-1] = rv[-1] * 2
foundMatch = True foundMatch = True
break break
if not foundMatch: if not foundMatch:
assert len(item) == 1 assert len(item) == 1
rv.append([ord(item)] * 2) rv.append([ord(item)] * 2)
rv = normaliseCharList(rv) rv = normaliseCharList(rv)
return rv return rv
def normaliseCharList(charList): def normaliseCharList(charList):
charList = sorted(charList) charList = sorted(charList)
for item in charList: for item in charList:
@ -49,61 +130,69 @@ def normaliseCharList(charList):
while i < len(charList): while i < len(charList):
j = 1 j = 1
rv.append(charList[i]) rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1: while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1] rv[-1][1] = charList[i + j][1]
j += 1 j += 1
i += j i += j
return rv return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)


def missingRanges(charList):
    """Return the code point ranges in [0, max_unicode] not covered by charList.

    charList is a list of ``[first, last]`` inclusive ranges.
    NOTE(review): assumes the ranges are sorted and already merged (the
    shape normaliseCharList() appears to produce) -- confirm with callers.

    Returns a list of ``[first, last]`` inclusive ranges. An empty charList
    yields the single range covering everything.
    """
    if not charList:
        return [[0, max_unicode]]
    rv = []
    # Compare the *start* of the first range with 0. Comparing the range
    # itself (a list) with 0 is always unequal and produced a bogus [0, -1]
    # entry whenever the first range already started at 0.
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    # Each gap lies between the end of one range and the start of the next.
    for item, following in zip(charList, charList[1:]):
        rv.append([item[1] + 1, following[0] - 1])
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
def listToRegexpStr(charList):
    """Render a list of ``[first, last]`` code point ranges as a regexp class.

    A range whose ends are equal becomes a single escaped character; any
    other range becomes ``first-last`` with both ends escaped. The pieces
    are concatenated and wrapped in square brackets.
    """
    rv = []
    for item in charList:
        first = escapeRegexp(chr(item[0]))
        if item[0] == item[1]:
            rv.append(first)
        else:
            rv.append(first + "-" + escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)
def hexToInt(hex_str):
    """Parse *hex_str* as a base-16 integer and return it."""
    return int(hex_str, 16)
def escapeRegexp(string):
    """Return *string* with regexp metacharacters backslash-escaped.

    Every occurrence of a character from the fixed list below is prefixed
    with a backslash. The backslash itself is not in the list, so the
    successive replacements never escape one another's output.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)
    # A leftover debugging print of the intermediate string (a Python 2
    # print statement) used to sit inside this loop; it has been removed.
    return string
#output from the above # output from the above
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u
0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object): class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}") replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
dropXmlnsLocalName = False, def __init__(self, replaceChars=None,
dropXmlnsAttrNs = False, dropXmlnsLocalName=False,
preventDoubleDashComments = False, dropXmlnsAttrNs=False,
preventDashAtCommentEnd = False, preventDoubleDashComments=False,
replaceFormFeedCharacters = True): preventDashAtCommentEnd=False,
replaceFormFeedCharacters=True,
preventSingleQuotePubid=False):
self.dropXmlnsLocalName = dropXmlnsLocalName self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -113,14 +202,17 @@ class InfosetFilter(object):
self.replaceFormFeedCharacters = replaceFormFeedCharacters self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.preventSingleQuotePubid = preventSingleQuotePubid
self.replaceCache = {} self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
    """Coerce an attribute name, or return None if it must be dropped.

    The attribute is dropped (with a DataLossWarning) when
    ``dropXmlnsLocalName`` is set and *name* starts with "xmlns:", or when
    ``dropXmlnsAttrNs`` is set and *namespace* is the xmlns namespace URI.
    Otherwise the result of ``self.toXmlName(name)`` is returned.
    """
    if self.dropXmlnsLocalName and name.startswith("xmlns:"):
        warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
        return None
    if (self.dropXmlnsAttrNs and
            namespace == "http://www.w3.org/2000/xmlns/"):
        warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
        return None
    return self.toXmlName(name)
@ -131,20 +223,35 @@ class InfosetFilter(object):
def coerceComment(self, data):
    """Return comment text with adjacent dashes separated, if so configured.

    When ``preventDoubleDashComments`` is set, every "--" is rewritten to
    "- -" (emitting a DataLossWarning per pass) until none remain. The loop
    is needed because one pass over "---" still leaves a "--" behind.
    Otherwise *data* is returned unchanged.
    """
    if self.preventDoubleDashComments:
        while "--" in data:
            warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
            data = data.replace("--", "- -")
    return data
def coerceCharacters(self, data):
    """Return text with form feeds replaced by spaces, if so configured.

    When ``replaceFormFeedCharacters`` is set, one DataLossWarning is
    emitted per U+000C occurrence and each occurrence becomes a space.
    Other non-XML characters are not handled here.
    """
    if self.replaceFormFeedCharacters:
        for _ in range(data.count("\x0C")):
            warnings.warn("Text cannot contain U+000C", DataLossWarning)
        data = data.replace("\x0C", " ")
    return data
def coercePubid(self, data):
    """Return *data* with characters not allowed in a public id replaced.

    Each distinct character matched by ``nonPubidCharRegexp`` is swapped
    for ``self.getReplacementCharacter(char)``, with a DataLossWarning.
    When ``preventSingleQuotePubid`` is set, single quotes are replaced the
    same way.
    """
    dataOutput = data
    # De-duplicate first (as toXmlName does): str.replace already rewrites
    # every occurrence, so repeated matches would only repeat the warning.
    for char in set(nonPubidCharRegexp.findall(data)):
        warnings.warn("Coercing non-XML pubid", DataLossWarning)
        replacement = self.getReplacementCharacter(char)
        dataOutput = dataOutput.replace(char, replacement)
    if self.preventSingleQuotePubid and "'" in dataOutput:
        warnings.warn("Pubid cannot contain single quote", DataLossWarning)
        dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
    return dataOutput
def toXmlName(self, name): def toXmlName(self, name):
nameFirst = name[0] nameFirst = name[0]
nameRest = name[1:] nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst) m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m: if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst) nameFirstOutput = self.getReplacementCharacter(nameFirst)
else: else:
nameFirstOutput = nameFirst nameFirstOutput = nameFirst
@ -152,10 +259,11 @@ class InfosetFilter(object):
nameRestOutput = nameRest nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest)) replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars: for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
replacement = self.getReplacementCharacter(char) replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement) nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char): def getReplacementCharacter(self, char):
if char in self.replaceCache: if char in self.replaceCache:
replacement = self.replaceCache[char] replacement = self.replaceCache[char]
@ -169,9 +277,9 @@ class InfosetFilter(object):
return name return name
def escapeChar(self, char):
    """Return the "UXXXXX" escape for *char* and remember it.

    The escape is "U" followed by the code point as upper-case hex,
    zero-padded to at least five digits. The mapping is stored in
    ``self.replaceCache`` keyed by the original character.
    """
    replacement = "U%05X" % ord(char)
    self.replaceCache[char] = replacement
    return replacement
def unescapeChar(self, charcode):
    """Return the character for a "UXXXXX" escape.

    The first character of *charcode* is skipped and the remainder is
    parsed as a hexadecimal code point.
    """
    return chr(int(charcode[1:], 16))

File diff suppressed because it is too large Load Diff

View File

@ -1,125 +1,145 @@
from __future__ import absolute_import, division, unicode_literals
import re import re
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer from .tokenizer import HTMLTokenizer
from constants import tokenTypes from .constants import tokenTypes
class HTMLSanitizerMixin(object): class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'ul', 'var'] 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none'] 'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', 'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', 'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'xml:lang'] 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink'] 'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
'origin', 'overline-position', 'overline-thickness', 'panose-1',
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
'stemh', 'stemv', 'stop-color', 'stop-opacity',
'strikethrough-position', 'strikethrough-thickness', 'stroke',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'xlink:href', 'xml:base'] 'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke'] 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color', acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color', 'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear', 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause', 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume', 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width'] 'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow'] 'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity'] 'stroke-opacity']
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc', acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs' ] 'ssh', 'sftp', 'rtsp', 'afs']
# subclasses may define their own versions of these constants # subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
@ -140,88 +160,109 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def sanitize_token(self, token): def sanitize_token(self, token):
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]): # accommodate filters which use token_type differently
token_type = token["type"]
if token_type in list(tokenTypes.keys()):
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements: if token["name"] in self.allowed_elements:
if token.has_key("data"): return self.allowed_token(token, token_type)
attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
#remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else: else:
if token["type"] == tokenTypes["EndTag"]: return self.disallowed_token(token, token_type)
token["data"] = "</%s>" % token["name"] elif token_type == tokenTypes["Comment"]:
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["selfClosing"]:
token["data"]=token["data"][:-1] + "/>"
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
elif token["type"] == tokenTypes["Comment"]:
pass pass
else: else:
return token return token
def allowed_token(self, token, token_type):
    """Sanitize the attributes of a tag token whose element is allowed.

    token is a dict-like tag token.  When it carries a "data" entry (a
    sequence of (name, value) attribute pairs) the attributes are filtered
    and the entry is replaced by a list of [name, value] lists:

    * attributes not in self.allowed_attributes are dropped;
    * URI-valued attributes (self.attr_val_is_uri) whose scheme is not in
      self.allowed_protocols are dropped;
    * non-local url(...) references are blanked in the attributes listed
      in self.svg_attr_val_allows_ref;
    * a non-local xlink:href is dropped for the elements listed in
      self.svg_allow_local_href;
    * a style attribute is passed through self.sanitize_css.

    token_type is accepted for signature symmetry with disallowed_token
    and is not used here.  The (mutated) token is returned.
    """
    if "data" in token:
        # Walk the attributes in reverse so that, when a name is repeated,
        # the first occurrence in the source is the one that survives.
        attrs = dict((name, val) for name, val in token["data"][::-1]
                     if name in self.allowed_attributes)
        for attr in self.attr_val_is_uri:
            if attr not in attrs:
                continue
            # Strip backticks, control characters and whitespace, which
            # could otherwise be used to disguise the scheme.
            val_unescaped = re.sub(r"[`\x00-\x20\x7f-\xa0\s]+", '',
                                   unescape(attrs[attr])).lower()
            # remove replacement characters from unescaped characters
            val_unescaped = val_unescaped.replace("\ufffd", "")
            if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    val_unescaped.split(':')[0] not in self.allowed_protocols):
                del attrs[attr]
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                # Only fragment (#...) references are kept inside url().
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))
        if (token["name"] in self.svg_allow_local_href and
                'xlink:href' in attrs and
                re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
            del attrs['xlink:href']
        if 'style' in attrs:
            attrs['style'] = self.sanitize_css(attrs['style'])
        token["data"] = [[name, val] for name, val in attrs.items()]
    return token
def disallowed_token(self, token, token_type):
    """Turn a tag token for a disallowed element into a text token.

    The tag is re-rendered as markup text (attribute values escaped) and
    stored in token["data"]; the token's type becomes the "Characters"
    type, using the same representation (name or tokenTypes value) the
    incoming token used; the "name" entry is removed.  Returns the token.
    """
    is_end_tag = token_type == tokenTypes["EndTag"]
    tag_name = token["name"]
    if is_end_tag:
        markup = "</%s>" % tag_name
    elif token["data"]:
        rendered = [' %s="%s"' % (k, escape(v)) for k, v in token["data"]]
        markup = "<%s%s>" % (tag_name, ''.join(rendered))
    else:
        markup = "<%s>" % tag_name
    if token.get("selfClosing"):
        # Swap the closing ">" for "/>".
        markup = markup[:-1] + "/>"
    token["data"] = markup

    # Keep the type in whichever form the caller supplied it.
    uses_type_names = token["type"] in list(tokenTypes.keys())
    token["type"] = "Characters" if uses_type_names else tokenTypes["Characters"]

    del token["name"]
    return token
def sanitize_css(self, style): def sanitize_css(self, style):
# disallow urls # disallow urls
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet # gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = [] clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value: continue if not value:
if prop.lower() in self.allowed_css_properties: continue
clean.append(prop + ': ' + value + ';') if prop.lower() in self.allowed_css_properties:
elif prop.split('-')[0].lower() in ['background','border','margin', clean.append(prop + ': ' + value + ';')
'padding']: elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
for keyword in value.split(): 'padding']:
if not keyword in self.acceptable_css_keywords and \ for keyword in value.split():
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): if not keyword in self.acceptable_css_keywords and \
break not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
else: break
clean.append(prop + ': ' + value + ';') else:
elif prop.lower() in self.allowed_svg_properties: clean.append(prop + ': ' + value + ';')
clean.append(prop + ': ' + value + ';') elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean) return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False): lowercaseElementName=False, lowercaseAttrName=False, parser=None):
#Change case matching defaults as we only output lowercase html anyway # Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal... # This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName) lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self): def __iter__(self):
for token in HTMLTokenizer.__iter__(self): for token in HTMLTokenizer.__iter__(self):

View File

@ -1,17 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from html5lib import treewalkers from .. import treewalkers
from htmlserializer import HTMLSerializer from .htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
def serialize(input, tree="etree", format="html", encoding=None,
**serializer_opts): **serializer_opts):
# XXX: Should we cache this? # XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree) walker = treewalkers.getTreeWalker(tree)
if format == "html": if format == "html":
s = HTMLSerializer(**serializer_opts) s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else: else:
raise ValueError, "type must be either html or xhtml" raise ValueError("type must be html")
return s.render(walker(input), encoding) return s.render(walker(input), encoding)

View File

@ -1,18 +1,20 @@
try: from __future__ import absolute_import, division, unicode_literals
frozenset from six import text_type
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters try:
from html5lib.constants import rcdataElements from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
try: try:
from codecs import register_error, xmlcharrefreplace_errors from codecs import register_error, xmlcharrefreplace_errors
@ -21,27 +23,48 @@ except ImportError:
else: else:
unicode_encode_errors = "htmlentityreplace" unicode_encode_errors = "htmlentityreplace"
from html5lib.constants import entities
encode_entity_map = {} encode_entity_map = {}
for k, v in entities.items(): is_ucs4 = len("\U0010FFFF") == 1
if v != "&" and encode_entity_map.get(v) != k.lower(): for k, v in list(entities.items()):
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. # skip multi-character entities
encode_entity_map[v] = k if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if not v in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc): def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = [] res = []
for c in exc.object[exc.start:exc.end]: codepoints = []
e = encode_entity_map.get(c) skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e: if e:
res.append("&") res.append("&")
res.append(e) res.append(e)
if not e.endswith(";"): if not e.endswith(";"):
res.append(";") res.append(";")
else: else:
res.append(c.encode(exc.encoding, "xmlcharrefreplace")) res.append("&#x%s;" % (hex(cp)[2:]))
return (u"".join(res), exc.end) return ("".join(res), exc.end)
else: else:
return xmlcharrefreplace_errors(exc) return xmlcharrefreplace_errors(exc)
@ -49,125 +72,185 @@ else:
del register_error del register_error
def encode(text, encoding):
return text.encode(encoding, unicode_encode_errors)
class HTMLSerializer(object): class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False quote_attr_values = False
quote_char = '"' quote_char = '"'
use_best_quote_char = True use_best_quote_char = True
minimize_boolean_attributes = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False use_trailing_solidus = False
space_before_trailing_solidus = True space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False escape_lt_in_attrs = False
escape_rcdata = False escape_rcdata = False
resolve_entities = True
# miscellaneous options
alphabetical_attributes = False
inject_meta_charset = True inject_meta_charset = True
strip_whitespace = False strip_whitespace = False
sanitize = False sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char", options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus", "omit_optional_tags", "minimize_boolean_attributes",
"space_before_trailing_solidus", "omit_optional_tags", "use_trailing_solidus", "space_before_trailing_solidus",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
"escape_rcdata", 'use_trailing_solidus', "sanitize") "alphabetical_attributes", "inject_meta_charset",
"strip_whitespace", "sanitize")
def __init__(self, **kwargs): def __init__(self, **kwargs):
if kwargs.has_key('quote_char'): """Initialize HTMLSerializer.
Keyword options (default given first unless specified) include:
inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the
document.
quote_attr_values=True|False
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
in which case single quotes are used instead.
escape_lt_in_attrs=False|True
Whether to escape < in attribute values.
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
are unaffected by this setting.
strip_whitespace=False|True
Whether to remove semantically meaningless whitespace. (This
compresses all whitespace to a single space except within pre.)
minimize_boolean_attributes=True|False
Shortens boolean attributes to give just the attribute value,
for example <input disabled="disabled"> becomes <input disabled>.
use_trailing_solidus=False|True
Includes a close-tag slash at the end of the start tag of void
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
space_before_trailing_solidus=True|False
Places a space immediately before the closing slash in a tag
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
sanitize=False|True
Strip all unsafe or unknown constructs from output.
See `html5lib user documentation`_
omit_optional_tags=True|False
Omit start/end tags that are optional.
alphabetical_attributes=False|True
Reorder attributes to be in alphabetical order.
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
if 'quote_char' in kwargs:
self.use_best_quote_char = False self.use_best_quote_char = False
for attr in self.options: for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr))) setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = [] self.errors = []
self.strict = False self.strict = False
def encode(self, string):
    """Encode string with self.encoding, or return it as-is if unset.

    Encoding failures are handled by the error handler named by the
    module-level unicode_encode_errors.
    """
    assert isinstance(string, text_type)
    if not self.encoding:
        return string
    return string.encode(self.encoding, unicode_encode_errors)
def encodeStrict(self, string):
    """Encode string with self.encoding using "strict" error handling.

    Returns string unchanged when self.encoding is unset.
    """
    assert isinstance(string, text_type)
    if not self.encoding:
        return string
    return string.encode(self.encoding, "strict")
def serialize(self, treewalker, encoding=None): def serialize(self, treewalker, encoding=None):
self.encoding = encoding
in_cdata = False in_cdata = False
self.errors = [] self.errors = []
if encoding and self.inject_meta_charset: if encoding and self.inject_meta_charset:
from html5lib.filters.inject_meta_charset import Filter from ..filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding) treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter # WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter # for maximum efficiently of this latter filter
if self.strip_whitespace: if self.strip_whitespace:
from html5lib.filters.whitespace import Filter from ..filters.whitespace import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.sanitize: if self.sanitize:
from html5lib.filters.sanitizer import Filter from ..filters.sanitizer import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.omit_optional_tags: if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter from ..filters.optionaltags import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
for token in treewalker: for token in treewalker:
type = token["type"] type = token["type"]
if type == "Doctype": if type == "Doctype":
doctype = u"<!DOCTYPE %s" % token["name"] doctype = "<!DOCTYPE %s" % token["name"]
if token["publicId"]: if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"] doctype += ' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]: elif token["systemId"]:
doctype += u" SYSTEM" doctype += " SYSTEM"
if token["systemId"]: if token["systemId"]:
if token["systemId"].find(u'"') >= 0: if token["systemId"].find('"') >= 0:
if token["systemId"].find(u"'") >= 0: if token["systemId"].find("'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters")) self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'" quote_char = "'"
else: else:
quote_char = u'"' quote_char = '"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char) doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">" doctype += ">"
yield self.encodeStrict(doctype)
if encoding:
yield doctype.encode(encoding)
else:
yield doctype
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata: if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0: if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA")) self.serializeError(_("Unexpected </ in CDATA"))
if encoding: yield self.encode(token["data"])
yield token["data"].encode(encoding, "strict")
else:
yield token["data"]
elif encoding:
yield encode(escape(token["data"]), encoding)
else: else:
yield escape(token["data"]) yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"): elif type in ("StartTag", "EmptyTag"):
name = token["name"] name = token["name"]
yield self.encodeStrict("<%s" % name)
if name in rcdataElements and not self.escape_rcdata: if name in rcdataElements and not self.escape_rcdata:
in_cdata = True in_cdata = True
elif in_cdata: elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element")) self.serializeError(_("Unexpected child element of a CDATA element"))
attrs = token["data"] for (attr_namespace, attr_name), attr_value in token["data"].items():
if hasattr(attrs, "items"): # TODO: Add namespace support here
attrs = attrs.items() k = attr_name
attrs.sort() v = attr_value
attributes = [] yield self.encodeStrict(' ')
for k,v in attrs:
if encoding:
k = k.encode(encoding, "strict")
attributes.append(' ')
attributes.append(k) yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \ if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) \ (k not in booleanAttributes.get(name, tuple())
and k not in booleanAttributes.get("", tuple())): and k not in booleanAttributes.get("", tuple())):
attributes.append("=") yield self.encodeStrict("=")
if self.quote_attr_values or not v: if self.quote_attr_values or not v:
quote_attr = True quote_attr = True
else: else:
quote_attr = reduce(lambda x,y: x or (y in v), quote_attr = reduce(lambda x, y: x or (y in v),
spaceCharacters + ">\"'=", False) spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;") v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;") if self.escape_lt_in_attrs:
if encoding: v = v.replace("<", "&lt;")
v = encode(v, encoding)
if quote_attr: if quote_attr:
quote_char = self.quote_char quote_char = self.quote_char
if self.use_best_quote_char: if self.use_best_quote_char:
@ -179,20 +262,17 @@ class HTMLSerializer(object):
v = v.replace("'", "&#39;") v = v.replace("'", "&#39;")
else: else:
v = v.replace('"', "&quot;") v = v.replace('"', "&quot;")
attributes.append(quote_char) yield self.encodeStrict(quote_char)
attributes.append(v) yield self.encode(v)
attributes.append(quote_char) yield self.encodeStrict(quote_char)
else: else:
attributes.append(v) yield self.encode(v)
if name in voidElements and self.use_trailing_solidus: if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus: if self.space_before_trailing_solidus:
attributes.append(" /") yield self.encodeStrict(" /")
else: else:
attributes.append("/") yield self.encodeStrict("/")
if encoding: yield self.encode(">")
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
else:
yield u"<%s%s>" % (name, u"".join(attributes))
elif type == "EndTag": elif type == "EndTag":
name = token["name"] name = token["name"]
@ -200,28 +280,33 @@ class HTMLSerializer(object):
in_cdata = False in_cdata = False
elif in_cdata: elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element")) self.serializeError(_("Unexpected child element of a CDATA element"))
end_tag = u"</%s>" % name yield self.encodeStrict("</%s>" % name)
if encoding:
end_tag = end_tag.encode(encoding, "strict")
yield end_tag
elif type == "Comment": elif type == "Comment":
data = token["data"] data = token["data"]
if data.find("--") >= 0: if data.find("--") >= 0:
self.serializeError(_("Comment contains --")) self.serializeError(_("Comment contains --"))
comment = u"<!--%s-->" % token["data"] yield self.encodeStrict("<!--%s-->" % token["data"])
if encoding:
comment = comment.encode(encoding, unicode_encode_errors) elif type == "Entity":
yield comment name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = "&%s;" % name
yield self.encodeStrict(data)
else: else:
self.serializeError(token["data"]) self.serializeError(token["data"])
def render(self, treewalker, encoding=None): def render(self, treewalker, encoding=None):
if encoding: if encoding:
return "".join(list(self.serialize(treewalker, encoding))) return b"".join(list(self.serialize(treewalker, encoding)))
else: else:
return u"".join(list(self.serialize(treewalker))) return "".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory. # XXX The idea is to make data mandatory.
@ -229,6 +314,7 @@ class HTMLSerializer(object):
if self.strict: if self.strict:
raise SerializeError raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree.

    Must be a class (not a function) so that ``raise SerializeError`` in
    strict mode raises a catchable exception instead of a TypeError.
    """
    pass

View File

@ -1,9 +0,0 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
quote_attr_values = True
minimize_boolean_attributes = False
use_trailing_solidus = True
escape_lt_in_attrs = True
omit_optional_tags = False
escape_rcdata = True

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from xml.sax.xmlreader import AttributesNSImpl
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
# Map each non-None prefix found in adjustForeignAttributes to its namespace.
prefix_mapping = {}
for entry in adjustForeignAttributes.values():
    prefix, localName, namespace = entry
    if prefix is None:
        continue
    prefix_mapping[prefix] = namespace
def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    Start/empty/end tags and character data are forwarded to handler as
    namespace-aware SAX events; Doctype and Comment tokens are skipped.
    Every prefix in prefix_mapping is declared right after startDocument()
    and undeclared right before endDocument().
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        kind = token["type"]
        if kind == "Doctype":
            continue
        if kind in ("StartTag", "EmptyTag"):
            attrs = AttributesNSImpl(token["data"],
                                     unadjustForeignAttributes)
            ns_name = (token["namespace"], token["name"])
            handler.startElementNS(ns_name, token["name"], attrs)
            if kind == "EmptyTag":
                # An empty tag is reported as an immediate start/end pair.
                handler.endElementNS(ns_name, token["name"])
        elif kind == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif kind in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif kind == "Comment":
            pass
        else:
            assert False, "Unknown token type"

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()

View File

@ -7,7 +7,7 @@ implement several things:
1) A set of classes for various types of elements: Document, Doctype, 1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different _base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment) signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as Textual content may also be implemented as another node type, or not, as
your tree implementation requires. your tree implementation requires.
@ -24,69 +24,53 @@ getDocument - Returns the root node of the complete document tree
testSerializer method on your treebuilder which accepts a node and testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according returns a string containing Node and its children serialized according
to the format used in the unittests to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
""" """
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
treeBuilderCache = {} treeBuilderCache = {}
def getTreeBuilder(treeType, implementation=None, **kwargs): def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support """Get a TreeBuilder class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree" and "beautifulsoup" values are:
"simpletree" - a built-in DOM-ish tree type with support for some "dom" - A generic builder for DOM implementations, defaulting to
more pythonic idioms. a xml.dom.minidom based implementation.
"dom" - A generic builder for DOM implementations, defaulting to "etree" - A generic builder for tree implementations exposing an
a xml.dom.minidom based implementation for the sake of ElementTree-like interface, defaulting to
backwards compatibility (as releases up until 0.10 had a xml.etree.cElementTree if available and
builder called "dom" that was a minidom implemenation). xml.etree.ElementTree if not.
"etree" - A generic builder for tree implementations exposing an "lxml" - A etree-based builder for lxml.etree, handling
elementtree-like interface (known to work with limitations of lxml's implementation.
ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" and "dom" tree types). A implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g. module implementing the tree type e.g.
xml.etree.ElementTree or lxml.etree.""" xml.etree.ElementTree or xml.etree.cElementTree."""
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeBuilderCache: if treeType not in treeBuilderCache:
if treeType == "dom": if treeType == "dom":
import dom from . import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given # Come up with a sane default (pref. from the stdlib)
if implementation == None: if implementation is None:
from xml.dom import minidom from xml.dom import minidom
implementation = minidom implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule # NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup":
import soup
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml": elif treeType == "lxml":
import etree_lxml from . import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree": elif treeType == "etree":
# Come up with a sane default from . import etree
if implementation == None: if implementation is None:
try: implementation = default_etree
import xml.etree.cElementTree as ET # NEVER cache here, caching is done in the etree submodule
except ImportError:
try:
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
implementation = ET
import etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
return treeBuilderCache.get(treeType) return treeBuilderCache.get(treeType)

View File

@ -1,25 +1,34 @@
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces from __future__ import absolute_import, division, unicode_literals
try: from six import text_type
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering buttons, object elements, from ..constants import scopingElements, tableInsertModeElements, namespaces
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees. # from "leaking" into tables, object elements, and marquees.
Marker = None Marker = None
# Lookup table keyed by a variant name (None, "button", "list", "table",
# "select").  Each value is a 2-tuple: a frozenset of (namespace, tag name)
# pairs and a boolean flag.  The "button" and "list" sets extend
# scopingElements; "table" and "select" use stand-alone sets.
# NOTE(review): the consumer of this table is not visible here; presumably
# it drives element-in-scope checks, with the flag (True only for "select")
# inverting the membership test -- confirm against the caller.
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
class Node(object): class Node(object):
def __init__(self, name): def __init__(self, name):
"""Node representing an item in the tree. """Node representing an item in the tree.
name - The tag name associated with the node name - The tag name associated with the node
parent - The parent of the current node (or None for the document node) parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and value - The value of the current node (applies to text nodes and
comments comments
attributes - a dict holding name, value pairs for attributes of the node attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node _flags - A list of miscellaneous flags that can be set on the node
""" """
@ -30,14 +39,14 @@ class Node(object):
self.childNodes = [] self.childNodes = []
self._flags = [] self._flags = []
def __unicode__(self): def __str__(self):
attributesStr = " ".join(["%s=\"%s\""%(name, value) attributesStr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in for name, value in
self.attributes.iteritems()]) self.attributes.items()])
if attributesStr: if attributesStr:
return "<%s %s>"%(self.name,attributesStr) return "<%s %s>" % (self.name, attributesStr)
else: else:
return "<%s>"%(self.name) return "<%s>" % (self.name)
def __repr__(self): def __repr__(self):
return "<%s>" % (self.name) return "<%s>" % (self.name)
@ -48,14 +57,14 @@ class Node(object):
raise NotImplementedError raise NotImplementedError
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the """Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text. start of node insertBefore or to the end of the node's text.
""" """
raise NotImplementedError raise NotImplementedError
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the """Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of list of child nodes. Raises ValueError if refNode is not a child of
the current node""" the current node"""
raise NotImplementedError raise NotImplementedError
@ -65,11 +74,11 @@ class Node(object):
raise NotImplementedError raise NotImplementedError
def reparentChildren(self, newParent): def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent. """Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the This is needed so that trees that don't store text as nodes move the
text in the correct way text in the correct way
""" """
#XXX - should this method be made more general? # XXX - should this method be made more general?
for child in self.childNodes: for child in self.childNodes:
newParent.appendChild(child) newParent.appendChild(child)
self.childNodes = [] self.childNodes = []
@ -80,12 +89,36 @@ class Node(object):
""" """
raise NotImplementedError raise NotImplementedError
def hasContent(self): def hasContent(self):
"""Return true if the node has children or text, false otherwise """Return true if the node has children or text, false otherwise
""" """
raise NotImplementedError raise NotImplementedError
class ActiveFormattingElements(list):
    """List of active formatting elements that caps duplicate entries.

    Entries are either nodes (objects exposing ``nameTuple`` and
    ``attributes``) or the module-level ``Marker`` sentinel.  ``append``
    keeps at most three equivalent nodes after the most recent marker.
    NOTE(review): this looks like the HTML parsing spec's "Noah's Ark
    clause" -- confirm against the spec before relying on the name.
    """

    def append(self, node):
        """Append ``node``, evicting an older equivalent entry if needed.

        Walks backwards from the end of the list, stopping at the first
        ``Marker``.  If three entries equivalent to ``node`` (as judged by
        ``nodesEqual``) are found, the third one encountered -- i.e. the
        earliest of them -- is removed before ``node`` is appended.
        Appending ``Marker`` itself never evicts anything.
        """
        equalCount = 0
        # Marker is a sentinel, so compare by identity rather than equality.
        if node is not Marker:
            for index in range(len(self) - 1, -1, -1):
                element = self[index]
                if element is Marker:
                    break
                if self.nodesEqual(element, node):
                    equalCount += 1
                    if equalCount == 3:
                        # Delete by position: list.remove() would search by
                        # equality from the front and could drop a different
                        # (equal-comparing) entry, even one before the marker.
                        del self[index]
                        break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        """Return True if both nodes have equal ``nameTuple`` and ``attributes``."""
        if not node1.nameTuple == node2.nameTuple:
            return False
        if not node1.attributes == node2.attributes:
            return False
        return True
class TreeBuilder(object): class TreeBuilder(object):
"""Base treebuilder implementation """Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document documentClass - the class to use for the bottommost node of a document
@ -94,19 +127,19 @@ class TreeBuilder(object):
doctypeClass - the class to use for doctypes doctypeClass - the class to use for doctypes
""" """
#Document class # Document class
documentClass = None documentClass = None
#The class to use for creating a node # The class to use for creating a node
elementClass = None elementClass = None
#The class to use for creating comments # The class to use for creating comments
commentClass = None commentClass = None
#The class to use for creating doctypes # The class to use for creating doctypes
doctypeClass = None doctypeClass = None
#Fragment class # Fragment class
fragmentClass = None fragmentClass = None
def __init__(self, namespaceHTMLElements): def __init__(self, namespaceHTMLElements):
@ -115,12 +148,12 @@ class TreeBuilder(object):
else: else:
self.defaultNamespace = None self.defaultNamespace = None
self.reset() self.reset()
def reset(self): def reset(self):
self.openElements = [] self.openElements = []
self.activeFormattingElements = [] self.activeFormattingElements = ActiveFormattingElements()
#XXX - rename these to headElement, formElement # XXX - rename these to headElement, formElement
self.headPointer = None self.headPointer = None
self.formPointer = None self.formPointer = None
@ -129,23 +162,21 @@ class TreeBuilder(object):
self.document = self.documentClass() self.document = self.documentClass()
def elementInScope(self, target, variant=None): def elementInScope(self, target, variant=None):
# Exit early when possible.
listElementsMap = { # If we pass a node in we match that. if we pass a string
None:scopingElements, # match any node with that name
"list":scopingElements | set([(namespaces["html"], "ol"), exactNode = hasattr(target, "nameTuple")
(namespaces["html"], "ul")]),
"table":set([(namespaces["html"], "html"), listElements, invert = listElementsMap[variant]
(namespaces["html"], "table")])
}
listElements = listElementsMap[variant]
for node in reversed(self.openElements): for node in reversed(self.openElements):
if node.name == target: if (node.name == target and not exactNode or
node == target and exactNode):
return True return True
elif node.nameTuple in listElements: elif (invert ^ (node.nameTuple in listElements)):
return False return False
assert False # We should never reach this point assert False # We should never reach this point
def reconstructActiveFormattingElements(self): def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the # Within this algorithm the order of steps described in the
@ -165,7 +196,7 @@ class TreeBuilder(object):
# Step 6 # Step 6
while entry != Marker and entry not in self.openElements: while entry != Marker and entry not in self.openElements:
if i == 0: if i == 0:
#This will be reset to 0 below # This will be reset to 0 below
i = -1 i = -1
break break
i -= 1 i -= 1
@ -178,13 +209,13 @@ class TreeBuilder(object):
# Step 8 # Step 8
entry = self.activeFormattingElements[i] entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes clone = entry.cloneNode() # Mainly to get a new copy of the attributes
# Step 9 # Step 9
element = self.insertElement({"type":"StartTag", element = self.insertElement({"type": "StartTag",
"name":clone.name, "name": clone.name,
"namespace":clone.namespace, "namespace": clone.namespace,
"data":clone.attributes}) "data": clone.attributes})
# Step 10 # Step 10
self.activeFormattingElements[i] = element self.activeFormattingElements[i] = element
@ -229,7 +260,7 @@ class TreeBuilder(object):
if parent is None: if parent is None:
parent = self.openElements[-1] parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"])) parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token): def createElement(self, token):
"""Create an element but don't insert it anywhere""" """Create an element but don't insert it anywhere"""
name = token["name"] name = token["name"]
@ -251,9 +282,10 @@ class TreeBuilder(object):
self.insertElement = self.insertElementNormal self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable) insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token): def insertElementNormal(self, token):
name = token["name"] name = token["name"]
assert isinstance(name, text_type), "Element %s not unicode" % name
namespace = token.get("namespace", self.defaultNamespace) namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace) element = self.elementClass(name, namespace)
element.attributes = token["data"] element.attributes = token["data"]
@ -262,13 +294,13 @@ class TreeBuilder(object):
return element return element
def insertElementTable(self, token): def insertElementTable(self, token):
"""Create an element and insert it into the tree""" """Create an element and insert it into the tree"""
element = self.createElement(token) element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements: if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token) return self.insertElementNormal(token)
else: else:
#We should be in the InTable mode. This means we want to do # We should be in the InTable mode. This means we want to do
#special magic element rearranging # special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition() parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None: if insertBefore is None:
parent.appendChild(element) parent.appendChild(element)
@ -283,7 +315,7 @@ class TreeBuilder(object):
parent = self.openElements[-1] parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name self.openElements[-1].name
not in tableInsertModeElements)): not in tableInsertModeElements)):
parent.insertText(data) parent.insertText(data)
else: else:
@ -291,14 +323,14 @@ class TreeBuilder(object):
# special magic element rearranging # special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition() parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore) parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self): def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before """Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node""" (or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most # The foster parent element is the one which comes before the most
# recently opened table element # recently opened table element
# XXX - this is really inelegant # XXX - this is really inelegant
lastTable=None lastTable = None
fosterParent = None fosterParent = None
insertBefore = None insertBefore = None
for elm in self.openElements[::-1]: for elm in self.openElements[::-1]:
@ -321,8 +353,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None): def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name name = self.openElements[-1].name
# XXX td, th and tr are not actually needed # XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr")) if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude): and name != exclude):
self.openElements.pop() self.openElements.pop()
# XXX This is not entirely what the specification says. We should # XXX This is not entirely what the specification says. We should
# investigate it more closely. # investigate it more closely.
@ -331,10 +363,10 @@ class TreeBuilder(object):
def getDocument(self): def getDocument(self):
"Return the final tree" "Return the final tree"
return self.document return self.document
def getFragment(self): def getFragment(self):
"Return the final fragment" "Return the final fragment"
#assert self.innerHTML # assert self.innerHTML
fragment = self.fragmentClass() fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment) self.openElements[0].reparentChildren(fragment)
return fragment return fragment

View File

@ -1,40 +1,38 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new from xml.dom import minidom, Node
import re
import weakref import weakref
import _base from . import _base
from html5lib import constants, ihatexml from .. import constants
from html5lib.constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory
# Cache of generated builder modules, keyed by the generated module name.
moduleCache = {}


def getDomModule(DomImplementation):
    """Return a module object holding the tree builder for a DOM implementation.

    DomImplementation - object with a ``__name__`` attribute; it is passed
        unchanged to ``getDomBuilder``

    The module is named ``"_<DomImplementation.__name__>builder"`` and its
    namespace is filled from the dict returned by ``getDomBuilder``.  Results
    are memoised in ``moduleCache``, so repeated calls for the same
    implementation name return the same module object.
    """
    # Imported locally: types.ModuleType replaces the Python-2-only
    # ``new.module`` (which was an alias for the same type).
    from types import ModuleType

    name = "_" + DomImplementation.__name__ + "builder"
    try:
        return moduleCache[name]
    except KeyError:
        pass

    mod = ModuleType(name)
    mod.__dict__.update(getDomBuilder(DomImplementation))
    moduleCache[name] = mod
    return mod
def getDomBuilder(DomImplementation): def getDomBuilder(DomImplementation):
Dom = DomImplementation Dom = DomImplementation
class AttrList:
class AttrList(object):
def __init__(self, element): def __init__(self, element):
self.element = element self.element = element
def __iter__(self): def __iter__(self):
return self.element.attributes.items().__iter__() return list(self.element.attributes.items()).__iter__()
def __setitem__(self, name, value): def __setitem__(self, name, value):
self.element.setAttribute(name, value) self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self): def items(self):
return [(item[0], item[1]) for item in return [(item[0], item[1]) for item in
self.element.attributes.items()] list(self.element.attributes.items())]
def keys(self): def keys(self):
return self.element.attributes.keys() return list(self.element.attributes.keys())
def __getitem__(self, name): def __getitem__(self, name):
return self.element.getAttribute(name) return self.element.getAttribute(name)
@ -43,68 +41,68 @@ def getDomBuilder(DomImplementation):
raise NotImplementedError raise NotImplementedError
else: else:
return self.element.hasAttribute(name) return self.element.hasAttribute(name)
class NodeBuilder(_base.Node): class NodeBuilder(_base.Node):
def __init__(self, element): def __init__(self, element):
_base.Node.__init__(self, element.nodeName) _base.Node.__init__(self, element.nodeName)
self.element = element self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI") namespace = property(lambda self: hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None) and self.element.namespaceURI or None)
def appendChild(self, node): def appendChild(self, node):
node.parent = self node.parent = self
self.element.appendChild(node.element) self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data) text = self.element.ownerDocument.createTextNode(data)
if insertBefore: if insertBefore:
self.element.insertBefore(text, insertBefore.element) self.element.insertBefore(text, insertBefore.element)
else: else:
self.element.appendChild(text) self.element.appendChild(text)
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element) self.element.insertBefore(node.element, refNode.element)
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
if node.element.parentNode == self.element: if node.element.parentNode == self.element:
self.element.removeChild(node.element) self.element.removeChild(node.element)
node.parent = None node.parent = None
def reparentChildren(self, newParent): def reparentChildren(self, newParent):
while self.element.hasChildNodes(): while self.element.hasChildNodes():
child = self.element.firstChild child = self.element.firstChild
self.element.removeChild(child) self.element.removeChild(child)
newParent.element.appendChild(child) newParent.element.appendChild(child)
self.childNodes = [] self.childNodes = []
def getAttributes(self): def getAttributes(self):
return AttrList(self.element) return AttrList(self.element)
def setAttributes(self, attributes): def setAttributes(self, attributes):
if attributes: if attributes:
for name, value in attributes.items(): for name, value in list(attributes.items()):
if isinstance(name, tuple): if isinstance(name, tuple):
if name[0] is not None: if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1]) qualifiedName = (name[0] + ":" + name[1])
else: else:
qualifiedName = name[1] qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName, self.element.setAttributeNS(name[2], qualifiedName,
value) value)
else: else:
self.element.setAttribute( self.element.setAttribute(
name, value) name, value)
attributes = property(getAttributes, setAttributes) attributes = property(getAttributes, setAttributes)
def cloneNode(self): def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False)) return NodeBuilder(self.element.cloneNode(False))
def hasContent(self): def hasContent(self):
return self.element.hasChildNodes() return self.element.hasChildNodes()
def getNameTuple(self): def getNameTuple(self):
if self.namespace == None: if self.namespace is None:
return namespaces["html"], self.name return namespaces["html"], self.name
else: else:
return self.namespace, self.name return self.namespace, self.name
@ -113,9 +111,9 @@ def getDomBuilder(DomImplementation):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None) self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self) return weakref.proxy(self)
def insertDoctype(self, token): def insertDoctype(self, token):
name = token["name"] name = token["name"]
publicId = token["publicId"] publicId = token["publicId"]
@ -126,7 +124,7 @@ def getDomBuilder(DomImplementation):
self.document.appendChild(NodeBuilder(doctype)) self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom: if Dom == minidom:
doctype.ownerDocument = self.dom doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None): def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None: if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name) node = self.dom.createElement(name)
@ -134,153 +132,96 @@ def getDomBuilder(DomImplementation):
node = self.dom.createElementNS(namespace, name) node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node) return NodeBuilder(node)
def commentClass(self, data): def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data)) return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self): def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment()) return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node): def appendChild(self, node):
self.dom.appendChild(node.element) self.dom.appendChild(node.element)
def testSerializer(self, element): def testSerializer(self, element):
return testSerializer(element) return testSerializer(element)
def getDocument(self): def getDocument(self):
return self.dom return self.dom
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self).element return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None): def insertText(self, data, parent=None):
data=data data = data
if parent <> self: if parent != self:
_base.TreeBuilder.insertText(self, data, parent) _base.TreeBuilder.insertText(self, data, parent)
else: else:
# HACK: allow text nodes as children of the document node # HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'): if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types: if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types) self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE) self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data)) self.dom.appendChild(self.dom.createTextNode(data))
implementation = DomImplementation
name = None name = None
def testSerializer(element): def testSerializer(element):
element.normalize() element.normalize()
rv = [] rv = []
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE: if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name: if element.name:
if element.publicId or element.systemId: if element.publicId or element.systemId:
publicId = element.publicId or "" publicId = element.publicId or ""
systemId = element.systemId or "" systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%( rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
' '*indent, element.name, publicId, systemId)) (' ' * indent, element.name, publicId, systemId))
else: else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name)) rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
else: else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,)) rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif element.nodeType == Node.DOCUMENT_NODE: elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document") rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment") rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE: elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue)) rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE: elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue)) rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
else: else:
if (hasattr(element, "namespaceURI") and if (hasattr(element, "namespaceURI") and
element.namespaceURI != None): element.namespaceURI is not None):
name = "%s %s"%(constants.prefixes[element.namespaceURI], name = "%s %s" % (constants.prefixes[element.namespaceURI],
element.nodeName) element.nodeName)
else: else:
name = element.nodeName name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name)) rv.append("|%s<%s>" % (' ' * indent, name))
if element.hasAttributes(): if element.hasAttributes():
i = 0 attributes = []
attr = element.attributes.item(i) for i in range(len(element.attributes)):
while attr: attr = element.attributes.item(i)
name = attr.nodeName name = attr.nodeName
value = attr.value value = attr.value
ns = attr.namespaceURI ns = attr.namespaceURI
if ns: if ns:
name = "%s %s"%(constants.prefixes[ns], attr.localName) name = "%s %s" % (constants.prefixes[ns], attr.localName)
else: else:
name = attr.nodeName name = attr.nodeName
i += 1 attributes.append((name, value))
attr = element.attributes.item(i)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2 indent += 2
for child in element.childNodes: for child in element.childNodes:
serializeElement(child, indent) serializeElement(child, indent)
serializeElement(element, 0) serializeElement(element, 0)
return "\n".join(rv) return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
    """Walk a DOM subtree and replay it as SAX events on ``handler``.

    node - the DOM node to emit (document, fragment, element, text or CDATA;
        every other node type is silently ignored)
    handler - object providing the SAX content-handler methods called here
        (startDocument/endDocument, startElement/endElement,
        startElementNS/endElementNS, startPrefixMapping/endPrefixMapping,
        characters)
    nsmap - mapping of prefix -> namespace URI currently in scope.  If it is
        empty/falsy, elements are reported with the non-namespace
        startElement/endElement events instead.

    The default ``nsmap`` dict is shared between calls; that is safe only
    because it is never modified in place -- it is copied before each write.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
            handler.startElement(node.nodeName, node.attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElement(node.nodeName)
        else:
            # minidom's itemsNS() yields ((namespaceURI, localName), value),
            # so every lookup into this dict must use localName, not nodeName.
            attributes = dict(node.attributes.itemsNS())

            # gather namespace declarations
            prefixes = []
            for attrname in node.attributes.keys():
                attr = node.getAttributeNode(attrname)
                isDeclaration = (
                    attr.namespaceURI == XMLNS_NAMESPACE or
                    (attr.namespaceURI is None and
                     (attr.nodeName == 'xmlns' or
                      attr.nodeName.startswith('xmlns:'))))
                if not isDeclaration:
                    continue
                # "xmlns" declares the default namespace (prefix None);
                # "xmlns:p" declares prefix "p".
                prefix = attr.nodeName.partition(':')[2] or None
                handler.startPrefixMapping(prefix, attr.nodeValue)
                prefixes.append(prefix)
                nsmap = nsmap.copy()
                nsmap[prefix] = attr.nodeValue
                attributes.pop((attr.namespaceURI, attr.localName), None)

            # apply namespace declarations to prefixed attributes that the
            # DOM stored without a namespace
            for attrname in node.attributes.keys():
                attr = node.getAttributeNode(attrname)
                if attr.namespaceURI is None and ':' in attr.nodeName:
                    prefix = attr.nodeName.split(':')[0]
                    if prefix in nsmap:
                        attributes.pop((attr.namespaceURI, attr.localName), None)
                        attributes[(nsmap[prefix], attr.localName)] = attr.nodeValue

            # SAX events
            ns = node.namespaceURI or nsmap.get(None, None)
            handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElementNS((ns, node.nodeName), node.nodeName)
            for prefix in prefixes:
                handler.endPrefixMapping(prefix)

    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
        handler.characters(node.nodeValue)

    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)
        handler.endDocument()

    elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)

    else:
        # ATTRIBUTE_NODE
        # ENTITY_NODE
        # PROCESSING_INSTRUCTION_NODE
        # COMMENT_NODE
        # DOCUMENT_TYPE_NODE
        # NOTATION_NODE
        pass
return locals() return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module # The actual means to get a module!
for key, value in getDomModule(minidom).__dict__.items(): getDomModule = moduleFactoryFactory(getDomBuilder)
globals()[key] = value

View File

@ -1,28 +1,21 @@
import new from __future__ import absolute_import, division, unicode_literals
from six import text_type
import re import re
import _base from . import _base
from html5lib import ihatexml from .. import ihatexml
from html5lib import constants from .. import constants
from html5lib.constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)") tag_regexp = re.compile("{([^}]*)}(.*)")
# Cache of generated builder modules, keyed by the generated module name
# (with a "_fulltree" suffix for modules built with fullTree=True).
moduleCache = {}


def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a module object holding the tree builder for an ElementTree API.

    ElementTreeImplementation - object with a ``__name__`` attribute; it is
        passed unchanged to ``getETreeBuilder``
    fullTree - forwarded to ``getETreeBuilder``

    The module is named ``"_<ElementTreeImplementation.__name__>builder"`` and
    its namespace is filled from the dict returned by ``getETreeBuilder``.
    Results are memoised in ``moduleCache``.  The cache key includes
    ``fullTree`` so that the two variants of the same implementation do not
    shadow each other; the key for ``fullTree=False`` is unchanged.
    """
    # Imported locally: types.ModuleType replaces the Python-2-only
    # ``new.module`` (which was an alias for the same type).
    from types import ModuleType

    name = "_" + ElementTreeImplementation.__name__ + "builder"
    cacheKey = name + "_fulltree" if fullTree else name
    try:
        return moduleCache[cacheKey]
    except KeyError:
        pass

    mod = ModuleType(name)
    mod.__dict__.update(getETreeBuilder(ElementTreeImplementation, fullTree))
    moduleCache[cacheKey] = mod
    return mod
def getETreeBuilder(ElementTreeImplementation, fullTree=False): def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node): class Element(_base.Node):
def __init__(self, name, namespace=None): def __init__(self, name, namespace=None):
self._name = name self._name = name
@ -41,16 +34,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if namespace is None: if namespace is None:
etree_tag = name etree_tag = name
else: else:
etree_tag = "{%s}%s"%(namespace, name) etree_tag = "{%s}%s" % (namespace, name)
return etree_tag return etree_tag
def _setName(self, name): def _setName(self, name):
self._name = name self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace) self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self): def _getName(self):
return self._name return self._name
name = property(_getName, _setName) name = property(_getName, _setName)
def _setNamespace(self, namespace): def _setNamespace(self, namespace):
@ -61,81 +54,82 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self._namespace return self._namespace
namespace = property(_getNamespace, _setNamespace) namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self): def _getAttributes(self):
return self._element.attrib return self._element.attrib
def _setAttributes(self, attributes): def _setAttributes(self, attributes):
#Delete existing attributes first # Delete existing attributes first
#XXX - there may be a better way to do this... # XXX - there may be a better way to do this...
for key in self._element.attrib.keys(): for key in list(self._element.attrib.keys()):
del self._element.attrib[key] del self._element.attrib[key]
for key, value in attributes.iteritems(): for key, value in attributes.items():
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1]) name = "{%s}%s" % (key[2], key[1])
else: else:
name = key name = key
self._element.set(name, value) self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self): def _getChildNodes(self):
return self._childNodes return self._childNodes
def _setChildNodes(self, value): def _setChildNodes(self, value):
del self._element[:] del self._element[:]
self._childNodes = [] self._childNodes = []
for element in value: for element in value:
self.insertChild(element) self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes) childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self): def hasContent(self):
"""Return true if the node has children or text""" """Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren()) return bool(self._element.text or len(self._element))
def appendChild(self, node): def appendChild(self, node):
self._childNodes.append(node) self._childNodes.append(node)
self._element.append(node._element) self._element.append(node._element)
node.parent = self node.parent = self
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element) index = list(self._element).index(refNode._element)
self._element.insert(index, node._element) self._element.insert(index, node._element)
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
self._element.remove(node._element) self._element.remove(node._element)
node.parent=None node.parent = None
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
if not(len(self._element)): if not(len(self._element)):
if not self._element.text: if not self._element.text:
self._element.text = "" self._element.text = ""
self._element.text += data self._element.text += data
elif insertBefore is None: elif insertBefore is None:
#Insert the text as the tail of the last child element # Insert the text as the tail of the last child element
if not self._element[-1].tail: if not self._element[-1].tail:
self._element[-1].tail = "" self._element[-1].tail = ""
self._element[-1].tail += data self._element[-1].tail += data
else: else:
#Insert the text before the specified node # Insert the text before the specified node
children = self._element.getchildren() children = list(self._element)
index = children.index(insertBefore._element) index = children.index(insertBefore._element)
if index > 0: if index > 0:
if not self._element[index-1].tail: if not self._element[index - 1].tail:
self._element[index-1].tail = "" self._element[index - 1].tail = ""
self._element[index-1].tail += data self._element[index - 1].tail += data
else: else:
if not self._element.text: if not self._element.text:
self._element.text = "" self._element.text = ""
self._element.text += data self._element.text += data
def cloneNode(self): def cloneNode(self):
element = Element(self.name, self.namespace) element = type(self)(self.name, self.namespace)
for name, value in self.attributes.iteritems(): for name, value in self.attributes.items():
element.attributes[name] = value element.attributes[name] = value
return element return element
def reparentChildren(self, newParent): def reparentChildren(self, newParent):
if newParent.childNodes: if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text newParent.childNodes[-1]._element.tail += self._element.text
@ -146,60 +140,60 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
newParent._element.text += self._element.text newParent._element.text += self._element.text
self._element.text = "" self._element.text = ""
_base.Node.reparentChildren(self, newParent) _base.Node.reparentChildren(self, newParent)
class Comment(Element): class Comment(Element):
def __init__(self, data): def __init__(self, data):
#Use the superclass constructor to set all properties on the # Use the superclass constructor to set all properties on the
#wrapper element # wrapper element
self._element = ElementTree.Comment(data) self._element = ElementTree.Comment(data)
self.parent = None self.parent = None
self._childNodes = [] self._childNodes = []
self._flags = [] self._flags = []
def _getData(self): def _getData(self):
return self._element.text return self._element.text
def _setData(self, value): def _setData(self, value):
self._element.text = value self._element.text = value
data = property(_getData, _setData) data = property(_getData, _setData)
class DocumentType(Element): class DocumentType(Element):
def __init__(self, name, publicId, systemId): def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>") Element.__init__(self, "<!DOCTYPE>")
self._element.text = name self._element.text = name
self.publicId = publicId self.publicId = publicId
self.systemId = systemId self.systemId = systemId
def _getPublicId(self): def _getPublicId(self):
return self._element.get(u"publicId", "") return self._element.get("publicId", "")
def _setPublicId(self, value): def _setPublicId(self, value):
if value is not None: if value is not None:
self._element.set(u"publicId", value) self._element.set("publicId", value)
publicId = property(_getPublicId, _setPublicId) publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self): def _getSystemId(self):
return self._element.get(u"systemId", "") return self._element.get("systemId", "")
def _setSystemId(self, value): def _setSystemId(self, value):
if value is not None: if value is not None:
self._element.set(u"systemId", value) self._element.set("systemId", value)
systemId = property(_getSystemId, _setSystemId) systemId = property(_getSystemId, _setSystemId)
class Document(Element): class Document(Element):
def __init__(self): def __init__(self):
Element.__init__(self, "<DOCUMENT_ROOT>") Element.__init__(self, "DOCUMENT_ROOT")
class DocumentFragment(Element): class DocumentFragment(Element):
def __init__(self): def __init__(self):
Element.__init__(self, "<DOCUMENT_FRAGMENT>") Element.__init__(self, "DOCUMENT_FRAGMENT")
def testSerializer(element): def testSerializer(element):
rv = [] rv = []
finalText = None
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not(hasattr(element, "tag")): if not(hasattr(element, "tag")):
element = element.getroot() element = element.getroot()
@ -207,19 +201,23 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if element.get("publicId") or element.get("systemId"): if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or "" publicId = element.get("publicId") or ""
systemId = element.get("systemId") or "" systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%( rv.append("""<!DOCTYPE %s "%s" "%s">""" %
element.text, publicId, systemId)) (element.text, publicId, systemId))
else: else:
rv.append("<!DOCTYPE %s>"%(element.text,)) rv.append("<!DOCTYPE %s>" % (element.text,))
elif element.tag == "<DOCUMENT_ROOT>": elif element.tag == "DOCUMENT_ROOT":
rv.append("#document") rv.append("#document")
if element.text: if element.text is not None:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text)) rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.tail: if element.tail is not None:
finalText = element.tail raise TypeError("Document node cannot have tail")
elif type(element.tag) == type(ElementTree.Comment): if hasattr(element, "attrib") and len(element.attrib):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
else: else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
nsmatch = tag_regexp.match(element.tag) nsmatch = tag_regexp.match(element.tag)
if nsmatch is None: if nsmatch is None:
@ -227,103 +225,113 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else: else:
ns, name = nsmatch.groups() ns, name = nsmatch.groups()
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name) name = "%s %s" % (prefix, name)
rv.append("|%s<%s>"%(' '*indent, name)) rv.append("|%s<%s>" % (' ' * indent, name))
if hasattr(element, "attrib"): if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems(): attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name) nsmatch = tag_regexp.match(name)
if nsmatch is not None: if nsmatch is not None:
ns, name = nsmatch.groups() ns, name = nsmatch.groups()
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name) attr_string = "%s %s" % (prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text: if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2 indent += 2
for child in element.getchildren(): for child in element:
serializeElement(child, indent) serializeElement(child, indent)
if element.tail: if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0) serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None
filter = ihatexml.InfosetFilter() filter = ihatexml.InfosetFilter()
def serializeElement(element): def serializeElement(element):
if type(element) == type(ElementTree.ElementTree): if isinstance(element, ElementTree.ElementTree):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"): if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or "" publicId = element.get("publicId") or ""
systemId = element.get("systemId") or "" systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%( rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
element.text, publicId, systemId)) (element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif type(element.tag) == type(ElementTree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else: else:
attr = " ".join(["%s=\"%s\""%( rv.append("<!DOCTYPE %s>" % (element.text,))
filter.fromXmlName(name), value) elif element.tag == "DOCUMENT_ROOT":
for name, value in element.attrib.iteritems()]) if element.text is not None:
rv.append("<%s %s>"%(element.tag, attr)) rv.append(element.text)
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
for child in element:
serializeElement(child)
elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\"" % (
filter.fromXmlName(name), value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text: if element.text:
rv.append(element.text) rv.append(element.text)
for child in element.getchildren(): for child in element:
serializeElement(child) serializeElement(child)
rv.append("</%s>"%(element.tag,)) rv.append("</%s>" % (element.tag,))
if element.tail: if element.tail:
rv.append(element.tail) rv.append(element.tail)
serializeElement(element) serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = Element elementClass = Element
commentClass = Comment commentClass = Comment
fragmentClass = DocumentFragment fragmentClass = DocumentFragment
implementation = ElementTreeImplementation
def testSerializer(self, element): def testSerializer(self, element):
return testSerializer(element) return testSerializer(element)
def getDocument(self): def getDocument(self):
if fullTree: if fullTree:
return self.document._element return self.document._element
else: else:
return self.document._element.find("html") if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html" % self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element return _base.TreeBuilder.getFragment(self)._element
return locals() return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)

View File

@ -1,20 +1,3 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much """Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent names that break between releases. The downside of this is that we cannot represent
@ -26,12 +9,34 @@ Docypes with no name
When any of these things occur, we emit a DataLossWarning When any of these things occur, we emit a DataLossWarning
""" """
from __future__ import absolute_import, division, unicode_literals
import warnings
import re
import sys
from . import _base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object): class DocumentType(object):
def __init__(self, name, publicId, systemId): def __init__(self, name, publicId, systemId):
self.name = name self.name = name
self.publicId = publicId self.publicId = publicId
self.systemId = systemId self.systemId = systemId
class Document(object): class Document(object):
def __init__(self): def __init__(self):
self._elementTree = None self._elementTree = None
@ -42,117 +47,126 @@ class Document(object):
def _getChildNodes(self): def _getChildNodes(self):
return self._childNodes return self._childNodes
childNodes = property(_getChildNodes) childNodes = property(_getChildNodes)
def testSerializer(element): def testSerializer(element):
rv = [] rv = []
finalText = None finalText = None
filter = ihatexml.InfosetFilter() infosetFilter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
if hasattr(element, "getroot"): if hasattr(element, "getroot"):
#Full tree case # Full tree case
rv.append("#document") rv.append("#document")
if element.docinfo.internalDTD: if element.docinfo.internalDTD:
if not (element.docinfo.public_id or if not (element.docinfo.public_id or
element.docinfo.system_url): element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
else: else:
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%( dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
element.docinfo.root_name, element.docinfo.root_name,
element.docinfo.public_id, element.docinfo.public_id,
element.docinfo.system_url) element.docinfo.system_url)
rv.append("|%s%s"%(' '*(indent+2), dtd_str)) rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
next_element = element.getroot() next_element = element.getroot()
while next_element.getprevious() is not None: while next_element.getprevious() is not None:
next_element = next_element.getprevious() next_element = next_element.getprevious()
while next_element is not None: while next_element is not None:
serializeElement(next_element, indent+2) serializeElement(next_element, indent + 2)
next_element = next_element.getnext() next_element = next_element.getnext()
elif isinstance(element, basestring): elif isinstance(element, str) or isinstance(element, bytes):
#Text in a fragment # Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element)) assert isinstance(element, str) or sys.version_info.major == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else: else:
#Fragment case # Fragment case
rv.append("#document-fragment") rv.append("#document-fragment")
for next_element in element: for next_element in element:
serializeElement(next_element, indent+2) serializeElement(next_element, indent + 2)
elif type(element.tag) == type(etree.Comment): elif element.tag == comment_type:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
else: else:
assert isinstance(element, etree._Element)
nsmatch = etree_builders.tag_regexp.match(element.tag) nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None: if nsmatch is not None:
ns = nsmatch.group(1) ns = nsmatch.group(1)
tag = nsmatch.group(2) tag = nsmatch.group(2)
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>"%(' '*indent, prefix, rv.append("|%s<%s %s>" % (' ' * indent, prefix,
filter.fromXmlName(tag))) infosetFilter.fromXmlName(tag)))
else: else:
rv.append("|%s<%s>"%(' '*indent, rv.append("|%s<%s>" % (' ' * indent,
filter.fromXmlName(element.tag))) infosetFilter.fromXmlName(element.tag)))
if hasattr(element, "attrib"): if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems(): attributes = []
nsmatch = etree_builders.tag_regexp.match(name) for name, value in element.attrib.items():
if nsmatch: nsmatch = tag_regexp.match(name)
ns = nsmatch.group(1) if nsmatch is not None:
name = nsmatch.group(2) ns, name = nsmatch.groups()
name = infosetFilter.fromXmlName(name)
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
rv.append('|%s%s %s="%s"' % (' '*(indent+2), attr_string = "%s %s" % (prefix, name)
prefix, else:
filter.fromXmlName(name), attr_string = infosetFilter.fromXmlName(name)
value)) attributes.append((attr_string, value))
else:
rv.append('|%s%s="%s"' % (' '*(indent+2), for name, value in sorted(attributes):
filter.fromXmlName(name), rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
value))
if element.text: if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2 indent += 2
for child in element.getchildren(): for child in element:
serializeElement(child, indent) serializeElement(child, indent)
if hasattr(element, "tail") and element.tail: if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0) serializeElement(element, 0)
if finalText is not None: if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText)) rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None finalText = None
def serializeElement(element): def serializeElement(element):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
if element.docinfo.internalDTD: if element.docinfo.internalDTD:
if element.docinfo.doctype: if element.docinfo.doctype:
dtd_str = element.docinfo.doctype dtd_str = element.docinfo.doctype
else: else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
rv.append(dtd_str) rv.append(dtd_str)
serializeElement(element.getroot()) serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment): elif element.tag == comment_type:
rv.append("<!--%s-->"%(element.text,)) rv.append("<!--%s-->" % (element.text,))
else: else:
#This is assumed to be an ordinary element # This is assumed to be an ordinary element
if not element.attrib: if not element.attrib:
rv.append("<%s>"%(element.tag,)) rv.append("<%s>" % (element.tag,))
else: else:
attr = " ".join(["%s=\"%s\""%(name, value) attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.iteritems()]) for name, value in element.attrib.items()])
rv.append("<%s %s>"%(element.tag, attr)) rv.append("<%s %s>" % (element.tag, attr))
if element.text: if element.text:
rv.append(element.text) rv.append(element.text)
for child in element.getchildren(): for child in element:
serializeElement(child) serializeElement(child)
rv.append("</%s>"%(element.tag,)) rv.append("</%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail: if hasattr(element, "tail") and element.tail:
rv.append(element.tail) rv.append(element.tail)
@ -160,56 +174,57 @@ def tostring(element):
serializeElement(element) serializeElement(element)
if finalText is not None: if finalText is not None:
rv.append("%s\""%(' '*2, finalText)) rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = None elementClass = None
commentClass = None commentClass = None
fragmentClass = Document fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree = False): def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree) builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter() infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict): class Attributes(dict):
def __init__(self, element, value={}): def __init__(self, element, value={}):
self._element = element self._element = element
dict.__init__(self, value) dict.__init__(self, value)
for key, value in self.iteritems(): for key, value in self.items():
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else: else:
name = filter.coerceAttribute(key) name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value self._element._element.attrib[name] = value
def __setitem__(self, key, value): def __setitem__(self, key, value):
dict.__setitem__(self, key, value) dict.__setitem__(self, key, value)
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else: else:
name = filter.coerceAttribute(key) name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value self._element._element.attrib[name] = value
class Element(builder.Element): class Element(builder.Element):
def __init__(self, name, namespace): def __init__(self, name, namespace):
name = filter.coerceElement(name) name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace) builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self) self._attributes = Attributes(self)
def _setName(self, name): def _setName(self, name):
self._name = filter.coerceElement(name) self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag( self._element.tag = self._getETreeTag(
self._name, self._namespace) self._name, self._namespace)
def _getName(self): def _getName(self):
return filter.fromXmlName(self._name) return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName) name = property(_getName, _setName)
def _getAttributes(self): def _getAttributes(self):
@ -217,24 +232,23 @@ class TreeBuilder(_base.TreeBuilder):
def _setAttributes(self, attributes): def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes) self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data) data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore) builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child): def appendChild(self, child):
builder.Element.appendChild(self, child) builder.Element.appendChild(self, child)
class Comment(builder.Comment): class Comment(builder.Comment):
def __init__(self, data): def __init__(self, data):
data = filter.coerceComment(data) data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data) builder.Comment.__init__(self, data)
def _setData(self, data): def _setData(self, data):
data = filter.coerceComment(data) data = infosetFilter.coerceComment(data)
self._element.text = data self._element.text = data
def _getData(self): def _getData(self):
@ -244,9 +258,9 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element self.elementClass = Element
self.commentClass = builder.Comment self.commentClass = builder.Comment
#self.fragmentClass = builder.DocumentFragment # self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements) _base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self): def reset(self):
_base.TreeBuilder.reset(self) _base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial self.insertComment = self.insertCommentInitial
@ -261,13 +275,13 @@ class TreeBuilder(_base.TreeBuilder):
return self.document._elementTree return self.document._elementTree
else: else:
return self.document._elementTree.getroot() return self.document._elementTree.getroot()
def getFragment(self): def getFragment(self):
fragment = [] fragment = []
element = self.openElements[0]._element element = self.openElements[0]._element
if element.text: if element.text:
fragment.append(element.text) fragment.append(element.text)
fragment.extend(element.getchildren()) fragment.extend(list(element))
if element.tail: if element.tail:
fragment.append(element.tail) fragment.append(element.tail)
return fragment return fragment
@ -277,59 +291,79 @@ class TreeBuilder(_base.TreeBuilder):
publicId = token["publicId"] publicId = token["publicId"]
systemId = token["systemId"] systemId = token["systemId"]
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"': if not name:
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None): def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data) self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token): def insertRoot(self, token):
"""Create the document root""" """Create the document root"""
#Because of the way libxml2 works, it doesn't seem to be possible to # Because of the way libxml2 works, it doesn't seem to be possible to
#alter information like the doctype after the tree has been parsed. # alter information like the doctype after the tree has been parsed.
#Therefore we need to use the built-in parser to create our iniial # Therefore we need to use the built-in parser to create our iniial
#tree, after which we can add elements like normal # tree, after which we can add elements like normal
docStr = "" docStr = ""
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'): if self.doctype:
docStr += "<!DOCTYPE %s"%self.doctype.name assert self.doctype.name
if (self.doctype.publicId is not None or docStr += "<!DOCTYPE %s" % self.doctype.name
self.doctype.systemId is not None): if (self.doctype.publicId is not None or
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "", self.doctype.systemId is not None):
self.doctype.systemId or "") docStr += (' PUBLIC "%s" ' %
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
if self.doctype.systemId:
sysid = self.doctype.systemId
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">" docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>" docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
root = etree.fromstring(docStr)
try:
root = etree.fromstring(docStr) # Append the initial comments:
except etree.XMLSyntaxError:
print docStr
raise
#Append the initial comments:
for comment_token in self.initial_comments: for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"])) root.addprevious(etree.Comment(comment_token["data"]))
#Create the root document and add the ElementTree to it # Create the root document and add the ElementTree to it
self.document = self.documentClass() self.document = self.documentClass()
self.document._elementTree = root.getroottree() self.document._elementTree = root.getroottree()
# Give the root element the right name # Give the root element the right name
name = token["name"] name = token["name"]
namespace = token.get("namespace", self.defaultNamespace) namespace = token.get("namespace", self.defaultNamespace)
if namespace is None: if namespace is None:
etree_tag = name etree_tag = name
else: else:
etree_tag = "{%s}%s"%(namespace, name) etree_tag = "{%s}%s" % (namespace, name)
root.tag = etree_tag root.tag = etree_tag
#Add the root element to the internal child/open data structures # Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace) root_element = self.elementClass(name, namespace)
root_element._element = root root_element._element = root
self.document._childNodes.append(root_element) self.document._childNodes.append(root_element)
self.openElements.append(root_element) self.openElements.append(root_element)
#Reset to the default insert comment function # Reset to the default insert comment function
self.insertComment = super(TreeBuilder, self).insertComment self.insertComment = self.insertCommentMain

View File

@ -1,248 +0,0 @@
import _base
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
type = -1
def __init__(self, name):
self.name = name
self.parent = None
self.value = None
self.childNodes = []
self._flags = []
def __iter__(self):
for node in self.childNodes:
yield node
for item in node:
yield item
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
tree += child.printTree(indent + 2)
return tree
def appendChild(self, node):
if (isinstance(node, TextNode) and self.childNodes and
isinstance(self.childNodes[-1], TextNode)):
self.childNodes[-1].value += node.value
else:
self.childNodes.append(node)
node.parent = self
def insertText(self, data, insertBefore=None):
if insertBefore is None:
self.appendChild(TextNode(data))
else:
self.insertBefore(TextNode(data), insertBefore)
def insertBefore(self, node, refNode):
index = self.childNodes.index(refNode)
if (isinstance(node, TextNode) and index > 0 and
isinstance(self.childNodes[index - 1], TextNode)):
self.childNodes[index - 1].value += node.value
else:
self.childNodes.insert(index, node)
node.parent = self
def removeChild(self, node):
try:
self.childNodes.remove(node)
except:
# XXX
raise
node.parent = None
def cloneNode(self):
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node):
type = 1
def __init__(self):
Node.__init__(self, None)
def __unicode__(self):
return "#document"
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
result += child.toxml()
return result.encode(encoding)
def hilite(self, encoding="utf-8"):
result = "<pre>"
for child in self.childNodes:
result += child.hilite()
return result.encode(encoding) + "</pre>"
def printTree(self):
tree = unicode(self)
for child in self.childNodes:
tree += child.printTree(2)
return tree
def cloneNode(self):
return Document()
class DocumentFragment(Document):
type = 2
def __unicode__(self):
return "#document-fragment"
def cloneNode(self):
return DocumentFragment()
class DocumentType(Node):
    """DOCTYPE node carrying a name plus public and system identifiers."""
    type = 3

    def __init__(self, name, publicId, systemId):
        Node.__init__(self, name)
        self.publicId, self.systemId = publicId, systemId

    def __unicode__(self):
        # Short form when neither identifier is set.
        if not (self.publicId or self.systemId):
            return u"<!DOCTYPE %s>" % self.name
        # Long form: a missing identifier is rendered as an empty string.
        return """<!DOCTYPE %s "%s" "%s">"""%(
            self.name, self.publicId or "", self.systemId or "")

    # XML serialization is the same text as the unicode form.
    toxml = __unicode__

    def hilite(self):
        """Return highlighted markup; only the doctype name is shown."""
        return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name

    def cloneNode(self):
        """Return a new DocumentType with the same name and identifiers."""
        return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node):
    """Character-data node; the text is held in ``value`` (type 4)."""
    type = 4

    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        return u"\"%s\"" % self.value

    def cloneNode(self):
        """Return a new TextNode holding the same value."""
        return TextNode(self.value)

    def toxml(self):
        """Return the text passed through ``escape``."""
        return escape(self.value)

    # Highlighted output for text is identical to its XML form.
    hilite = toxml
class Element(Node):
    """Element node with a name, optional namespace and attribute dict."""
    type = 5

    def __init__(self, name, namespace=None):
        Node.__init__(self, name)
        self.namespace = namespace
        self.attributes = {}

    def __unicode__(self):
        # Identity comparison is the correct test for None (PEP 8).
        if self.namespace is None:
            return u"<%s>" % self.name
        return u"<%s %s>"%(prefixes[self.namespace], self.name)

    def toxml(self):
        """Serialize this element and its subtree as XML text.

        Attribute values are escaped (including double quotes); an element
        without children is written in self-closing form.
        """
        parts = ['<' + self.name]
        if self.attributes:
            for name, value in self.attributes.iteritems():
                parts.append(
                    u' %s="%s"' % (name, escape(value, {'"': '&quot;'})))
        if self.childNodes:
            parts.append('>')
            for child in self.childNodes:
                parts.append(child.toxml())
            parts.append(u'</%s>' % self.name)
        else:
            parts.append(u'/>')
        return "".join(parts)

    def hilite(self):
        """Return HTML markup that displays this element highlighted.

        Void elements without children get no end tag.
        """
        result = '&lt;<code class="markup element-name">%s</code>' % self.name
        if self.attributes:
            for name, value in self.attributes.iteritems():
                result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
        # Always close the start tag. Previously an empty non-void element
        # skipped this and rendered as "<p</p>".
        result += ">"
        if self.childNodes:
            for child in self.childNodes:
                result += child.hilite()
        elif self.name in voidElements:
            return result
        return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name

    def printTree(self, indent):
        """Return the indented tree dump for this element and its subtree.

        Attributes and children are listed two columns deeper than the
        element itself; tuple attribute names are printed as "first second".
        """
        tree = '\n|%s%s' % (' '*indent, unicode(self))
        indent += 2
        if self.attributes:
            for name, value in self.attributes.iteritems():
                if isinstance(name, tuple):
                    name = "%s %s"%(name[0], name[1])
                tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def cloneNode(self):
        """Return a childless copy with the same name, namespace, attributes."""
        # getattr keeps the old tolerance for instances lacking ``namespace``.
        newNode = Element(self.name, getattr(self, 'namespace', None))
        newNode.attributes.update(self.attributes)
        return newNode
class CommentNode(Node):
    """Comment node; the comment text is held in ``data`` (type 6)."""
    type = 6

    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data

    def cloneNode(self):
        """Return a new CommentNode with the same data."""
        return CommentNode(self.data)

    def __unicode__(self):
        # Tree-dump form pads the text with spaces.
        return "<!-- %s -->" % self.data

    def toxml(self):
        """Return the comment as markup; the data is written unescaped."""
        return "<!--%s-->" % self.data

    def hilite(self):
        """Return highlighted markup with the data passed through escape."""
        escaped = escape(self.data)
        return '<code class="markup comment">&lt;!--%s--></code>' % escaped
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces the simpletree node classes above."""
    # Node factories for each kind of node.
    elementClass = Element
    commentClass = CommentNode
    doctypeClass = DocumentType
    documentClass = Document
    fragmentClass = DocumentFragment

    def testSerializer(self, node):
        """Return the tree dump produced by ``node.printTree()``."""
        dump = node.printTree()
        return dump

View File

@ -1,228 +0,0 @@
import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
    """Mapping-style view over the attributes of a wrapped element.

    ``attrs`` is a dict snapshot built from ``element.attrs`` at
    construction; writes go to the element and to the snapshot.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        # Iteration yields (name, value) pairs, not bare names.
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        # The stray no-op expression that used to sit here is removed.
        self.element[name] = value
        # Keep the snapshot in step so later reads on this object see it.
        self.attrs[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
class Element(_base.Node):
    """Tree-builder node wrapping a BeautifulSoup object.

    ``element`` is the wrapped soup object, ``soup`` the owning soup and
    ``namespace`` the namespace recorded for the node.
    """

    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        """Return the position of ``refNode.element`` in our contents.

        The match is by identity rather than equality. ``node`` is unused.
        Returns None when no entry matches.
        """
        target = refNode.element
        for index, candidate in enumerate(self.element.contents):
            if candidate is target:
                return index
        return None

    def appendChild(self, node):
        """Append ``node``, merging it into a trailing text node if any."""
        if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)
            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()
            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            # NOTE(review): indentation was lost in the source listing; the
            # parent assignment is assumed to belong to this branch only.
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert ``data`` as text, before ``insertBefore`` or at the end."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert ``node`` before ``refNode``, merging adjacent text.

        Raises ValueError if ``refNode`` is not among our contents.
        """
        index = self._nodeIndex(node, refNode)
        if index is None:
            # Previously this fell through to an obscure TypeError.
            raise ValueError("reference node is not a child of this element")
        # ``index > 0`` is required: at index 0 there is no predecessor and
        # contents[index-1] would wrap around to the LAST child.
        if (node.element.__class__ == NavigableString and index > 0
                and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()
            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            # NOTE(review): assumed to belong to this branch only -- confirm.
            node.parent = self

    def removeChild(self, node):
        """Detach ``node`` from its parent's contents and clear its parent."""
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this element onto ``newParent`` in order."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Return a childless copy with the same name and attributes."""
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        # Returns the contents list itself; callers rely on its truthiness.
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        return self.namespace, self.name

    nameTuple = property(getNameTuple)
class TextNode(Element):
    """Element wrapper for a non-tag soup object (no name, no namespace)."""

    def __init__(self, element, soup):
        # Bypasses Element.__init__: the node is initialised with name None.
        _base.Node.__init__(self, None)
        self.element, self.soup = element, soup

    def cloneNode(self):
        """Cloning is not supported for these nodes."""
        raise NotImplementedError()
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that assembles the parse result as a BeautifulSoup tree."""

    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        """Create a fresh, empty soup and return it wrapped as the root."""
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)

    def fragmentClass(self):
        """Create a fresh soup marked as a fragment and return it wrapped."""
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def elementClass(self, name, namespace):
        """Return a wrapped new Tag; warns when a namespace is given."""
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        tag = Tag(self.soup, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a wrapped Comment holding ``data``."""
        return TextNode(Comment(data), self.soup)

    def insertDoctype(self, token):
        """Insert a Declaration built from the doctype token at position 0."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        # A public id takes precedence; a lone system id uses the SYSTEM form.
        if publicId:
            text = "%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")
        elif systemId:
            text = "%s SYSTEM \"%s\""% (name, systemId)
        else:
            text = name
        self.soup.insert(0, Declaration(text))

    def appendChild(self, node):
        """Append the wrapped object of ``node`` to the end of the soup."""
        position = len(self.soup.contents)
        self.soup.insert(position, node.element)

    def testSerializer(self, element):
        # Resolves to the module-level testSerializer function.
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        fragment = _base.TreeBuilder.getFragment(self)
        return fragment.element
def testSerializer(element):
    """Return a line-per-node text dump of a BeautifulSoup tree.

    Every line except the document/fragment label starts with "|" and the
    node's indentation; children are indented two columns deeper.
    """
    import re
    lines = []

    def serializeElement(element, indent=0):
        pad = ' '*indent
        if isinstance(element, Declaration):
            doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
            match = re.match(doctype_regexp, element.string)
            assert match is not None, "DOCTYPE did not match expected format"
            name = match.group('name')
            publicId = match.group('publicId')
            # With a public id the system id defaults to ""; otherwise it
            # comes from the SYSTEM alternative and may be None.
            if publicId is None:
                systemId = match.group('systemId2')
            else:
                systemId = match.group('systemId1') or ""
            if publicId is None and systemId is None:
                lines.append("|%s<!DOCTYPE %s>"%(pad, name))
            else:
                lines.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
                             (pad, name, publicId or "", systemId or ""))
        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                lines.append("#document-fragment")
            else:
                lines.append("#document")
        elif isinstance(element, Comment):
            lines.append("|%s<!-- %s -->"%(pad, element.string))
        elif isinstance(element, unicode):
            lines.append("|%s\"%s\"" %(pad, element))
        else:
            lines.append("|%s<%s>"%(pad, element.name))
            if element.attrs:
                for name, value in element.attrs:
                    lines.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        # Recurse into anything that exposes ``contents``.
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent + 2)

    serializeElement(element, 0)
    return "\n".join(lines)

View File

@ -8,23 +8,27 @@ implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens. returning an iterator generating tokens.
""" """
from __future__ import absolute_import, division, unicode_literals
import sys
from ..utils import default_etree
treeWalkerCache = {} treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs): def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support """Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree" and "beautifulsoup" values are:
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation "dom" - The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream "pulldom" - The xml.dom.pulldom event stream
"etree" - A generic walker for tree implementations exposing an "etree" - A generic walker for tree implementations exposing an
elementtree-like interface (known to work with elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree). ElementTree, cElementTree and lxml.etree).
"lxml" - Optimized walker for lxml.etree "lxml" - Optimized walker for lxml.etree
"beautifulsoup" - Beautiful soup (if installed)
"genshi" - a Genshi stream "genshi" - a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module implementation - (Currently applies to the "etree" tree type only). A module
@ -33,20 +37,21 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeWalkerCache: if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom", "simpletree"): if treeType in ("dom", "pulldom"):
mod = __import__(treeType, globals()) name = "%s.%s" % (__name__, treeType)
__import__(name)
mod = sys.modules[name]
treeWalkerCache[treeType] = mod.TreeWalker treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi": elif treeType == "genshi":
import genshistream from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker treeWalkerCache[treeType] = genshistream.TreeWalker
elif treeType == "beautifulsoup":
import soup
treeWalkerCache[treeType] = soup.TreeWalker
elif treeType == "lxml": elif treeType == "lxml":
import lxmletree from . import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker treeWalkerCache[treeType] = lxmletree.TreeWalker
elif treeType == "etree": elif treeType == "etree":
import etree from . import etree
if implementation is None:
implementation = default_etree
# XXX: NEVER cache here, caching is done in the etree submodule # XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType) return treeWalkerCache.get(treeType)

View File

@ -1,8 +1,40 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters from xml.dom import Node
spaceCharacters = u"".join(spaceCharacters)
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
    """Return ``s`` as text, mapping None to "" (or to None on request).

    Values that are already ``text_type`` are returned unchanged; anything
    else is passed through ``text_type``. When ``s`` is None the result is
    "" if ``blank_if_none`` is true, otherwise None.
    """
    if s is not None:
        return s if isinstance(s, text_type) else text_type(s)
    if blank_if_none:
        return ""
    return None
def is_text_or_none(string):
    """Return True when ``string`` is None or an instance of string_types."""
    if string is None:
        return True
    return isinstance(string, string_types)
class TreeWalker(object): class TreeWalker(object):
def __init__(self, tree): def __init__(self, tree):
@ -14,36 +46,50 @@ class TreeWalker(object):
def error(self, msg): def error(self, msg):
return {"type": "SerializeError", "data": msg} return {"type": "SerializeError", "data": msg}
def normalizeAttrs(self, attrs):
if not attrs:
attrs = []
elif hasattr(attrs, 'items'):
attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, namespace, name, attrs, hasChildren=False): def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), assert namespace is None or isinstance(namespace, string_types), type(namespace)
"namespace":unicode(namespace), assert isinstance(name, string_types), type(name)
"data": self.normalizeAttrs(attrs)} assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
yield {"type": "EmptyTag", "name": to_text(name, False),
"namespace": to_text(namespace),
"data": attrs}
if hasChildren: if hasChildren:
yield self.error(_("Void element has children")) yield self.error(_("Void element has children"))
def startTag(self, namespace, name, attrs): def startTag(self, namespace, name, attrs):
return {"type": "StartTag", assert namespace is None or isinstance(namespace, string_types), type(namespace)
"name": unicode(name), assert isinstance(name, string_types), type(name)
"namespace":unicode(namespace), assert all((namespace is None or isinstance(namespace, string_types)) and
"data": self.normalizeAttrs(attrs)} isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
return {"type": "StartTag",
"name": text_type(name),
"namespace": to_text(namespace),
"data": dict(((to_text(namespace, False), to_text(name)),
to_text(value, False))
for (namespace, name), value in attrs.items())}
def endTag(self, namespace, name): def endTag(self, namespace, name):
return {"type": "EndTag", assert namespace is None or isinstance(namespace, string_types), type(namespace)
"name": unicode(name), assert isinstance(name, string_types), type(namespace)
"namespace":unicode(namespace),
"data": []} return {"type": "EndTag",
"name": to_text(name, False),
"namespace": to_text(namespace),
"data": {}}
def text(self, data): def text(self, data):
data = unicode(data) assert isinstance(data, string_types), type(data)
data = to_text(data)
middle = data.lstrip(spaceCharacters) middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)] left = data[:len(data) - len(middle)]
if left: if left:
yield {"type": "SpaceCharacters", "data": left} yield {"type": "SpaceCharacters", "data": left}
data = middle data = middle
@ -55,52 +101,40 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right} yield {"type": "SpaceCharacters", "data": right}
def comment(self, data): def comment(self, data):
return {"type": "Comment", "data": unicode(data)} assert isinstance(data, string_types), type(data)
return {"type": "Comment", "data": text_type(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True): def doctype(self, name, publicId=None, systemId=None, correct=True):
assert is_text_or_none(name), type(name)
assert is_text_or_none(publicId), type(publicId)
assert is_text_or_none(systemId), type(systemId)
return {"type": "Doctype", return {"type": "Doctype",
"name": name is not None and unicode(name) or u"", "name": to_text(name),
"publicId": publicId, "publicId": to_text(publicId),
"systemId": systemId, "systemId": to_text(systemId),
"correct": correct} "correct": to_text(correct)}
def entity(self, name):
assert isinstance(name, string_types), type(name)
return {"type": "Entity", "name": text_type(name)}
def unknown(self, nodeType): def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType) return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker): class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node): def getNodeDetails(self, node):
raise NotImplementedError raise NotImplementedError
def getFirstChild(self, node): def getFirstChild(self, node):
raise NotImplementedError raise NotImplementedError
def getNextSibling(self, node): def getNextSibling(self, node):
raise NotImplementedError raise NotImplementedError
def getParentNode(self, node): def getParentNode(self, node):
raise NotImplementedError raise NotImplementedError
@ -110,7 +144,6 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode) details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:] type, details = details[0], details[1:]
hasChildren = False hasChildren = False
endTag = None
if type == DOCTYPE: if type == DOCTYPE:
yield self.doctype(*details) yield self.doctype(*details)
@ -122,28 +155,30 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT: elif type == ELEMENT:
namespace, name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name in voidElements: if name in voidElements:
for token in self.emptyTag(namespace, name, attributes, for token in self.emptyTag(namespace, name, attributes,
hasChildren): hasChildren):
yield token yield token
hasChildren = False hasChildren = False
else: else:
endTag = name
yield self.startTag(namespace, name, attributes) yield self.startTag(namespace, name, attributes)
elif type == COMMENT: elif type == COMMENT:
yield self.comment(details[0]) yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT: elif type == DOCUMENT:
hasChildren = True hasChildren = True
else: else:
yield self.unknown(details[0]) yield self.unknown(details[0])
if hasChildren: if hasChildren:
firstChild = self.getFirstChild(currentNode) firstChild = self.getFirstChild(currentNode)
else: else:
firstChild = None firstChild = None
if firstChild is not None: if firstChild is not None:
currentNode = firstChild currentNode = firstChild
else: else:

View File

@ -1,10 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node from xml.dom import Node
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
import _base from . import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node): def getNodeDetails(self, node):
@ -15,8 +17,15 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE: elif node.nodeType == Node.ELEMENT_NODE:
return (_base.ELEMENT, node.namespaceURI, node.nodeName, attrs = {}
node.attributes.items(), node.hasChildNodes) for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
if attr.namespaceURI:
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE: elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue return _base.COMMENT, node.nodeValue

View File

@ -1,30 +1,28 @@
from __future__ import absolute_import, division, unicode_literals
try:
from collections import OrderedDict
except ImportError:
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
import new
import copy
import re import re
import _base from six import text_type
from html5lib.constants import voidElements
from . import _base
from ..utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)") tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation): def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given the particular ElementTree representation, this implementation, """Given the particular ElementTree representation, this implementation,
@ -32,16 +30,16 @@ def getETreeBuilder(ElementTreeImplementation):
content: content:
1. The current element 1. The current element
2. The index of the element relative to its parent 2. The index of the element relative to its parent
3. A stack of ancestor elements 3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a 4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1) text node; either the text or tail of the current element (1)
""" """
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node elt, key, parents, flag = node
if flag in ("text", "tail"): if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag) return _base.TEXT, getattr(elt, flag)
@ -51,33 +49,41 @@ def getETreeBuilder(ElementTreeImplementation):
if not(hasattr(node, "tag")): if not(hasattr(node, "tag")):
node = node.getroot() node = node.getroot()
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"): if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>": elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text, return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId")) node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment): elif node.tag == ElementTreeCommentType:
return _base.COMMENT, node.text return _base.COMMENT, node.text
else: else:
#This is assumed to be an ordinary element assert type(node.tag) == text_type, type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag) match = tag_regexp.match(node.tag)
if match: if match:
namespace, tag = match.groups() namespace, tag = match.groups()
else: else:
namespace = None namespace = None
tag = node.tag tag = node.tag
return (_base.ELEMENT, namespace, tag, attrs = OrderedDict()
node.attrib.items(), len(node) or node.text) for name, value in list(node.attrib.items()):
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node): def getFirstChild(self, node):
if isinstance(node, tuple): if isinstance(node, tuple):
element, key, parents, flag = node element, key, parents, flag = node
else: else:
element, key, parents, flag = node, None, [], None element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"): if flag in ("text", "tail"):
return None return None
else: else:
@ -88,13 +94,13 @@ def getETreeBuilder(ElementTreeImplementation):
return element[0], 0, parents, None return element[0], 0, parents, None
else: else:
return None return None
def getNextSibling(self, node): def getNextSibling(self, node):
if isinstance(node, tuple): if isinstance(node, tuple):
element, key, parents, flag = node element, key, parents, flag = node
else: else:
return None return None
if flag == "text": if flag == "text":
if len(element): if len(element):
parents.append(element) parents.append(element)
@ -105,16 +111,16 @@ def getETreeBuilder(ElementTreeImplementation):
if element.tail and flag != "tail": if element.tail and flag != "tail":
return element, key, parents, "tail" return element, key, parents, "tail"
elif key < len(parents[-1]) - 1: elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None return parents[-1][key + 1], key + 1, parents, None
else: else:
return None return None
def getParentNode(self, node): def getParentNode(self, node):
if isinstance(node, tuple): if isinstance(node, tuple):
element, key, parents, flag = node element, key, parents, flag = node
else: else:
return None return None
if flag == "text": if flag == "text":
if not parents: if not parents:
return element return element
@ -128,3 +134,5 @@ def getETreeBuilder(ElementTreeImplementation):
return parent, list(parents[-1]).index(parent), parents, None return parent, list(parents[-1]).index(parent), parents, None
return locals() return locals()
getETreeModule = moduleFactoryFactory(getETreeBuilder)

View File

@ -1,50 +1,49 @@
from __future__ import absolute_import, division, unicode_literals
from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base from . import _base
from ..constants import voidElements, namespaces
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker): class TreeWalker(_base.TreeWalker):
def __iter__(self): def __iter__(self):
depth = 0 # Buffer the events so we can pass in the following one
ignore_until = None
previous = None previous = None
for event in self.tree: for event in self.tree:
if previous is not None: if previous is not None:
if previous[0] == START: for token in self.tokens(previous, event):
depth += 1
if ignore_until <= depth:
ignore_until = None
if ignore_until is None:
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = depth
if previous[0] == END:
depth -= 1
previous = event
if previous is not None:
if ignore_until is None or ignore_until <= depth:
for token in self.tokens(previous, None):
yield token yield token
elif ignore_until is not None: previous = event
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
# Don't forget the final event!
if previous is not None:
for token in self.tokens(previous, None):
yield token
def tokens(self, event, next): def tokens(self, event, next):
kind, data, pos = event kind, data, pos = event
if kind == START: if kind == START:
tag, attrib = data tag, attribs = data
name = tag.localname name = tag.localname
namespace = tag.namespace namespace = tag.namespace
if tag in voidElements: converted_attribs = {}
for token in self.emptyTag(namespace, name, list(attrib), for k, v in attribs:
not next or next[0] != END if isinstance(k, QName):
converted_attribs[(k.namespace, k.localname)] = v
else:
converted_attribs[(None, k)] = v
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END
or next[1] != tag): or next[1] != tag):
yield token yield token
else: else:
yield self.startTag(namespace, name, list(attrib)) yield self.startTag(namespace, name, converted_attribs)
elif kind == END: elif kind == END:
name = data.localname name = data.localname
@ -62,8 +61,8 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE: elif kind == DOCTYPE:
yield self.doctype(*data) yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \ elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
START_CDATA, END_CDATA, PI): START_CDATA, END_CDATA, PI):
pass pass
else: else:

View File

@ -1,22 +1,35 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from lxml import etree from lxml import etree
from html5lib.treebuilders.etree import tag_regexp from ..treebuilders.etree import tag_regexp
from gettext import gettext from gettext import gettext
_ = gettext _ = gettext
import _base from . import _base
from .. import ihatexml
def ensure_str(s):
    """Return ``s`` as text, decoding non-text values as strict UTF-8.

    None and values that are already ``text_type`` are returned unchanged.
    """
    if s is None or isinstance(s, text_type):
        return s
    return s.decode("utf-8", "strict")
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object): class Root(object):
def __init__(self, et): def __init__(self, et):
self.elementtree = et self.elementtree = et
self.children = [] self.children = []
if et.docinfo.internalDTD: if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name, self.children.append(Doctype(self,
et.docinfo.public_id, ensure_str(et.docinfo.root_name),
et.docinfo.system_url)) ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
root = et.getroot() root = et.getroot()
node = root node = root
@ -28,7 +41,7 @@ class Root(object):
self.text = None self.text = None
self.tail = None self.tail = None
def __getitem__(self, key): def __getitem__(self, key):
return self.children[key] return self.children[key]
@ -38,19 +51,21 @@ class Root(object):
def __len__(self): def __len__(self):
return 1 return 1
class Doctype(object): class Doctype(object):
def __init__(self, root_node, name, public_id, system_id): def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node self.root_node = root_node
self.name = name self.name = name
self.public_id = public_id self.public_id = public_id
self.system_id = system_id self.system_id = system_id
self.text = None self.text = None
self.tail = None self.tail = None
def getnext(self): def getnext(self):
return self.root_node.children[1] return self.root_node.children[1]
class FragmentRoot(Root): class FragmentRoot(Root):
def __init__(self, children): def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children] self.children = [FragmentWrapper(self, child) for child in children]
@ -59,23 +74,27 @@ class FragmentRoot(Root):
def getnext(self): def getnext(self):
return None return None
class FragmentWrapper(object): class FragmentWrapper(object):
def __init__(self, fragment_root, obj): def __init__(self, fragment_root, obj):
self.root_node = fragment_root self.root_node = fragment_root
self.obj = obj self.obj = obj
if hasattr(self.obj, 'text'): if hasattr(self.obj, 'text'):
self.text = self.obj.text self.text = ensure_str(self.obj.text)
else: else:
self.text = None self.text = None
if hasattr(self.obj, 'tail'): if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail self.tail = ensure_str(self.obj.tail)
else: else:
self.tail = None self.tail = None
self.isstring = isinstance(obj, basestring) self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
# Support for bytes here is Py2
if self.isstring:
self.obj = ensure_str(self.obj)
def __getattr__(self, name): def __getattr__(self, name):
return getattr(self.obj, name) return getattr(self.obj, name)
def getnext(self): def getnext(self):
siblings = self.root_node.children siblings = self.root_node.children
idx = siblings.index(self) idx = siblings.index(self)
@ -87,7 +106,7 @@ class FragmentWrapper(object):
def __getitem__(self, key): def __getitem__(self, key):
return self.obj[key] return self.obj[key]
def __nonzero__(self): def __bool__(self):
return bool(self.obj) return bool(self.obj)
def getparent(self): def getparent(self):
@ -96,10 +115,13 @@ class FragmentWrapper(object):
def __str__(self): def __str__(self):
return str(self.obj) return str(self.obj)
def __unicode__(self):
return str(self.obj)
def __len__(self): def __len__(self):
return len(self.obj) return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree): def __init__(self, tree):
if hasattr(tree, "getroot"): if hasattr(tree, "getroot"):
@ -108,11 +130,12 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
tree = FragmentRoot(tree) tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree) _base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter() self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key) return _base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root): elif isinstance(node, Root):
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
@ -121,23 +144,33 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.DOCTYPE, node.name, node.public_id, node.system_id return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring: elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node return _base.TEXT, node.obj
elif node.tag == etree.Comment: elif node.tag == etree.Comment:
return _base.COMMENT, node.text return _base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else: else:
#This is assumed to be an ordinary element # This is assumed to be an ordinary element
match = tag_regexp.match(node.tag) match = tag_regexp.match(ensure_str(node.tag))
if match: if match:
namespace, tag = match.groups() namespace, tag = match.groups()
else: else:
namespace = None namespace = None
tag = node.tag tag = ensure_str(node.tag)
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), attrs = {}
[(self.filter.fromXmlName(name), value) for for name, value in list(node.attrib.items()):
name,value in node.attrib.iteritems()], name = ensure_str(name)
len(node) > 0 or node.text) value = ensure_str(value)
match = tag_regexp.match(name)
if match:
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node): def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children") assert not isinstance(node, tuple), _("Text nodes have no children")
@ -149,7 +182,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return node[0] return node[0]
def getNextSibling(self, node): def getNextSibling(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text": if key == "text":
@ -159,13 +192,13 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return node[0] return node[0]
else: else:
return None return None
else: # tail else: # tail
return node.getnext() return node.getnext()
return node.tail and (node, "tail") or node.getnext() return (node, "tail") if node.tail else node.getnext()
def getParentNode(self, node): def getParentNode(self, node):
if isinstance(node, tuple): # Text node if isinstance(node, tuple): # Text node
node, key = node node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text": if key == "text":

View File

@ -1,9 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \ from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base from . import _base
from ..constants import voidElements
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker): class TreeWalker(_base.TreeWalker):
def __iter__(self): def __iter__(self):
@ -11,7 +14,7 @@ class TreeWalker(_base.TreeWalker):
previous = None previous = None
for event in self.tree: for event in self.tree:
if previous is not None and \ if previous is not None and \
(ignore_until is None or previous[1] is ignore_until): (ignore_until is None or previous[1] is ignore_until):
if previous[1] is ignore_until: if previous[1] is ignore_until:
ignore_until = None ignore_until = None
for token in self.tokens(previous, event): for token in self.tokens(previous, event):
@ -30,14 +33,18 @@ class TreeWalker(_base.TreeWalker):
if type == START_ELEMENT: if type == START_ELEMENT:
name = node.nodeName name = node.nodeName
namespace = node.namespaceURI namespace = node.namespaceURI
attrs = {}
for attr in list(node.attributes.keys()):
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI, attr.localName)] = attr.value
if name in voidElements: if name in voidElements:
for token in self.emptyTag(namespace, for token in self.emptyTag(namespace,
name, name,
node.attributes.items(), attrs,
not next or next[1] is not node): not next or next[1] is not node):
yield token yield token
else: else:
yield self.startTag(namespace, name, node.attributes.items()) yield self.startTag(namespace, name, attrs)
elif type == END_ELEMENT: elif type == END_ELEMENT:
name = node.nodeName name = node.nodeName

View File

@ -1,72 +0,0 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Return the ``_base`` token tuple describing *node*.

        *node* is either a bare Node (the root) or a
        ``(parent, idx, parents)`` tuple, which is resolved to
        ``parent.childNodes[idx]`` before being inspected.
        """
        if isinstance(node, tuple):  # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2):  # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3:  # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.type == 4:  # TextNode
            return _base.TEXT, node.value

        elif node.type == 5:  # Element
            return (_base.ELEMENT, node.namespace, node.name,
                    node.attributes.items(), node.hasContent())

        elif node.type == 6:  # CommentNode
            return _base.COMMENT, node.data

        else:
            # The constant lives on ``_base``; the previous ``_node.UNKNOWN``
            # referenced a name that is never defined in this module and so
            # raised NameError for every unrecognised node type.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        """Return the ``(node, 0, parents)`` tuple for *node*'s first child.

        When *node* is itself a tuple, its ``(parent, idx)`` pair is pushed
        onto the shared ancestor stack before descending.
        """
        if isinstance(node, tuple):  # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        """Return the tuple for the following sibling, or None at the end."""
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        """Return the tuple for the parent, popping it off the ancestor stack.

        Returns None when the ancestor stack is empty.
        """
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None

View File

@ -1,59 +0,0 @@
import re
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker that maps BeautifulSoup nodes onto ``_base`` tokens."""

    # Splits a doctype string into its name and optional PUBLIC/SYSTEM ids.
    doctype_regexp = re.compile(
        r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')

    def getNodeDetails(self, node):
        """Return the ``_base`` token tuple describing *node*.

        The isinstance checks run in a fixed order (document, declaration,
        comment, plain text, tag); the first match wins.
        """
        if isinstance(node, BeautifulSoup):  # Document or DocumentFragment
            return (_base.DOCUMENT,)

        if isinstance(node, Declaration):  # DocumentType
            text = unicode(node.string)
            # Some BeautifulSoup/Python versions add the surrounding markup
            # during unicode conversion; strip it when present.
            if text.startswith('<!') and text.endswith('>'):
                text = text[2:-1]

            # BeautifulSoup keeps the doctype as one string, so the separate
            # parts have to be recovered with a regexp. This is fragile: it
            # is expected to hold for trees built by html5lib itself but may
            # not for trees that were modified afterwards.
            match = self.doctype_regexp.match(text)
            assert match is not None, "DOCTYPE did not match expected format"

            public_id = match.group('publicId')
            # The system id sits in a different group depending on whether
            # the PUBLIC or the SYSTEM alternative matched.
            system_group = 'systemId2' if public_id is None else 'systemId1'
            system_id = match.group(system_group)
            return _base.DOCTYPE, match.group('name'), public_id or "", system_id or ""

        if isinstance(node, Comment):
            text = unicode(node.string)
            if text.startswith('<!--') and text.endswith('-->'):
                text = text[4:-3]
            return _base.COMMENT, text

        if isinstance(node, unicode):  # TextNode
            return _base.TEXT, node

        if isinstance(node, Tag):  # Element
            attributes = dict(node.attrs).items()
            return (_base.ELEMENT, namespaces["html"], node.name,
                    attributes, node.contents)

        return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        """Return the first entry of ``node.contents``."""
        return node.contents[0]

    def getNextSibling(self, node):
        """Return ``node.nextSibling``."""
        return node.nextSibling

    def getParentNode(self, node):
        """Return ``node.parent``."""
        return node.parent

View File

@ -0,0 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie
Trie = PyTrie
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie

View File

@ -0,0 +1,37 @@
from __future__ import absolute_import, division, unicode_literals
from collections import Mapping
class Trie(Mapping):
    """Abstract base class for tries"""

    def keys(self, prefix=None):
        """Return the keys as a set, optionally restricted to a prefix.

        With ``prefix=None`` every key is returned; otherwise only the keys
        for which ``key.startswith(prefix)`` holds.
        """
        # Spell out the arguments to super(): the zero-argument form exists
        # only on Python 3, and this module still caters for Python 2 (see
        # the __future__ imports and the 2.6 note below).
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        # Python 2.6: no set comprehensions
        return set([x for x in keys if x.startswith(prefix)])

    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        return any(key.startswith(prefix) for key in self.keys())

    def longest_prefix(self, prefix):
        """Return the longest leading slice of *prefix* that is a key.

        *prefix* itself is tried first, then ever shorter leading slices
        down to the empty string. Raises KeyError(prefix) if none is a key.
        """
        if prefix in self:
            return prefix

        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        """Return ``(key, value)`` for the key found by ``longest_prefix``."""
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])

View File

@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
    """Trie implementation that delegates storage and lookups to ``DATrie``."""

    def __init__(self, data):
        """Build the trie from the mapping *data*.

        Every key must be a ``text_type`` instance, otherwise TypeError is
        raised. The characters occurring in the keys are collected and
        handed to ``DATrie`` as a single joined string before the items
        are inserted.
        """
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            alphabet.update(key)

        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        # Iteration is deliberately left unimplemented for this backend.
        raise NotImplementedError()

    def __getitem__(self, key):
        return self._data[key]

    # The prefix queries below are forwarded unchanged to the wrapped
    # DATrie object instead of using the generic base-class versions.

    def keys(self, prefix=None):
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        return self._data.longest_prefix_item(prefix)

67
src/html5lib/trie/py.py Normal file
View File

@ -0,0 +1,67 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from bisect import bisect_left
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
    """Pure-Python trie: a mapping plus a sorted key list searched by bisection."""

    def __init__(self, data):
        """Wrap the mapping *data*; raises TypeError unless all keys are text."""
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        # Last prefix handled by keys() and the [start, end) slice of
        # self._keys it matched. A later query whose prefix extends the
        # cached one only needs to bisect inside that slice.
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        """Return the set of keys, optionally only those starting with *prefix*.

        A ``None`` or empty prefix returns every key.
        """
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        total = len(self._keys)
        if start == total:
            return keys

        # Bound the scan by the list length: when the matching keys run up
        # to the very last entry, indexing one past it raised IndexError.
        while i < total and self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        """Return True if *prefix* is a key or some key starts with it."""
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        return self._keys[i].startswith(prefix)

View File

@ -1,9 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
try: try:
frozenset import xml.etree.cElementTree as default_etree
except NameError: except ImportError:
#Import from the sets module for python 2.3 import xml.etree.ElementTree as default_etree
from sets import Set as set
from sets import ImmutableSet as frozenset
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
class MethodDispatcher(dict): class MethodDispatcher(dict):
"""Dict with 2 special properties: """Dict with 2 special properties:
@ -23,7 +30,7 @@ class MethodDispatcher(dict):
# twice as fast. Please do careful performance testing before changing # twice as fast. Please do careful performance testing before changing
# anything here. # anything here.
_dictEntries = [] _dictEntries = []
for name,value in items: for name, value in items:
if type(name) in (list, tuple, frozenset, set): if type(name) in (list, tuple, frozenset, set):
for item in name: for item in name:
_dictEntries.append((item, value)) _dictEntries.append((item, value))
@ -35,122 +42,41 @@ class MethodDispatcher(dict):
def __getitem__(self, key): def __getitem__(self, key):
return dict.get(self, key, self.default) return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object): # Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds
def __init__(self, iterable=(), maxsize=-1): def isSurrogatePair(data):
if not hasattr(self, 'data'): return (len(data) == 2 and
self.left = self.right = 0 ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
self.data = {} ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
self.maxsize = maxsize
self.extend(iterable)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self): def surrogatePairToCodepoint(data):
self.data.clear() char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
self.left = self.right = 0 (ord(data[1]) - 0xDC00))
return char_val
def extend(self, iterable): # Module Factory Factory (no, this isn't Java, I know)
for elem in iterable: # Here to stop this being duplicated all over the place.
self.append(elem)
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1): def moduleFactoryFactory(factory):
if self: moduleCache = {}
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def __getitem__(self, i): def moduleFactory(baseModule, *args, **kwargs):
if i < 0: if isinstance(ModuleType.__name__, type("")):
i += len(self) name = "_%s_factory" % baseModule.__name__
try: else:
return self.data[i + self.left] name = b"_%s_factory" % baseModule.__name__
except KeyError:
raise IndexError
def __setitem__(self, i, value): if name in moduleCache:
if i < 0: return moduleCache[name]
i += len(self) else:
try: mod = ModuleType(name)
self.data[i + self.left] = value objs = factory(baseModule, *args, **kwargs)
except KeyError: mod.__dict__.update(objs)
raise IndexError moduleCache[name] = mod
return mod
def __delitem__(self, i): return moduleFactory
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result