Update HTML 5 parser used in calibre (html5lib-python)

2025-07-09 03:04:10 -04:00 · 2013-10-23 11:04:05 +05:30 · 2013-10-23 11:04:05 +05:30 · b9421065f9
commit b9421065f9
parent b4bf871077
46 changed files with 7609 additions and 8932 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -562,9 +562,9 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
            return check(chr(num).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
-    from calibre.utils.html5_entities import entity_map
+    from html5lib.constants import entities
    try:
-        return check(entity_map[ent])
+        return check(entities[ent])
    except KeyError:
        pass
    from htmlentitydefs import name2codepoint
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@ -81,10 +81,13 @@ def node_depth(node):
    return ans
 def html5_parse(data, max_nesting_depth=100):
-    import html5lib
+    import html5lib, warnings
-    # html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
+    from html5lib.constants import cdataElements, rcdataElements
-    data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
+    # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
    data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        data = html5lib.parse(data, treebuilder='lxml').getroot()
    # Check that the asinine HTML 5 algorithm did not result in a tree with
--- a/src/calibre/ebooks/oeb/polish/tests/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/tests/parsing.py
@ -7,6 +7,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 from lxml import etree
 from html5lib.constants import cdataElements, rcdataElements
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
@ -18,7 +19,7 @@ def nonvoid_cdata_elements(test, parse_function):
    markup = '''
    <html> <head><{0}/></head> <body id="test"> </html>
    '''
-    for tag in ('title', 'style', 'script', 'textarea'):
+    for tag in cdataElements | rcdataElements:
        for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
            root = parse_function(markup.format(x))
            test.assertEqual(
--- a/src/calibre/utils/html5_entities.py
+++ b/src/calibre/utils/html5_entities.py
--- a/src/html5lib/init.py
+++ b/src/html5lib/init.py
@ -10,8 +10,14 @@ import html5lib
 f = open("my_document.html")
 tree = html5lib.parse(f)
 """
-__version__ = "0.90"
+
-from html5parser import HTMLParser, parse, parseFragment
+from __future__ import absolute_import, division, unicode_literals
-from treebuilders import getTreeBuilder
+
-from treewalkers import getTreeWalker
+from .html5parser import HTMLParser, parse, parseFragment
-from serializer import serialize
+from .treebuilders import getTreeBuilder
 from .treewalkers import getTreeWalker
 from .serializer import serialize
 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]
 __version__ = "0.999-dev"
--- a/src/html5lib/constants.py
+++ b/src/html5lib/constants.py
--- a/src/html5lib/filters/_base.py
+++ b/src/html5lib/filters/_base.py
@ -1,3 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
 class Filter(object):
    def __init__(self, source):
--- a/src/html5lib/filters/alphabeticalattributes.py
+++ b/src/html5lib/filters/alphabeticalattributes.py
@ -0,0 +1,20 @@
 from __future__ import absolute_import, division, unicode_literals
 from . import _base
 try:
    from collections import OrderedDict
 except ImportError:
    from ordereddict import OrderedDict
 class Filter(_base.Filter):
    """Filter that re-emits tag tokens with their attributes ordered by key."""

    def __iter__(self):
        # Only tokens that carry attributes have their data rebuilt; every
        # token, changed or not, is passed on downstream.
        tag_types = ("StartTag", "EmptyTag")
        for token in _base.Filter.__iter__(self):
            if token["type"] in tag_types:
                ordered_pairs = sorted(token["data"].items(),
                                       key=lambda pair: pair[0])
                token["data"] = OrderedDict(ordered_pairs)
            yield token
--- a/src/html5lib/filters/formfiller.py
+++ b/src/html5lib/filters/formfiller.py
@ -1,127 +0,0 @@
 #
 # The goal is to finally have a form filler where you pass data for
 # each form, using the algorithm for "Seeding a form with initial values"
 # See http://www.whatwg.org/specs/web-forms/current-work/#seeding
 #
 import _base
 from html5lib.constants import spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)
 class SimpleFilter(_base.Filter):
    # Token-stream filter that seeds form controls (<input>, <textarea>,
    # <select>/<option>) with values looked up by field name in
    # ``fieldStorage``.
    #
    # Token attributes (``token["data"]``) are handled as a mutable list of
    # ``(name, value)`` pairs and are modified in place.
    def __init__(self, source, fieldStorage):
        # source: token iterable handed to the base filter.
        # fieldStorage: object queried via ``getlist(field_name)``; the result
        #   is used as a sequence of values (len() and indexing).
        _base.Filter.__init__(self, source)
        self.fieldStorage = fieldStorage
    def __iter__(self):
        # Maps a field name to the index of the next value to consume from
        # fieldStorage.getlist(name), so repeated controls sharing a name
        # receive successive values.
        field_indices = {}
        state = None  # never read again in this method
        field_name = None
        # NOTE(review): field_type is first assigned inside the loop; an
        # "EndTag" token arriving before any input/textarea/select start tag
        # would reach the `field_type is not None` test below unassigned.
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    input_value_index = -1
                    input_checked_index = -1
                    # Single pass over the attributes: record the field name
                    # and type, plus the positions of "checked" and "value".
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i
                    # NOTE(review): field_name is still None here when the
                    # input has no name attribute -- confirm getlist(None)
                    # is acceptable for the storage object in use.
                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    # Use the next unconsumed value, or "" once exhausted.
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""
                    if field_type in (u"checkbox", u"radio"):
                        if value_list:
                            # NOTE(review): with no value attribute,
                            # input_value_index is -1 and this compares
                            # against the *last* attribute's value.
                            if token["data"][input_value_index][1] == value:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                # Value does not match: drop a stale "checked".
                                del token["data"][input_checked_index]
                    elif field_type not in (u"button", u"submit", u"reset"):
                        # Every other input type gets its value attribute
                        # overwritten (or appended when missing).
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1
                    # <input> is handled entirely on its start tag, so no
                    # field state is carried over to later tokens.
                    field_type = None
                    field_name = None
                elif name == "textarea":
                    field_type = "textarea"
                    # Reversing before dict() makes the first occurrence of a
                    # duplicated attribute win. Raises KeyError when the
                    # textarea has no "name" attribute.
                    field_name = dict((token["data"])[::-1])["name"]
                elif name == "select":
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False
                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # A non-multiple select marks at most one option
                            # as selected; a multiple select may mark several.
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]
            elif field_type is not None and field_name and type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # Emit the seeded text just before the closing
                            # </textarea> token, which is yielded below.
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1
                    field_name = None
                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing
            elif field_type == "textarea":
                # Tokens reaching this branch while field_type is "textarea"
                # are dropped instead of being passed through.
                continue # ignore token
            yield token
--- a/src/html5lib/filters/inject_meta_charset.py
+++ b/src/html5lib/filters/inject_meta_charset.py
@ -1,4 +1,7 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
 from . import _base
 class Filter(_base.Filter):
    def __init__(self, source, encoding):
@ -20,19 +23,18 @@ class Filter(_base.Filter):
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
-                   content_index = -1
+                    for (namespace, name), value in token["data"].items():
-                   for i,(name,value) in enumerate(token["data"]):
+                        if namespace is not None:
-                       if name.lower() == 'charset':
+                            continue
-                          token["data"][i] = (u'charset', self.encoding)
+                        elif name.lower() == 'charset':
                            token["data"][(namespace, name)] = self.encoding
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                       elif name == 'content':
                           content_index = i
                    else:
-                       if has_http_equiv_content_type and content_index >= 0:
+                        if has_http_equiv_content_type and (None, "content") in token["data"]:
-                           token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
+                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
                            meta_found = True
                elif token["name"].lower() == "head" and not meta_found:
@ -40,7 +42,7 @@ class Filter(_base.Filter):
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
-                           "data": [["charset", self.encoding]]}
+                           "data": {(None, "charset"): self.encoding}}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue
@ -51,7 +53,7 @@ class Filter(_base.Filter):
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
-                               "data": [["charset", self.encoding]]}
+                               "data": {(None, "charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
--- a/src/html5lib/filters/lint.py
+++ b/src/html5lib/filters/lint.py
@ -1,13 +1,18 @@
 from __future__ import absolute_import, division, unicode_literals
 from gettext import gettext
 _ = gettext
-import _base
+from . import _base
-from html5lib.constants import cdataElements, rcdataElements, voidElements
+from ..constants import cdataElements, rcdataElements, voidElements
-from html5lib.constants import spaceCharacters
+from ..constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 class LintError(Exception):
    """Raised by the lint filter when a token breaks a stream invariant."""
 class Filter(_base.Filter):
    def __iter__(self):
@ -18,24 +23,24 @@ class Filter(_base.Filter):
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
+                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                if not name:
-                    raise LintError(_(u"Empty tag name"))
+                    raise LintError(_("Empty tag name"))
                if type == "StartTag" and name in voidElements:
-                    raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
                if type == "StartTag":
                    open_elements.append(name)
                for name, value in token["data"]:
-                    if not isinstance(name, unicode):
+                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %r") % name)
+                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
                    if not name:
-                        raise LintError(_(u"Empty attribute name"))
+                        raise LintError(_("Empty attribute name"))
-                    if not isinstance(value, unicode):
+                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %r") % value)
+                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
@ -45,15 +50,15 @@ class Filter(_base.Filter):
            elif type == "EndTag":
                name = token["name"]
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                if not name:
-                    raise LintError(_(u"Empty tag name"))
+                    raise LintError(_("Empty tag name"))
                if name in voidElements:
-                    raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
                start_name = open_elements.pop()
                if start_name != name:
-                    raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
                contentModelFlag = "PCDATA"
            elif type == "Comment":
@ -62,27 +67,27 @@ class Filter(_base.Filter):
            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
-                if not isinstance(data, unicode):
+                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %r") % data)
+                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
                if not data:
-                    raise LintError(_(u"%s token with empty data") % type)
+                    raise LintError(_("%(type)s token with empty data") % {"type": type})
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
-                        raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
+                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
+                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                # XXX: what to do with token["data"] ?
            elif type in ("ParseError", "SerializeError"):
                pass
            else:
-                raise LintError(_(u"Unknown token type: %s") % type)
+                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
            yield token
--- a/src/html5lib/filters/optionaltags.py
+++ b/src/html5lib/filters/optionaltags.py
@ -1,4 +1,7 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
 from . import _base
 class Filter(_base.Filter):
    def slider(self):
--- a/src/html5lib/filters/sanitizer.py
+++ b/src/html5lib/filters/sanitizer.py
@ -1,8 +1,12 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
-from html5lib.sanitizer import HTMLSanitizerMixin
+
 from . import _base
 from ..sanitizer import HTMLSanitizerMixin
 class Filter(_base.Filter, HTMLSanitizerMixin):
    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            token = self.sanitize_token(token)
-            if token: yield token
+            if token:
                yield token
--- a/src/html5lib/filters/whitespace.py
+++ b/src/html5lib/filters/whitespace.py
@ -1,16 +1,13 @@
-try:
+from __future__ import absolute_import, division, unicode_literals
    frozenset
 except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
 import re
-import _base
+from . import _base
-from html5lib.constants import rcdataElements, spaceCharacters
+from ..constants import rcdataElements, spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
 SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
 class Filter(_base.Filter):
@ -29,13 +26,13 @@ class Filter(_base.Filter):
            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
-                token["data"] = u" "
+                token["data"] = " "
            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])
            yield token
 def collapse_spaces(text):
    """Replace every match of SPACES_REGEX in ``text`` with one space."""
    collapsed = SPACES_REGEX.sub(' ', text)
    return collapsed
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
--- a/src/html5lib/ihatexml.py
+++ b/src/html5lib/ihatexml.py
@ -1,14 +1,93 @@
-import re
+from __future__ import absolute_import, division, unicode_literals
-baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+import re
 import warnings
 from .constants import DataLossWarning
 baseChar = """
 [#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
 [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
 [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
 [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
 [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
 [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
 [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
 [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
 [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
 [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
 [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
 [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
 [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
 [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
 [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
 [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
 [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
 [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
 [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
 [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
 [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
 [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
 [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
 [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
 [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
 [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
 [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
 [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
 [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
 [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
 #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
 #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
 #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
 [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
 [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
 #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
 [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
 [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
 [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
 [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
 [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
 #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
 [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
 [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
 [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
 [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
 ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
-combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
+combiningCharacter = """
 [#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
 [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
 [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
 [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
 #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
 [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
 [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
 #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
 [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
 [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
 #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
 [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
 [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
 [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
 [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
 [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
 #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
 [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
 #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
 [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
 [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
 #x3099 | #x309A"""
-digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+digit = """
 [#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
 [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
 [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
 [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
-extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+extender = """
 #x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
 #[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
 letter = " | ".join([baseChar, ideographic])
@ -20,6 +99,7 @@ nameFirst = " | ".join([letter, "_"])
 # Patterns for one table entry: a single code point ("#xHHHH") or a range
 # ("[#xHHHH-#xHHHH]"), each with exactly four upper-case hex digits.  The
 # class is spelled [0-9A-F]; the earlier "[\d|A-F]" also accepted a literal
 # "|" because alternation has no meaning inside a character class.
 reChar = re.compile(r"#x([0-9A-F]{4})")
 reCharRange = re.compile(r"\[#x([0-9A-F]{4})-#x([0-9A-F]{4})\]")
 def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
@ -40,6 +120,7 @@ def charStringToList(chars):
    rv = normaliseCharList(rv)
    return rv
 def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
@ -58,6 +139,7 @@ def normaliseCharList(charList):
 # Highest code point handled here (U+FFFF); characters above the BMP are
 # not really supported :(
 max_unicode = 0xFFFF
 def missingRanges(charList):
    rv = []
    if charList[0] != 0:
@ -68,42 +150,49 @@ def missingRanges(charList):
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
 def listToRegexpStr(charList):
    """Build a regexp character-class string from a list of ranges.

    charList is a sequence of two-item [first, last] code point pairs.  A
    pair whose two ends are equal is emitted as one escaped character; any
    other pair is emitted as an escaped "first-last" range.  The pieces are
    concatenated and wrapped in square brackets.
    """
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(escapeRegexp(chr(item[0])))
        else:
            rv.append(escapeRegexp(chr(item[0])) + "-" +
                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)
 def hexToInt(hex_str):
    """Return the integer value of the hexadecimal string hex_str."""
    base = 16
    return int(hex_str, base)
 def escapeRegexp(string):
    """Return string with regexp metacharacters backslash-escaped.

    Each occurrence of one of the characters listed in specialCharacters
    is prefixed with a backslash; every other character is left unchanged.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        # The replacement only introduces a backslash (not in the list) and
        # the character just handled, so later passes cannot double-escape.
        string = string.replace(char, "\\" + char)
    return string
 # output from the above
-nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\
u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 # Simpler things
 # Matches any single character outside the set accepted in a pubid: space,
 # CR, LF, ASCII letters and digits, and -'()+,./:=?;!*#@$_%
 # Written as a raw string so every backslash sequence is interpreted by the
 # regexp engine ("\-" is not a valid Python string escape).
 nonPubidCharRegexp = re.compile(r"[^\x20\x0D\x0Aa-zA-Z0-9\-'()+,./:=?;!*#@$_%]")
 nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\
u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
    def __init__(self, replaceChars=None,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
-                 replaceFormFeedCharacters = True):
+                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):
        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -113,14 +202,17 @@ class InfosetFilter(object):
        self.replaceFormFeedCharacters = replaceFormFeedCharacters
        self.preventSingleQuotePubid = preventSingleQuotePubid
        self.replaceCache = {}
    def coerceAttribute(self, name, namespace=None):
        """Return an XML-safe attribute name, or None to drop the attribute.

        The attribute is dropped (with a DataLossWarning) when
        dropXmlnsLocalName is set and name starts with "xmlns:", or when
        dropXmlnsAttrNs is set and namespace is the xmlns namespace URI.
        Otherwise the name is passed through toXmlName().
        """
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)
@ -131,20 +223,35 @@ class InfosetFilter(object):
    def coerceComment(self, data):
        """Return comment text with adjacent dashes separated by a space.

        Does nothing unless preventDoubleDashComments is set.  A
        DataLossWarning is issued for every replacement pass that is needed.
        """
        if not self.preventDoubleDashComments:
            return data
        # One pass can leave a new "--" behind (e.g. "---"), hence the loop
        while "--" in data:
            warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
            data = data.replace("--", "- -")
        return data
    def coerceCharacters(self, data):
        """Return text with form feeds (U+000C) replaced by spaces.

        Only active when replaceFormFeedCharacters is set; one
        DataLossWarning is issued per form feed found.
        """
        if self.replaceFormFeedCharacters:
            formFeeds = data.count("\x0C")
            while formFeeds > 0:
                warnings.warn("Text cannot contain U+000C", DataLossWarning)
                formFeeds -= 1
            data = data.replace("\x0C", " ")
        # Other non-xml characters
        return data
    def coercePubid(self, data):
        """Return data with characters not allowed in a pubid replaced.

        Every match of nonPubidCharRegexp triggers a DataLossWarning and is
        swapped for its getReplacementCharacter() value.  When
        preventSingleQuotePubid is set, single quotes are replaced the same
        way (with their own warning).
        """
        coerced = data
        for badChar in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            coerced = coerced.replace(badChar,
                                      self.getReplacementCharacter(badChar))
        if self.preventSingleQuotePubid and "'" in coerced:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            quoteReplacement = self.getReplacementCharacter("'")
            coerced = coerced.replace("'", quoteReplacement)
        return coerced
    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst
@ -152,6 +259,7 @@ class InfosetFilter(object):
        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput
@ -169,9 +277,9 @@ class InfosetFilter(object):
        return name
    def escapeChar(self, char):
        """Return the "UXXXXX" escape for char and remember it.

        The escape is "U" followed by the code point as five upper-case hex
        digits (zero padded); it is also stored in replaceCache under char.
        """
        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement
    def unescapeChar(self, charcode):
        """Return the character for a "UXXXXX" escape.

        The first character of charcode is skipped and the remainder is
        parsed as a hexadecimal code point.
        """
        return chr(int(charcode[1:], 16))
--- a/src/html5lib/inputstream.py
+++ b/src/html5lib/inputstream.py
@ -1,18 +1,33 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 import codecs
 import re
 import types
 import sys
-from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings, ReparseException
+from .constants import encodings, ReparseException
 from . import utils
 from io import StringIO
 try:
    from io import BytesIO
 except ImportError:
    BytesIO = StringIO
 try:
    from io import BufferedIOBase
 except ImportError:
    class BufferedIOBase(object):
        pass
 # Non-unicode versions of constants for use in the pre-parser
 # Each character is encoded to a bytes object explicitly; str(item) does not
 # produce bytes when the constants are unicode strings.
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
-invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@ -22,12 +37,13 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])
-ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
 # Cache for charsUntil()
 charsUntilRegEx = {}
-class BufferedStream:
+
 class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own
    The buffer is implemented as a list of chunks on the assumption that
@ -47,11 +63,11 @@ class BufferedStream:
        return pos
    def seek(self, pos):
        """Move the read position to absolute offset pos within the buffer.

        pos must not exceed the number of bytes buffered so far.  The
        position is stored as [chunk index, offset inside that chunk].
        """
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        # Walk over whole chunks, subtracting each chunk's own length (not
        # pos) until the remaining offset falls inside chunk i.
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]
@ -90,8 +106,7 @@ class BufferedStream:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
-            data = rv.append(bufferedData[bufferOffset: 
+            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
                                          bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead
            bufferOffset = 0
@ -99,11 +114,25 @@ class BufferedStream:
        if remainingBytes:
            rv.append(self._readStream(remainingBytes))
-        return "".join(rv)
+        return b"".join(rv)
 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Return the input stream object appropriate for source.

    source is treated as unicode when it is a text string, or when it has
    a read() method whose read(0) returns a text string.  Unicode sources
    get an HTMLUnicodeInputStream; anything else gets an
    HTMLBinaryInputStream built with encoding, parseMeta and chardet.

    Raises TypeError if encoding is given for a unicode source.
    """
    if hasattr(source, "read"):
        # Reading zero bytes reveals the stream's type without consuming it
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        if encoding is not None:
            raise TypeError("Cannot explicitly set an encoding with a unicode string")

        return HTMLUnicodeInputStream(source)
    else:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
 class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
@ -113,7 +142,7 @@ class HTMLInputStream:
    _defaultChunkSize = 10240
-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+    def __init__(self, source):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -131,41 +160,23 @@ class HTMLInputStream:
        """
        # Craziness
-        if len(u"\U0010FFFF") == 1:
+        if len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
        # List of where new lines occur
        self.newLines = [0]
-        self.charEncoding = (codecName(encoding), "certain")
+        self.charEncoding = ("utf-8", "certain")
-
+        self.dataStream = self.openStream(source)
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        #              self.charEncoding as appropriate
        self.rawStream = self.openStream(source)
        # Encoding Information
        #Number of bytes to use when looking for a meta element with
        #encoding information
        self.numBytesMeta = 512
        #Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        #Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"
        #Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)
        self.reset()
    def reset(self):
-        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
+        self.chunk = ""
                                                                 'replace')
        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []
@ -175,8 +186,8 @@ class HTMLInputStream:
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0
-        #Flag to indicate we may have a CR LF broken across a data chunk
+        # Deal with CR LF and surrogates split over chunk boundaries
-        self._lastChunkEndsWithCR = False
+        self._bufferedCharacter = None
    def openStream(self, source):
        """Produces a file object from source.
@ -188,122 +199,15 @@ class HTMLInputStream:
        if hasattr(source, 'read'):
            stream = source
        else:
-            # Otherwise treat source as a string and convert to a file object
+            stream = StringIO(source)
            if isinstance(source, unicode):
                source = source.encode('utf-8')
                self.charEncoding = ("utf-8", "certain")
            import cStringIO
            stream = cStringIO.StringIO(str(source))
        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
            stream is sys.stdin):
            stream = BufferedStream(stream)
        return stream
    def detectEncoding(self, parseMeta=True, chardet=True):
        """Return an (encoding, confidence) pair for the raw stream.

        Sources are tried in order: a BOM ("certain"), then - if parseMeta -
        detectEncodingMeta(), then - if chardet - the chardet package when
        it can be imported, and finally defaultEncoding.  Everything except
        a BOM is reported as "tentative".
        """
        # A BOM wins outright; detectBOM() also reads past it if present
        encoding = self.detectBOM()
        confidence = "certain"
        # No BOM: look for meta elements with encoding information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                while not detector.done:
                    block = self.rawStream.read(self.numBytesChardet)
                    if not block:
                        break
                    detector.feed(block)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding
        # Substitute for equivalent encodings
        equivalents = {"iso-8859-1": "windows-1252"}
        encoding = equivalents.get(encoding.lower(), encoding)
        return encoding, confidence
    def changeEncoding(self, newEncoding):
        """Switch to newEncoding, forcing a reparse if it differs.

        newEncoding is first passed through codecName(); the UTF-16
        variants are treated as utf-8.  A None result is ignored.  If the
        result equals the current encoding it is only marked "certain";
        otherwise the raw stream is rewound, the stream state is reset and
        ReparseException is raised.
        """
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Remember the old name before it is overwritten so the message
            # reports both encodings rather than the new one twice.
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # NOTE(review): reset() runs before charEncoding is updated, as
            # before - confirm that ordering is intended.
            self.reset()
            self.charEncoding = (newEncoding, "certain")
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))
    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read in the first 4 bytes of the stream
        head = self.rawStream.read(4)

        # Try prefixes in this order: UTF-8 (3 bytes), then UTF-32 (4 bytes)
        # before UTF-16 (2 bytes), since the UTF-32 LE BOM begins with the
        # UTF-16 LE one.
        encoding = None
        bomLength = 0
        for length in (3, 4, 2):
            encoding = bomDict.get(head[:length])
            if encoding:
                bomLength = length
                break

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(bomLength)

        return encoding
    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Only the first numBytesMeta bytes of the raw stream are examined;
        the stream is rewound afterwards.  The UTF-16 variants are reported
        as utf-8.
        """
        parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
        self.rawStream.seek(0)
        declared = parser.getEncoding()

        if declared in ("utf-16", "utf-16-be", "utf-16-le"):
            return "utf-8"
        return declared
    def _position(self, offset):
        chunk = self.chunk
-        nLines = chunk.count(u'\n', 0, offset)
+        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
-        lastLinePos = chunk.rfind(u'\n', 0, offset)
+        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
@ -336,27 +240,34 @@ class HTMLInputStream:
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
-        self.chunk = u""
+        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        data = self.dataStream.read(chunkSize)
-        if not data:
+        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False
        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]
        self.reportCharacterErrors(data)
-        data = data.replace(u"\u0000", u"\ufffd")
+        # Replace invalid characters
-        #Check for CR LF broken across chunks
+        # Note U+0000 is dealt with in the tokenizer
-        if (self._lastChunkEndsWithCR and data[0] == u"\n"):
+        data = self.replaceCharactersRegexp.sub("\ufffd", data)
-            data = data[1:]
+
-            # Stop if the chunk is now empty
+        data = data.replace("\r\n", "\n")
-            if not data:
+        data = data.replace("\r", "\n")
                return False
        self._lastChunkEndsWithCR = data[-1] == u"\r"
        data = data.replace(u"\r\n", u"\n")
        data = data.replace(u"\r", u"\n")
        self.chunk = data
        self.chunkSize = len(data)
@ -364,32 +275,22 @@ class HTMLInputStream:
        return True
    def characterErrorsUCS4(self, data):
        """Record one "invalid-codepoint" error per invalid character.

        Every match of invalid_unicode_re in data appends an entry to
        self.errors.  U+0000 is no longer reported from here.
        """
        for _ in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")
    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        for i in xrange(data.count(u"\u0000")):
            self.errors.append("null-character")
        skip = False
        import sys
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
-            if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
+            if utils.isSurrogatePair(data[pos:pos + 2]):
                pos < len(data) - 1 and
                ord(data[pos + 1]) >= 0xDC00 and
                ord(data[pos + 1]) <= 0xDFFF):
                # We have a surrogate pair!
-                #From a perl manpage
+                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                char_val = (0x10000 + (codepoint - 0xD800) * 0x400 + 
                            (ord(data[pos + 1]) - 0xDC00))
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
@ -399,8 +300,6 @@ class HTMLInputStream:
            else:
                skip = False
                self.errors.append("invalid-codepoint")
        #This is still wrong if it is possible for a surrogate pair to break a
        #chunk boundary
    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
@ -416,10 +315,10 @@ class HTMLInputStream:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
-            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
+            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
-                regex = u"^%s" % regex
+                regex = "^%s" % regex
-            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
+            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
        rv = []
@ -446,27 +345,12 @@ class HTMLInputStream:
                # Reached EOF
                break
-        r = u"".join(rv)
+        r = "".join(rv)
        return r
    def charsUntilEOF(self):
        """ Returns a string of characters from the stream up to EOF."""
        # Start with what is left of the current chunk, then keep taking
        # chunks until readChunk() reports that the stream is exhausted.
        pieces = [self.chunk[self.chunkOffset:]]
        while self.readChunk():
            pieces.append(self.chunk[self.chunkOffset:])
        return u"".join(pieces)
    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
@ -480,12 +364,192 @@ class HTMLInputStream:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
-class EncodingBytes(str):
+
 class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.
    """
    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLBinaryInputStream.

        HTMLBinaryInputStream(source, [encoding]) -> Normalized stream from
        source for use by html5lib.

        source - either a file-like object (anything with a ``read``
        method) or a byte string; see openStream, which wraps a non-file
        source in a BytesIO rather than opening it as a path.

        encoding - optional name of the encoding. If it is recognised by
        codecName, that encoding will be used with confidence "certain",
        regardless of any BOM or later declaration (such as in a meta
        element). If it is None or not recognised, the encoding is
        detected from the data instead.

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Allow falling back to a chardet/charade guess when
        neither a BOM nor a <meta> declaration yields an encoding
        """
        # Raw byte stream: file-like sources are used as-is, anything else
        # is wrapped in a BytesIO by openStream
        self.rawStream = self.openStream(source)
        HTMLUnicodeInputStream.__init__(self, self.rawStream)
        # codecName returns None for a missing or unrecognised name, which
        # is what triggers detection below
        self.charEncoding = (codecName(encoding), "certain")
        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to read per chunk when detecting the encoding
        # with chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"
        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)
        # Now that the encoding is settled, (re)build the decoding reader
        # and reset the stream state
        self.reset()
    def reset(self):
        """Rebuild the decoding reader for the current encoding and reset.

        Wraps the raw byte stream in a codecs stream reader for
        ``self.charEncoding[0]`` using the 'replace' error handler, then
        delegates the rest of the reset to HTMLUnicodeInputStream.
        """
        make_reader = codecs.getreader(self.charEncoding[0])
        self.dataStream = make_reader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)
    def openStream(self, source):
        """Produces a file object from source.
        source can be either a file object, local filename or a string.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)
        try:
            stream.seek(stream.tell())
        except:
            stream = BufferedStream(stream)
        return stream
    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream and how sure we are.

        Sources are tried in this order, stopping at the first that
        yields an encoding:
          1. a byte order mark (detectBOM)
          2. a <meta> declaration (detectEncodingMeta), if parseMeta
          3. a charade/chardet guess, if chardet is true and one of the
             two packages can be imported
          4. self.defaultEncoding

        Returns an (encoding, confidence) tuple. confidence is "certain"
        only when a BOM was found and "tentative" otherwise.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        # Only a BOM is authoritative; every later source is a guess
        confidence = "certain" if encoding is not None else "tentative"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
        # Guess with chardet, if available
        if encoding is None and chardet:
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                # Feed the detector chunk by chunk; the chunks themselves
                # are not needed afterwards, so they are not retained
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so that decoding starts from the beginning
                self.rawStream.seek(0)
            except ImportError:
                # Neither detector is installed; fall through to the default
                pass
        # If all else fails use the default encoding
        if encoding is None:
            encoding = self.defaultEncoding
        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}
        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]
        return encoding, confidence
    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a late encoding declaration.

        Must only be called while the current encoding is not "certain".
        newEncoding is normalised with codecName; any UTF-16 variant is
        replaced by "utf-8". An unrecognised name is ignored. If the
        result equals the current encoding it is simply promoted to
        "certain".

        Otherwise the raw stream is rewound, the new encoding is recorded
        as "certain", the decoding reader is rebuilt for it, and
        ReparseException is raised so the caller can start over.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        oldEncoding = self.charEncoding[0]
        if newEncoding == oldEncoding:
            self.charEncoding = (oldEncoding, "certain")
        else:
            self.rawStream.seek(0)
            # Record the new encoding *before* reset(): reset() builds the
            # decoding reader from self.charEncoding[0]
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            # Report the encoding captured before the switch, not the
            # value that was just overwritten
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))
    def detectBOM(self):
        """Look for a byte order mark at the current position of the raw
        stream.

        Returns the encoding name the BOM stands for, or None when there
        is no recognised BOM. The stream is left positioned just past the
        BOM when one is found and at offset 0 otherwise."""
        bom_to_encoding = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }
        # Four bytes are enough for the longest BOM
        head = self.rawStream.read(4)
        assert isinstance(head, bytes)
        encoding = None
        bom_length = 0
        # Candidate BOM lengths, in order: UTF-8 (3), then UTF-32 (4),
        # which must be tried before UTF-16 (2)
        for length in (3, 4, 2):
            encoding = bom_to_encoding.get(head[:length])
            if encoding:
                bom_length = length
                break
        # Skip the BOM if there was one, otherwise rewind to the start
        self.rawStream.seek(bom_length)
        return encoding
    def detectEncodingMeta(self):
        """Return the encoding declared in a <meta> element, if any.

        Reads the first self.numBytesMeta bytes of the raw stream, rewinds
        it, and runs EncodingParser over those bytes. Any UTF-16 variant
        it reports is replaced by "utf-8"; anything else it returns is
        passed through unchanged.
        """
        head = self.rawStream.read(self.numBytesMeta)
        assert isinstance(head, bytes)
        prescanner = EncodingParser(head)
        # Put the stream back at the start before parsing the sample
        self.rawStream.seek(0)
        declared = prescanner.getEncoding()
        if declared in ("utf-16", "utf-16-be", "utf-16-le"):
            return "utf-8"
        return declared
 class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
-        return str.__new__(self, value.lower())
+        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())
    def __init__(self, value):
        self._position = -1
@ -493,13 +557,17 @@ class EncodingBytes(str):
    def __iter__(self):
        return self
-    def next(self):
+    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
-        return self[p]
+        return self[p:p + 1]
    def next(self):
        """Python 2 spelling of the iterator protocol; same as __next__()."""
        # Py2 compat
        return self.__next__()
    def previous(self):
        p = self._position
@ -508,7 +576,7 @@ class EncodingBytes(str):
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
-        return self[p]
+        return self[p:p + 1]
    def setPosition(self, position):
        if self._position >= len(self):
@ -526,7 +594,7 @@ class EncodingBytes(str):
    position = property(getPosition, setPosition)
    def getCurrentByte(self):
-        return self[self.position]
+        return self[self.position:self.position + 1]
    currentByte = property(getCurrentByte)
@ -534,7 +602,7 @@ class EncodingBytes(str):
        """Skip past a list of characters"""
        p = self.position               # use property for the error-checking
        while p < len(self):
-            c = self[p]
+            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
@ -545,7 +613,7 @@ class EncodingBytes(str):
    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
-            c = self[p]
+            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
@ -577,6 +645,7 @@ class EncodingBytes(str):
        else:
            raise StopIteration
 class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""
@ -587,12 +656,12 @@ class EncodingParser(object):
    def getEncoding(self):
        methodDispatch = (
-            ("<!--",self.handleComment),
+            (b"<!--", self.handleComment),
-            ("<meta",self.handleMeta),
+            (b"<meta", self.handleMeta),
-            ("</",self.handlePossibleEndTag),
+            (b"</", self.handlePossibleEndTag),
-            ("<!",self.handleOther),
+            (b"<!", self.handleOther),
-            ("<?",self.handleOther),
+            (b"<?", self.handleOther),
-            ("<",self.handlePossibleStartTag))
+            (b"<", self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
@ -610,38 +679,49 @@ class EncodingParser(object):
    def handleComment(self):
        """Skip over comments"""
-        return self.data.jumpTo("-->")
+        return self.data.jumpTo(b"-->")
    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
-                if attr[0] == "charset":
+                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
-                elif attr[0] == "content":
+                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec
    def handlePossibleStartTag(self):
        """Handler getEncoding dispatches to for a bare b"<".

        Delegates to handlePossibleTag with endTag=False and returns its
        result.
        """
        return self.handlePossibleTag(False)
    def handlePossibleEndTag(self):
-        self.data.next()
+        next(self.data)
        return self.handlePossibleTag(True)
    def handlePossibleTag(self, endTag):
@ -656,7 +736,7 @@ class EncodingParser(object):
            return True
        c = data.skipUntil(spacesAngleBrackets)
-        if c == "<":
+        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
@ -668,66 +748,66 @@ class EncodingParser(object):
        return True
    def handleOther(self):
-        return self.data.jumpTo(">")
+        return self.data.jumpTo(b">")
    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
-        c = data.skip(spaceCharactersBytes | frozenset("/"))
+        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
-        if c in (">", None):
+        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
-            if c == "=" and attrName:   
+            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                c = data.next()
                break
-            elif c in ("/", ">"):
+            elif c in (b"/", b">"):
-                return "".join(attrName), ""
+                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
-            elif c == None:
+            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
-            c = data.next()
+            c = next(data)
        # Step 7
-        if c != "=":
+        if c != b"=":
            data.previous()
-            return "".join(attrName), ""
+            return b"".join(attrName), b""
        # Step 8
-        data.next()
+        next(data)
        # Step 9
        c = data.skip()
        # Step 10
-        if c in ("'", '"'):
+        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
-                c = data.next()
+                c = next(data)
                # 10.3
                if c == quoteChar:
-                    data.next()
+                    next(data)
-                    return "".join(attrName), "".join(attrValue)
+                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
-        elif c == ">":
+        elif c == b">":
-            return "".join(attrName), ""
+            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
@ -736,9 +816,9 @@ class EncodingParser(object):
            attrValue.append(c)
        # Step 11
        while True:
-            c = data.next()
+            c = next(data)
            if c in spacesAngleBrackets:
-                return "".join(attrName), "".join(attrValue)
+                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
@ -749,21 +829,23 @@ class EncodingParser(object):
 class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data
    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
-            self.data.jumpTo("charset")
+            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
-            if not self.data.currentByte == "=":
+            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
-            if self.data.currentByte in ('"', "'"):
+            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
@ -787,7 +869,12 @@ class ContentAttrParser(object):
 def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
-    if (encoding is not None and type(encoding) in types.StringTypes):
+    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if encoding:
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
--- a/src/html5lib/sanitizer.py
+++ b/src/html5lib/sanitizer.py
@ -1,21 +1,28 @@
 from __future__ import absolute_import, division, unicode_literals
 import re
 from xml.sax.saxutils import escape, unescape
-from tokenizer import HTMLTokenizer
+from .tokenizer import HTMLTokenizer
-from constants import tokenTypes
+from .constants import tokenTypes
 class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
+    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
-        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
+                           'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
-        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
+                           'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
-        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                           'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
-        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
+                           'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
-        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
+                           'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
-        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
+                           'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
-        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
+                           'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
-        'ul', 'var']
+                           'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
                           'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
                           'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
                           'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
                       'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
@ -24,24 +31,35 @@ class HTMLSanitizerMixin(object):
                       'munderover', 'none']
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
-        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
+                    'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
-        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 
+                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
                    'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
                    'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
+                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
-        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
+                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
-        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
+                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
-        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
+                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
-        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
+                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
-        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
+                             'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
-        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
+                             'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
-        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+                             'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
-        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
+                             'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
-        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
+                             'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
-        'xml:lang']
+                             'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
                             'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
                             'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
                             'width', 'wrap', 'xml:lang']
    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
                         'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
@ -56,41 +74,43 @@ class HTMLSanitizerMixin(object):
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
-         'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
+                      'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
-         'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+                      'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
-         'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
+                      'fill-opacity', 'fill-rule', 'font-family', 'font-size',
-         'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
+                      'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
-         'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
+                      'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
-         'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
+                      'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
-         'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
+                      'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
-         'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
+                      'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
-         'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
+                      'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
-         'origin', 'overline-position', 'overline-thickness', 'panose-1',
+                      'opacity', 'orient', 'origin', 'overline-position',
-         'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
+                      'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
-         'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
+                      'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
-         'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
+                      'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
-         'stemh', 'stemv', 'stop-color', 'stop-opacity',
+                      'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
-         'strikethrough-position', 'strikethrough-thickness', 'stroke',
+                      'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
-         'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
+                      'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
                      'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
                      'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
                      'transform', 'type', 'u1', 'u2', 'underline-position',
                      'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
                      'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
                      'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
-         'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
+                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
-         'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
+                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
-         'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
+                      'y1', 'y2', 'zoomAndPan']
-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
+    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
                       'xlink:href', 'xml:base']
    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
-      'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
+                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
                               'mask', 'stroke']
-    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
+    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
-      'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
+                            'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
-      'radialGradient', 'textpath', 'tref', 'set', 'use']
+                            'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
                            'set', 'use']
    acceptable_css_properties = ['azimuth', 'background-color',
                                 'border-bottom-color', 'border-collapse', 'border-color',
@ -140,20 +160,35 @@ class HTMLSanitizerMixin(object):
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
-        if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"], 
+
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in list(tokenTypes.keys()):
            token_type = tokenTypes[token_type]
        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                          tokenTypes["EmptyTag"]):
            if token["name"] in self.allowed_elements:
-                if token.has_key("data"):
+                return self.allowed_token(token, token_type)
            else:
                return self.disallowed_token(token, token_type)
        elif token_type == tokenTypes["Comment"]:
            pass
        else:
            return token
    def allowed_token(self, token, token_type):
        if "data" in token:
            attrs = dict([(name, val) for name, val in
                          token["data"][::-1]
                          if name in self.allowed_attributes])
            for attr in self.attr_val_is_uri:
-                        if not attrs.has_key(attr):
+                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
-                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
+                val_unescaped = val_unescaped.replace("\ufffd", "")
                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
@ -167,26 +202,28 @@ class HTMLSanitizerMixin(object):
                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
                                                    attrs['xlink:href'])):
                del attrs['xlink:href']
-                    if attrs.has_key('style'):
+            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
-                    token["data"] = [[name,val] for name,val in attrs.items()]
+            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
-            else:
+
-                if token["type"] == tokenTypes["EndTag"]:
+    def disallowed_token(self, token, token_type):
        if token_type == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
            token["data"] = "<%s%s>" % (token["name"], attrs)
        else:
            token["data"] = "<%s>" % token["name"]
-                if token["selfClosing"]:
+        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"
-                token["type"] = tokenTypes["Characters"]
+
-                del token["name"]
+        if token["type"] in list(tokenTypes.keys()):
-                return token
+            token["type"] = "Characters"
        elif token["type"] == tokenTypes["Comment"]:
            pass
        else:
            token["type"] = tokenTypes["Characters"]
        del token["name"]
        return token
    def sanitize_css(self, style):
@ -194,12 +231,15 @@ class HTMLSanitizerMixin(object):
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
        # gauntlet
-        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
+        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
+            return ''
        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''
        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
-          if not value: continue
+            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
@ -215,13 +255,14 @@ class HTMLSanitizerMixin(object):
        return ' '.join(clean)
 class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=False, lowercaseAttrName=False):
+                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName)
+                               lowercaseElementName, lowercaseAttrName, parser=parser)
    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
--- a/src/html5lib/serializer/init.py
+++ b/src/html5lib/serializer/init.py
@ -1,17 +1,16 @@
 from __future__ import absolute_import, division, unicode_literals
-from html5lib import treewalkers
+from .. import treewalkers
-from htmlserializer import HTMLSerializer
+from .htmlserializer import HTMLSerializer
 from xhtmlserializer import XHTMLSerializer
-def serialize(input, tree="simpletree", format="html", encoding=None,
+
 def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
-        raise ValueError, "type must be either html or xhtml"
+        raise ValueError("type must be html")
    return s.render(walker(input), encoding)
--- a/src/html5lib/serializer/htmlserializer.py
+++ b/src/html5lib/serializer/htmlserializer.py
@ -1,18 +1,20 @@
-try:
+from __future__ import absolute_import, division, unicode_literals
-    frozenset
+from six import text_type
 except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
 import gettext
 _ = gettext.gettext
-from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
+try:
-from html5lib.constants import rcdataElements
+    from functools import reduce
 except ImportError:
    pass
 from ..constants import voidElements, booleanAttributes, spaceCharacters
 from ..constants import rcdataElements, entities, xmlEntities
 from .. import utils
 from xml.sax.saxutils import escape
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 try:
    from codecs import register_error, xmlcharrefreplace_errors
@ -21,27 +23,48 @@ except ImportError:
 else:
    unicode_encode_errors = "htmlentityreplace"
    from html5lib.constants import entities
    encode_entity_map = {}
-    for k, v in entities.items():
+    is_ucs4 = len("\U0010FFFF") == 1
-        if v != "&" and encode_entity_map.get(v) != k.lower():
+    for k, v in list(entities.items()):
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
            continue
        if v != "&":
            if len(v) == 2:
                v = utils.surrogatePairToCodepoint(v)
            else:
                v = ord(v)
            if not v in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k
    def htmlentityreplace_errors(exc):
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
-            for c in exc.object[exc.start:exc.end]:
+            codepoints = []
-                e = encode_entity_map.get(c)
+            skip = False
            for i, c in enumerate(exc.object[exc.start:exc.end]):
                if skip:
                    skip = False
                    continue
                index = i + exc.start
                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                    skip = True
                else:
                    codepoint = ord(c)
                codepoints.append(codepoint)
            for cp in codepoints:
                e = encode_entity_map.get(cp)
                if e:
                    res.append("&")
                    res.append(e)
                    if not e.endswith(";"):
                        res.append(";")
                else:
-                    res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
+                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return (u"".join(res), exc.end)
+            return ("".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)
@ -49,125 +72,185 @@ else:
    del register_error
 def encode(text, encoding):
    """Encode ``text`` with ``encoding``.

    Unencodable characters are passed to the error handler named by the
    module-level ``unicode_encode_errors``.
    """
    encoded = text.encode(encoding, unicode_encode_errors)
    return encoded
 class HTMLSerializer(object):
    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True
    minimize_boolean_attributes = True
    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True
    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True
    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False
    omit_optional_tags = True
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
-          "minimize_boolean_attributes", "use_trailing_solidus",
+               "omit_optional_tags", "minimize_boolean_attributes",
-          "space_before_trailing_solidus", "omit_optional_tags",
+               "use_trailing_solidus", "space_before_trailing_solidus",
-          "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
+               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
-          "escape_rcdata", 'use_trailing_solidus', "sanitize")
+               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")
    def __init__(self, **kwargs):
-        if kwargs.has_key('quote_char'):
+        """Initialize HTMLSerializer.
        Keyword options (default given first unless specified) include:
        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of the
          document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.
        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False
    def encode(self, string):
        """Return ``string`` encoded with ``self.encoding``, if one is set.

        Encoding goes through the module-level ``unicode_encode_errors``
        error handler. When ``self.encoding`` is falsy the text is returned
        unchanged.
        """
        assert isinstance(string, text_type)
        target = self.encoding
        if not target:
            return string
        return string.encode(target, unicode_encode_errors)
    def encodeStrict(self, string):
        """Return ``string`` encoded with ``self.encoding`` using "strict" errors.

        When ``self.encoding`` is falsy the text is returned unchanged.
        """
        assert isinstance(string, text_type)
        target = self.encoding
        if not target:
            return string
        return string.encode(target, "strict")
    def serialize(self, treewalker, encoding=None):
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
-            from html5lib.filters.inject_meta_charset import Filter
+            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
-        # XXX: WhitespaceFilter should be used before OptionalTagFilter
+        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
-            from html5lib.filters.whitespace import Filter
+            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
-            from html5lib.filters.sanitizer import Filter
+            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
-            from html5lib.filters.optionaltags import Filter
+            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
-                doctype = u"<!DOCTYPE %s" % token["name"]
+                doctype = "<!DOCTYPE %s" % token["name"]
                if token["publicId"]:
-                    doctype += u' PUBLIC "%s"' % token["publicId"]
+                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
-                    doctype += u" SYSTEM"
+                    doctype += " SYSTEM"
                if token["systemId"]:
-                    if token["systemId"].find(u'"') >= 0:
+                    if token["systemId"].find('"') >= 0:
-                        if token["systemId"].find(u"'") >= 0:
+                        if token["systemId"].find("'") >= 0:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
-                        quote_char = u"'"
+                        quote_char = "'"
                    else:
-                        quote_char = u'"'
+                        quote_char = '"'
-                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
+                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
-                doctype += u">"
+                doctype += ">"
-                
+                yield self.encodeStrict(doctype)
                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype
            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
-                    if encoding:
+                    yield self.encode(token["data"])
                        yield token["data"].encode(encoding, "strict")
                else:
-                        yield token["data"]
+                    yield self.encode(escape(token["data"]))
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])
            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                attrs = token["data"]
+                for (attr_namespace, attr_name), attr_value in token["data"].items():
-                if hasattr(attrs, "items"):
+                    # TODO: Add namespace support here
-                    attrs = attrs.items()
+                    k = attr_name
-                attrs.sort()
+                    v = attr_value
-                attributes = []
+                    yield self.encodeStrict(' ')
                for k,v in attrs:
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')
-                    attributes.append(k)
+                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
-                      (k not in booleanAttributes.get(name, tuple()) \
+                        (k not in booleanAttributes.get(name, tuple())
                         and k not in booleanAttributes.get("", tuple())):
-                        attributes.append("=")
+                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x, y: x or (y in v),
                                                spaceCharacters + ">\"'=", False)
                        v = v.replace("&", "&amp;")
-                        if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
+                        if self.escape_lt_in_attrs:
-                        if encoding:
+                            v = v.replace("<", "&lt;")
                            v = encode(v, encoding)
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
@ -179,20 +262,17 @@ class HTMLSerializer(object):
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
-                            attributes.append(quote_char)
+                            yield self.encodeStrict(quote_char)
-                            attributes.append(v)
+                            yield self.encode(v)
-                            attributes.append(quote_char)
+                            yield self.encodeStrict(quote_char)
                        else:
-                            attributes.append(v)
+                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
-                        attributes.append(" /")
+                        yield self.encodeStrict(" /")
                    else:
-                        attributes.append("/")
+                        yield self.encodeStrict("/")
-                if encoding:
+                yield self.encode(">")
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))
            elif type == "EndTag":
                name = token["name"]
@ -200,28 +280,33 @@ class HTMLSerializer(object):
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                end_tag = u"</%s>" % name
+                yield self.encodeStrict("</%s>" % name)
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag
            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
-                comment = u"<!--%s-->" % token["data"]
+                yield self.encodeStrict("<!--%s-->" % token["data"])
-                if encoding:
+
-                    comment = comment.encode(encoding, unicode_encode_errors)
+            elif type == "Entity":
-                yield comment
+                name = token["name"]
                key = name + ";"
                if not key in entities:
                    self.serializeError(_("Entity %s not recognized" % name))
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)
            else:
                self.serializeError(token["data"])
    def render(self, treewalker, encoding=None):
        if encoding:
-            return "".join(list(self.serialize(treewalker, encoding)))
+            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
-            return u"".join(list(self.serialize(treewalker)))
+            return "".join(list(self.serialize(treewalker)))
    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
@ -229,6 +314,7 @@ class HTMLSerializer(object):
        if self.strict:
            raise SerializeError
 class SerializeError(Exception):
    """Error in serialized tree.

    Raised by HTMLSerializer.serializeError() when the serializer is in
    strict mode.
    """
    # This has to be a class, not a function: ``raise SerializeError`` needs
    # an exception type, and callers need something they can ``except``.
--- a/src/html5lib/serializer/xhtmlserializer.py
+++ b/src/html5lib/serializer/xhtmlserializer.py
@ -1,9 +0,0 @@
 from htmlserializer import HTMLSerializer
 class XHTMLSerializer(HTMLSerializer):
    """HTMLSerializer with its option defaults switched for XHTML output.

    Only class-level defaults are overridden; all behaviour is inherited.
    """
    # Quote every attribute value, even when quoting is not required.
    quote_attr_values = True
    # Write boolean attributes in full instead of the minimized form.
    minimize_boolean_attributes = False
    # Close void elements with a trailing slash (e.g. <hr/>).
    use_trailing_solidus = True
    # Escape "<" inside attribute values.
    escape_lt_in_attrs = True
    # Keep start/end tags that could otherwise be omitted.
    omit_optional_tags = False
    # Escape the contents of rcdata elements like normal element content.
    escape_rcdata = True
--- a/src/html5lib/tokenizer.py
+++ b/src/html5lib/tokenizer.py
--- a/src/html5lib/tokenizer_old.py
+++ b/src/html5lib/tokenizer_old.py
--- a/src/html5lib/treeadapters/init.py
+++ b/src/html5lib/treeadapters/init.py
--- a/src/html5lib/treeadapters/sax.py
+++ b/src/html5lib/treeadapters/sax.py
@ -0,0 +1,44 @@
 from __future__ import absolute_import, division, unicode_literals
 from xml.sax.xmlreader import AttributesNSImpl
 from ..constants import adjustForeignAttributes, unadjustForeignAttributes
 # Map every non-None prefix found in adjustForeignAttributes to its
 # namespace. to_sax() declares these prefix mappings on the handler before
 # walking the tokens and ends them afterwards.
 prefix_mapping = {}
 for prefix, localName, namespace in adjustForeignAttributes.values():
    if prefix is not None:
        prefix_mapping[prefix] = namespace
 def to_sax(walker, handler):
    """Feed the tokens of treewalker ``walker`` to a SAX-like ``handler``.

    Emits startDocument()/endDocument() around the token stream and a
    start/endPrefixMapping() pair for every entry of ``prefix_mapping``.
    Doctype and Comment tokens produce no events; any token type that is
    not handled below trips an assertion.
    """
    handler.startDocument()
    for pfx, uri in prefix_mapping.items():
        handler.startPrefixMapping(pfx, uri)
    for token in walker:
        token_type = token["type"]
        if token_type in ("Doctype", "Comment"):
            # Nothing is reported to the handler for these tokens.
            continue
        if token_type in ("StartTag", "EmptyTag"):
            attributes = AttributesNSImpl(token["data"],
                                          unadjustForeignAttributes)
            qualified = (token["namespace"], token["name"])
            handler.startElementNS(qualified, token["name"], attributes)
            if token_type == "EmptyTag":
                # An empty tag is reported as an immediate start/end pair.
                handler.endElementNS(qualified, token["name"])
        elif token_type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif token_type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        else:
            assert False, "Unknown token type"
    for pfx in prefix_mapping:
        handler.endPrefixMapping(pfx)
    handler.endDocument()
--- a/src/html5lib/treebuilders/init.py
+++ b/src/html5lib/treebuilders/init.py
@ -7,7 +7,7 @@ implement several things:
 1) A set of classes for various types of elements: Document, Doctype,
 Comment, Element. These must implement the interface of
 _base.treebuilders.Node (although comment nodes have a different
-signature for their constructor, see treebuilders.simpletree.Comment)
+signature for their constructor, see treebuilders.etree.Comment)
 Textual content may also be implemented as another node type, or not, as
 your tree implementation requires.
@ -24,69 +24,53 @@ getDocument - Returns the root node of the complete document tree
 testSerializer method on your treebuilder which accepts a node and
 returns a string containing Node and its children serialized according
 to the format used in the unittests
 The supplied simpletree module provides a python-only implementation
 of a full treebuilder and is a useful reference for the semantics of
 the various methods.
 """
 from __future__ import absolute_import, division, unicode_literals
 from ..utils import default_etree
 treeBuilderCache = {}
 def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support
    treeType - the name of the tree type required (case-insensitive). Supported
-               values are "simpletree", "dom", "etree" and "beautifulsoup"
+               values are:
               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - A generic builder for DOM implementations, defaulting to
-                        a xml.dom.minidom based implementation for the sake of
+                       a xml.dom.minidom based implementation.
                        backwards compatibility (as releases up until 0.10 had a
                        builder called "dom" that was a minidom implemenation).
               "etree" - A generic builder for tree implementations exposing an
-                          elementtree-like interface (known to work with
+                         ElementTree-like interface, defaulting to
-                          ElementTree, cElementTree and lxml.etree).
+                         xml.etree.cElementTree if available and
-                "beautifulsoup" - Beautiful soup (if installed)
+                         xml.etree.ElementTree if not.
               "lxml" - A etree-based builder for lxml.etree, handling
                        limitations of lxml's implementation.
    implementation - (Currently applies to the "etree" and "dom" tree types). A
                      module implementing the tree type e.g.
-                      xml.etree.ElementTree or lxml.etree."""
+                      xml.etree.ElementTree or xml.etree.cElementTree."""
    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
-            import dom
+            from . import dom
-            # XXX: Keep backwards compatibility by using minidom if no implementation is given
+            # Come up with a sane default (pref. from the stdlib)
-            if implementation == None:
+            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
-            # XXX: NEVER cache here, caching is done in the dom submodule
+            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "simpletree":
            import simpletree
            treeBuilderCache[treeType] = simpletree.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "lxml":
-            import etree_lxml
+            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
-            # Come up with a sane default
+            from . import etree
-            if implementation == None:
+            if implementation is None:
-                try:
+                implementation = default_etree
-                    import xml.etree.cElementTree as ET
+            # NEVER cache here, caching is done in the etree submodule
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import cElementTree as ET
                        except ImportError:
                            import elementtree.ElementTree as ET
                implementation = ET
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
--- a/src/html5lib/treebuilders/_base.py
+++ b/src/html5lib/treebuilders/_base.py
@ -1,16 +1,25 @@
-from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
+from __future__ import absolute_import, division, unicode_literals
-try:
+from six import text_type
    frozenset
 except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
-# The scope markers are inserted when entering buttons, object elements,
+from ..constants import scopingElements, tableInsertModeElements, namespaces
 # The scope markers are inserted when entering object elements,
 # marquees, table cells, and table captions, and are used to prevent formatting
-# from "leaking" into tables, buttons, object elements, and marquees.
+# from "leaking" into tables, object elements, and marquees.
 Marker = None
 # Scope tables consulted by TreeBuilder.elementInScope, keyed by scope
 # variant. Each value is a pair (set of (namespace, name) tuples, invert):
 # with invert False, reaching a listed element ends the search; with invert
 # True, reaching any element that is NOT listed ends it.
 listElementsMap = {
    None: (frozenset(scopingElements), False),
    "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
    "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
                                              (namespaces["html"], "ul")])), False),
    "table": (frozenset([(namespaces["html"], "html"),
                         (namespaces["html"], "table")]), False),
    "select": (frozenset([(namespaces["html"], "optgroup"),
                          (namespaces["html"], "option")]), True)
 }
 class Node(object):
    def __init__(self, name):
        """Node representing an item in the tree.
@ -30,10 +39,10 @@ class Node(object):
        self.childNodes = []
        self._flags = []
-    def __unicode__(self):
+    def __str__(self):
        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
                                  for name, value in
-                                   self.attributes.iteritems()])
+                                  self.attributes.items()])
        if attributesStr:
            return "<%s %s>" % (self.name, attributesStr)
        else:
@ -80,12 +89,36 @@ class Node(object):
        """
        raise NotImplementedError
    def hasContent(self):
        """Return true if the node has children or text, false otherwise
        """
        raise NotImplementedError
 class ActiveFormattingElements(list):
    """List of active formatting elements that limits repeated entries.

    On append, the list is scanned backwards from the end, stopping at the
    first Marker; if a third entry equal to the new node is found, that
    entry is removed before the new node is added.
    """
    def append(self, node):
        """Append ``node``, first removing the third equal entry found, if any."""
        if node != Marker:
            matches = 0
            for candidate in reversed(self):
                if candidate == Marker:
                    break
                if self.nodesEqual(candidate, node):
                    matches += 1
                if matches == 3:
                    # The loop ends right after this removal, so mutating
                    # the list while iterating it is safe here.
                    self.remove(candidate)
                    break
        list.append(self, node)
    def nodesEqual(self, node1, node2):
        """Return True when both nodes have equal nameTuple and attributes."""
        if node1.nameTuple == node2.nameTuple:
            if node1.attributes == node2.attributes:
                return True
        return False
 class TreeBuilder(object):
    """Base treebuilder implementation
    documentClass - the class to use for the bottommost node of a document
@ -118,7 +151,7 @@ class TreeBuilder(object):
    def reset(self):
        self.openElements = []
-        self.activeFormattingElements = []
+        self.activeFormattingElements = ActiveFormattingElements()
        # XXX - rename these to headElement, formElement
        self.headPointer = None
@ -129,20 +162,18 @@ class TreeBuilder(object):
        self.document = self.documentClass()
    def elementInScope(self, target, variant=None):
-        # Exit early when possible.
+
-        listElementsMap = {
+        # If we pass a node in we match that. if we pass a string
-            None:scopingElements,
+        # match any node with that name
-            "list":scopingElements | set([(namespaces["html"], "ol"),
+        exactNode = hasattr(target, "nameTuple")
-                                          (namespaces["html"], "ul")]),
+
-            "table":set([(namespaces["html"], "html"),
+        listElements, invert = listElementsMap[variant]
                         (namespaces["html"], "table")])
            }
        listElements = listElementsMap[variant]
        for node in reversed(self.openElements):
-            if node.name == target:
+            if (node.name == target and not exactNode or
                    node == target and exactNode):
                return True
-            elif node.nameTuple in listElements:
+            elif (invert ^ (node.nameTuple in listElements)):
                return False
        assert False  # We should never reach this point
@ -254,6 +285,7 @@ class TreeBuilder(object):
    def insertElementNormal(self, token):
        name = token["name"]
        assert isinstance(name, text_type), "Element %s not unicode" % name
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
@ -321,7 +353,7 @@ class TreeBuilder(object):
    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
-        if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
+        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
                and name != exclude):
            self.openElements.pop()
            # XXX This is not entirely what the specification says. We should
--- a/src/html5lib/treebuilders/dom.py
+++ b/src/html5lib/treebuilders/dom.py
@ -1,40 +1,38 @@
 from __future__ import absolute_import, division, unicode_literals
-from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
+
-import new
+from xml.dom import minidom, Node
 import re
 import weakref
-import _base
+from . import _base
-from html5lib import constants, ihatexml
+from .. import constants
-from html5lib.constants import namespaces
+from ..constants import namespaces
 from ..utils import moduleFactoryFactory
 moduleCache = {}
 def getDomModule(DomImplementation):
    """Return a synthetic module exposing the builder for a DOM implementation.

    The module is named ``"_" + DomImplementation.__name__ + "builder"`` and
    is populated with the mapping returned by ``getDomBuilder``.  Modules are
    memoised in ``moduleCache`` under that name, so repeated calls for the
    same implementation return the same module object.
    """
    # ``types.ModuleType`` is the portable spelling of the old ``new.module``;
    # the ``new`` module is not available on Python 3.
    import types

    name = "_" + DomImplementation.__name__ + "builder"
    try:
        return moduleCache[name]
    except KeyError:
        pass

    # str() keeps the module name a native string even when the literal
    # pieces above are unicode (``unicode_literals`` on Python 2).
    mod = types.ModuleType(str(name))
    mod.__dict__.update(getDomBuilder(DomImplementation))
    moduleCache[name] = mod
    return mod
 def getDomBuilder(DomImplementation):
    Dom = DomImplementation
-    class AttrList:
+
    class AttrList(object):
        def __init__(self, element):
            self.element = element
        def __iter__(self):
-            return self.element.attributes.items().__iter__()
+            return list(self.element.attributes.items()).__iter__()
        def __setitem__(self, name, value):
            self.element.setAttribute(name, value)
        def __len__(self):
            return len(list(self.element.attributes.items()))
        def items(self):
            return [(item[0], item[1]) for item in
-                     self.element.attributes.items()]
+                    list(self.element.attributes.items())]
        def keys(self):
-            return self.element.attributes.keys()
+            return list(self.element.attributes.keys())
        def __getitem__(self, name):
            return self.element.getAttribute(name)
@ -84,7 +82,7 @@ def getDomBuilder(DomImplementation):
        def setAttributes(self, attributes):
            if attributes:
-                for name, value in attributes.items():
+                for name, value in list(attributes.items()):
                    if isinstance(name, tuple):
                        if name[0] is not None:
                            qualifiedName = (name[0] + ":" + name[1])
@ -104,7 +102,7 @@ def getDomBuilder(DomImplementation):
            return self.element.hasChildNodes()
        def getNameTuple(self):
-            if self.namespace == None:
+            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name
@ -155,7 +153,7 @@ def getDomBuilder(DomImplementation):
        def insertText(self, data, parent=None):
            data = data
-            if parent <> self:
+            if parent != self:
                _base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
@ -165,19 +163,21 @@ def getDomBuilder(DomImplementation):
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))
        implementation = DomImplementation
        name = None
    def testSerializer(element):
        element.normalize()
        rv = []
        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
-                        rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
-                                ' '*indent, element.name, publicId, systemId))
+                                  (' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
                else:
@ -192,16 +192,16 @@ def getDomBuilder(DomImplementation):
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
-                    element.namespaceURI != None):
+                        element.namespaceURI is not None):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
-                    i = 0
+                    attributes = []
                    for i in range(len(element.attributes)):
                        attr = element.attributes.item(i)
                    while attr:
                        name = attr.nodeName
                        value = attr.value
                        ns = attr.namespaceURI
@ -209,9 +209,9 @@ def getDomBuilder(DomImplementation):
                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
                        else:
                            name = attr.nodeName
-                        i += 1
+                        attributes.append((name, value))
                        attr = element.attributes.item(i)
                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
@ -220,67 +220,8 @@ def getDomBuilder(DomImplementation):
        return "\n".join(rv)
    def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
        """Replay the DOM subtree rooted at ``node`` as SAX events on ``handler``.

        node    -- a DOM node; elements, text/CDATA, documents and document
                   fragments produce events, every other node type is ignored.
        handler -- object providing the SAX content-handler methods called
                   below (startElement/endElement, startElementNS/endElementNS,
                   startPrefixMapping/endPrefixMapping, characters,
                   startDocument/endDocument).
        nsmap   -- prefix -> namespace URI mapping in effect for ``node``.
                   A false value (e.g. ``{}``) selects the non-namespace
                   ``startElement``/``endElement`` events.  The shared default
                   dict is never modified: a copy is taken before any
                   declaration is added.
        """
        if node.nodeType == Node.ELEMENT_NODE:
            if not nsmap:
                handler.startElement(node.nodeName, node.attributes)
                for child in node.childNodes:
                    dom2sax(child, handler, nsmap)
                handler.endElement(node.nodeName)
            else:
                attributes = dict(node.attributes.itemsNS())

                # gather namespace declarations
                prefixes = []
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if (attr.namespaceURI == XMLNS_NAMESPACE or
                            (attr.namespaceURI is None and
                             attr.nodeName.startswith('xmlns'))):
                        prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
                        handler.startPrefixMapping(prefix, attr.nodeValue)
                        prefixes.append(prefix)
                        # Copy before mutating so the caller's (and the
                        # default) mapping stays untouched.
                        nsmap = nsmap.copy()
                        nsmap[prefix] = attr.nodeValue
                        # NOTE(review): ``attributes`` is keyed by whatever
                        # itemsNS() yields -- confirm that matches
                        # (namespaceURI, nodeName) for prefixed attributes.
                        del attributes[(attr.namespaceURI, attr.nodeName)]

                # apply namespace declarations
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if attr.namespaceURI is None and ':' in attr.nodeName:
                        prefix = attr.nodeName.split(':')[0]
                        # ``in`` instead of dict.has_key(), which is gone
                        # on Python 3.
                        if prefix in nsmap:
                            del attributes[(attr.namespaceURI, attr.nodeName)]
                            attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

                # SAX events
                ns = node.namespaceURI or nsmap.get(None, None)
                handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
                for child in node.childNodes:
                    dom2sax(child, handler, nsmap)
                handler.endElementNS((ns, node.nodeName), node.nodeName)
                for prefix in prefixes:
                    handler.endPrefixMapping(prefix)
        elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
            handler.characters(node.nodeValue)
        elif node.nodeType == Node.DOCUMENT_NODE:
            handler.startDocument()
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endDocument()
        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
        else:
            # ATTRIBUTE_NODE
            # ENTITY_NODE
            # PROCESSING_INSTRUCTION_NODE
            # COMMENT_NODE
            # DOCUMENT_TYPE_NODE
            # NOTATION_NODE
            pass
    return locals()
-# Keep backwards compatibility with things that directly load 
+
-# classes/functions from this module
+# The actual means to get a module!
-for key, value in getDomModule(minidom).__dict__.items():
+getDomModule = moduleFactoryFactory(getDomBuilder)
 	globals()[key] = value
--- a/src/html5lib/treebuilders/etree.py
+++ b/src/html5lib/treebuilders/etree.py
@ -1,28 +1,21 @@
-import new
+from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 import re
-import _base
+from . import _base
-from html5lib import ihatexml
+from .. import ihatexml
-from html5lib import constants
+from .. import constants
-from html5lib.constants import namespaces
+from ..constants import namespaces
 from ..utils import moduleFactoryFactory
 tag_regexp = re.compile("{([^}]*)}(.*)")
 moduleCache = {}
 def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a synthetic module exposing the builder for an ElementTree implementation.

    The module is named ``"_" + ElementTreeImplementation.__name__ + "builder"``
    and is populated with the mapping returned by
    ``getETreeBuilder(ElementTreeImplementation, fullTree)``.

    Modules are memoised in ``moduleCache``.  The cache key includes the
    truth value of ``fullTree`` so that a request for a full-tree builder is
    never answered with a module that was built with the other setting.
    """
    # ``types.ModuleType`` is the portable spelling of the old ``new.module``;
    # the ``new`` module is not available on Python 3.
    import types

    name = "_" + ElementTreeImplementation.__name__ + "builder"
    cacheKey = (name, bool(fullTree))
    try:
        return moduleCache[cacheKey]
    except KeyError:
        pass

    # str() keeps the module name a native string even when the literal
    # pieces above are unicode (``unicode_literals`` on Python 2).
    mod = types.ModuleType(str(name))
    mod.__dict__.update(getETreeBuilder(ElementTreeImplementation, fullTree))
    moduleCache[cacheKey] = mod
    return mod
 def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag
    class Element(_base.Node):
        def __init__(self, name, namespace=None):
            self._name = name
@ -68,9 +61,9 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        def _setAttributes(self, attributes):
            # Delete existing attributes first
            # XXX - there may be a better way to do this...
-            for key in self._element.attrib.keys():
+            for key in list(self._element.attrib.keys()):
                del self._element.attrib[key]
-            for key, value in attributes.iteritems():
+            for key, value in attributes.items():
                if isinstance(key, tuple):
                    name = "{%s}%s" % (key[2], key[1])
                else:
@ -81,6 +74,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        def _getChildNodes(self):
            return self._childNodes
        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
@ -91,7 +85,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        def hasContent(self):
            """Return true if the node has children or text"""
-            return bool(self._element.text or self._element.getchildren())
+            return bool(self._element.text or len(self._element))
        def appendChild(self, node):
            self._childNodes.append(node)
@ -99,7 +93,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
            node.parent = self
        def insertBefore(self, node, refNode):
-            index = self._element.getchildren().index(refNode._element)
+            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self
@ -119,7 +113,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
                self._element[-1].tail += data
            else:
                # Insert the text before the specified node
-                children = self._element.getchildren()
+                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index - 1].tail:
@ -131,8 +125,8 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
                    self._element.text += data
        def cloneNode(self):
-            element = Element(self.name, self.namespace)
+            element = type(self)(self.name, self.namespace)
-            for name, value in self.attributes.iteritems():
+            for name, value in self.attributes.items():
                element.attributes[name] = value
            return element
@ -172,34 +166,34 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
            self.systemId = systemId
        def _getPublicId(self):
-            return self._element.get(u"publicId", "")
+            return self._element.get("publicId", "")
        def _setPublicId(self, value):
            if value is not None:
-                self._element.set(u"publicId", value)
+                self._element.set("publicId", value)
        publicId = property(_getPublicId, _setPublicId)
        def _getSystemId(self):
-            return self._element.get(u"systemId", "")
+            return self._element.get("systemId", "")
        def _setSystemId(self, value):
            if value is not None:
-                self._element.set(u"systemId", value)
+                self._element.set("systemId", value)
        systemId = property(_getSystemId, _setSystemId)
    class Document(Element):
        def __init__(self):
-            Element.__init__(self, "<DOCUMENT_ROOT>") 
+            Element.__init__(self, "DOCUMENT_ROOT")
    class DocumentFragment(Element):
        def __init__(self):
-            Element.__init__(self, "<DOCUMENT_FRAGMENT>")
+            Element.__init__(self, "DOCUMENT_FRAGMENT")
    def testSerializer(element):
        rv = []
-        finalText = None
+
        def serializeElement(element, indent=0):
            if not(hasattr(element, "tag")):
                element = element.getroot()
@ -207,19 +201,23 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
-                    rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
+                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
-                            element.text, publicId, systemId))
+                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
-            elif element.tag == "<DOCUMENT_ROOT>":
+            elif element.tag == "DOCUMENT_ROOT":
                rv.append("#document")
-                if element.text:
+                if element.text is not None:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
-                if element.tail:
+                if element.tail is not None:
-                    finalText = element.tail
+                    raise TypeError("Document node cannot have tail")
-            elif type(element.tag) == type(ElementTree.Comment):
+                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")
            elif element.tag == ElementTreeCommentType:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert isinstance(element.tag, text_type), \
                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
                nsmatch = tag_regexp.match(element.tag)
                if nsmatch is None:
@ -231,54 +229,59 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
                rv.append("|%s<%s>" % (' ' * indent, name))
                if hasattr(element, "attrib"):
-                    for name, value in element.attrib.iteritems():
+                    attributes = []
                    for name, value in element.attrib.items():
                        nsmatch = tag_regexp.match(name)
                        if nsmatch is not None:
                            ns, name = nsmatch.groups()
                            prefix = constants.prefixes[ns]
-                            name = "%s %s"%(prefix, name)
+                            attr_string = "%s %s" % (prefix, name)
                        else:
                            attr_string = name
                        attributes.append((attr_string, value))
                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
-            for child in element.getchildren():
+            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
        serializeElement(element, 0)
        if finalText is not None:
            rv.append("|%s\"%s\""%(' '*2, finalText))
        return "\n".join(rv)
    def tostring(element):
        """Serialize an element and its child nodes to a string"""
        rv = []
        finalText = None
        filter = ihatexml.InfosetFilter()
        def serializeElement(element):
-            if type(element) == type(ElementTree.ElementTree):
+            if isinstance(element, ElementTree.ElementTree):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
-                    rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
+                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
-                            element.text, publicId, systemId))
+                              (element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
-            elif element.tag == "<DOCUMENT_ROOT>":
+            elif element.tag == "DOCUMENT_ROOT":
-                if element.text:
+                if element.text is not None:
                    rv.append(element.text)
-                if element.tail:
+                if element.tail is not None:
-                    finalText = element.tail
+                    raise TypeError("Document node cannot have tail")
                if hasattr(element, "attrib") and len(element.attrib):
                    raise TypeError("Document node cannot have attributes")
-                for child in element.getchildren():
+                for child in element:
                    serializeElement(child)
-            elif type(element.tag) == type(ElementTree.Comment):
+            elif element.tag == ElementTreeCommentType:
                rv.append("<!--%s-->" % (element.text,))
            else:
                # This is assumed to be an ordinary element
@ -287,12 +290,12 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        filter.fromXmlName(name), value)
-                                     for name, value in element.attrib.iteritems()])
+                        for name, value in element.attrib.items()])
                    rv.append("<%s %s>" % (element.tag, attr))
                if element.text:
                    rv.append(element.text)
-                for child in element.getchildren():
+                for child in element:
                    serializeElement(child)
                rv.append("</%s>" % (element.tag,))
@ -302,9 +305,6 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        serializeElement(element)
        if finalText is not None:
            rv.append("%s\""%(' '*2, finalText))
        return "".join(rv)
    class TreeBuilder(_base.TreeBuilder):
@ -313,6 +313,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment
        implementation = ElementTreeImplementation
        def testSerializer(self, element):
            return testSerializer(element)
@ -320,6 +321,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
        def getDocument(self):
            if fullTree:
                return self.document._element
            else:
                if self.defaultNamespace is not None:
                    return self.document._element.find(
                        "{%s}html" % self.defaultNamespace)
                else:
                    return self.document._element.find("html")
@ -327,3 +332,6 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
            return _base.TreeBuilder.getFragment(self)._element
    return locals()
 getETreeModule = moduleFactoryFactory(getETreeBuilder)
--- a/src/html5lib/treebuilders/etree_lxml.py
+++ b/src/html5lib/treebuilders/etree_lxml.py
@ -1,20 +1,3 @@
 import new
 import warnings
 import re
 import _base
 from html5lib.constants import DataLossWarning
 import html5lib.constants as constants
 import etree as etree_builders
 from html5lib import ihatexml
 try:
    import lxml.etree as etree
 except ImportError:
    pass
 fullTree = True
 """Module for supporting the lxml.etree library. The idea here is to use as much
 of the native library as possible, without using fragile hacks like custom element
 names that break between releases. The downside of this is that we cannot represent
@ -26,12 +9,34 @@ Docypes with no name
 When any of these things occur, we emit a DataLossWarning
 """
 from __future__ import absolute_import, division, unicode_literals
 import warnings
 import re
 import sys
 from . import _base
 from ..constants import DataLossWarning
 from .. import constants
 from . import etree as etree_builders
 from .. import ihatexml
 import lxml.etree as etree
 fullTree = True
 tag_regexp = re.compile("{([^}]*)}(.*)")
 comment_type = etree.Comment("asd").tag
 class DocumentType(object):
    """Plain record of a doctype: its name, public id and system id."""

    def __init__(self, name, publicId, systemId):
        # Store the three constructor arguments verbatim as attributes.
        self.name, self.publicId, self.systemId = name, publicId, systemId
 class Document(object):
    def __init__(self):
        self._elementTree = None
@ -45,10 +50,12 @@ class Document(object):
    childNodes = property(_getChildNodes)
 def testSerializer(element):
    rv = []
    finalText = None
-    filter = ihatexml.InfosetFilter()
+    infosetFilter = ihatexml.InfosetFilter()
    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
@ -70,47 +77,52 @@ def testSerializer(element):
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
-            elif isinstance(element, basestring):
+            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
-        elif type(element.tag) == type(etree.Comment):
+        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
-                                        filter.fromXmlName(tag)))
+                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
-                                     filter.fromXmlName(element.tag)))
+                                       infosetFilter.fromXmlName(element.tag)))
            if hasattr(element, "attrib"):
-                for name, value in element.attrib.iteritems():
+                attributes = []
-                    nsmatch = etree_builders.tag_regexp.match(name)
+                for name, value in element.attrib.items():
-                    if nsmatch:
+                    nsmatch = tag_regexp.match(name)
-                        ns = nsmatch.group(1)
+                    if nsmatch is not None:
-                        name = nsmatch.group(2)
+                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
-                        rv.append('|%s%s %s="%s"' % (' '*(indent+2), 
+                        attr_string = "%s %s" % (prefix, name)
                                                  prefix,
                                                  filter.fromXmlName(name),
                                                  value))
                    else:
-                        rv.append('|%s%s="%s"' % (' '*(indent+2), 
+                        attr_string = infosetFilter.fromXmlName(name)
-                                                  filter.fromXmlName(name),
+                    attributes.append((attr_string, value))
-                                                  value))
+
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
-            for child in element.getchildren():
+            for child in element:
                serializeElement(child, indent)
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
@ -121,10 +133,12 @@ def testSerializer(element):
    return "\n".join(rv)
 def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []
    finalText = None
    def serializeElement(element):
        if not hasattr(element, "tag"):
            if element.docinfo.internalDTD:
@ -135,7 +149,7 @@ def tostring(element):
                rv.append(dtd_str)
            serializeElement(element.getroot())
-        elif type(element.tag) == type(etree.Comment):
+        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))
        else:
@ -144,12 +158,12 @@ def tostring(element):
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
-                                 for name, value in element.attrib.iteritems()])
+                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)
-            for child in element.getchildren():
+            for child in element:
                serializeElement(child)
            rv.append("</%s>" % (element.tag,))
@ -171,44 +185,45 @@ class TreeBuilder(_base.TreeBuilder):
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree
    def __init__(self, namespaceHTMLElements, fullTree=False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
-        filter = self.filter = ihatexml.InfosetFilter()
+        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements
        class Attributes(dict):
            def __init__(self, element, value={}):
                self._element = element
                dict.__init__(self, value)
-                for key, value in self.iteritems():
+                for key, value in self.items():
                    if isinstance(key, tuple):
-                        name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
+                        name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
                    else:
-                        name = filter.coerceAttribute(key)
+                        name = infosetFilter.coerceAttribute(key)
                    self._element._element.attrib[name] = value
            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                if isinstance(key, tuple):
-                    name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
+                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
                else:
-                    name = filter.coerceAttribute(key)
+                    name = infosetFilter.coerceAttribute(key)
                self._element._element.attrib[name] = value
        class Element(builder.Element):
            def __init__(self, name, namespace):
-                name = filter.coerceElement(name)
+                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)
            def _setName(self, name):
-                self._name = filter.coerceElement(name)
+                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)
            def _getName(self):
-                return filter.fromXmlName(self._name)
+                return infosetFilter.fromXmlName(self._name)
            name = property(_getName, _setName)
@ -221,20 +236,19 @@ class TreeBuilder(_base.TreeBuilder):
            attributes = property(_getAttributes, _setAttributes)
            def insertText(self, data, insertBefore=None):
-                data = filter.coerceCharacters(data)
+                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)
            def appendChild(self, child):
                builder.Element.appendChild(self, child)
        class Comment(builder.Comment):
            def __init__(self, data):
-                data = filter.coerceComment(data)
+                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)
            def _setData(self, data):
-                data = filter.coerceComment(data)
+                data = infosetFilter.coerceComment(data)
                self._element.text = data
            def _getData(self):
@ -267,7 +281,7 @@ class TreeBuilder(_base.TreeBuilder):
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
-        fragment.extend(element.getchildren())
+        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment
@ -277,15 +291,26 @@ class TreeBuilder(_base.TreeBuilder):
        publicId = token["publicId"]
        systemId = token["systemId"]
-        if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
+        if not name:
-            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
+            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
-        doctype = self.doctypeClass(name, publicId, systemId)
+            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype
    def insertCommentInitial(self, data, parent=None):
        """Queue the comment text *data* on ``self.initial_comments``.

        *parent* is accepted for signature compatibility and is not used.
        """
        self.initial_comments.append(data)
    def insertCommentMain(self, data, parent=None):
        """Insert a comment through the base TreeBuilder implementation.

        When *parent* is the document and the last child of the tree's root
        element is already a comment, a DataLossWarning is emitted first; the
        comment is then inserted regardless.
        """
        if parent == self.document:
            last_child = self.document._elementTree.getroot()[-1]
            if last_child.tag == comment_type:
                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)
    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
@ -293,20 +318,29 @@ class TreeBuilder(_base.TreeBuilder):
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
-        if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
+        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
-                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
+                docStr += (' PUBLIC "%s" ' %
-                                               self.doctype.systemId or "")
+                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        try:
        root = etree.fromstring(docStr)
        except etree.XMLSyntaxError:
            print docStr
            raise
        # Append the initial comments:
        for comment_token in self.initial_comments:
@ -332,4 +366,4 @@ class TreeBuilder(_base.TreeBuilder):
        self.openElements.append(root_element)
        # Reset to the default insert comment function
-        self.insertComment = super(TreeBuilder, self).insertComment
+        self.insertComment = self.insertCommentMain
--- a/src/html5lib/treebuilders/simpletree.py
+++ b/src/html5lib/treebuilders/simpletree.py
@ -1,248 +0,0 @@
 import _base
 from html5lib.constants import voidElements, namespaces, prefixes
 from xml.sax.saxutils import escape
 # Really crappy basic implementation of a DOM-core like thing
 class Node(_base.Node):
    """Basic DOM-like node: a name, a parent link and an ordered child list.

    Adjacent TextNode children are merged on insertion so that the tree
    never holds two text nodes side by side.
    """
    type = -1

    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.childNodes = []
        self._flags = []

    def __iter__(self):
        """Yield every descendant of this node, each child before its own
        descendants."""
        for node in self.childNodes:
            yield node
            for item in node:
                yield item

    def __unicode__(self):
        return self.name

    def toxml(self):
        raise NotImplementedError

    def printTree(self, indent=0):
        """Return this node and its subtree as an indented text dump."""
        pieces = ['\n|%s%s' % (' ' * indent, unicode(self))]
        pieces.extend(child.printTree(indent + 2)
                      for child in self.childNodes)
        return ''.join(pieces)

    def appendChild(self, node):
        """Append *node*, folding it into a trailing TextNode if both are
        text."""
        if (isinstance(node, TextNode) and self.childNodes and
                isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
        else:
            self.childNodes.append(node)
        node.parent = self

    def insertText(self, data, insertBefore=None):
        """Insert *data* as text, at the end or before *insertBefore*."""
        if insertBefore is None:
            self.appendChild(TextNode(data))
        else:
            self.insertBefore(TextNode(data), insertBefore)

    def insertBefore(self, node, refNode):
        """Insert *node* before *refNode*.

        Raises ValueError (from ``list.index``) when *refNode* is not a
        child of this node.
        """
        index = self.childNodes.index(refNode)
        if (isinstance(node, TextNode) and index > 0 and
                isinstance(self.childNodes[index - 1], TextNode)):
            self.childNodes[index - 1].value += node.value
        else:
            self.childNodes.insert(index, node)
        node.parent = self

    def removeChild(self, node):
        """Detach *node*; raises ValueError if it is not a child."""
        # The old bare ``except: raise`` wrapper only re-raised, so the
        # exception behaviour is unchanged without it.
        self.childNodes.remove(node)
        node.parent = None

    def cloneNode(self):
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self.childNodes)

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace.

        Reads ``self.namespace``, which this class's ``__init__`` does not
        set.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
 class Document(Node):
    """Root node of a tree; its children are the top-level nodes."""
    type = 1

    def __init__(self):
        Node.__init__(self, None)

    def __unicode__(self):
        return "#document"

    def appendChild(self, child):
        Node.appendChild(self, child)

    def toxml(self, encoding="utf-8"):
        """Return the concatenated ``toxml()`` of all children, encoded.

        The default encoding used to be the misspelt "utf=8", which made
        every call relying on the default fail with LookupError.
        """
        markup = "".join(child.toxml() for child in self.childNodes)
        return markup.encode(encoding)

    def hilite(self, encoding="utf-8"):
        """Return the children's ``hilite()`` markup wrapped in ``<pre>``,
        with the opening tag and content encoded."""
        markup = "<pre>" + "".join(child.hilite()
                                   for child in self.childNodes)
        return markup.encode(encoding) + "</pre>"

    def printTree(self):
        """Return a text dump of the document, children indented by two."""
        pieces = [unicode(self)]
        pieces.extend(child.printTree(2) for child in self.childNodes)
        return "".join(pieces)

    def cloneNode(self):
        return Document()
 class DocumentFragment(Document):
    """A Document variant that labels itself "#document-fragment"."""
    type = 2

    def __unicode__(self):
        return "#document-fragment"

    def cloneNode(self):
        """Return a new, empty DocumentFragment (children are not copied)."""
        return DocumentFragment()
 class DocumentType(Node):
    """Doctype node carrying a name plus public and system identifiers."""
    type = 3

    def __init__(self, name, publicId, systemId):
        Node.__init__(self, name)
        self.publicId = publicId
        self.systemId = systemId

    def __unicode__(self):
        """Render the doctype; both identifiers are shown as soon as either
        one is set, the missing one as an empty string."""
        if not (self.publicId or self.systemId):
            return u"<!DOCTYPE %s>" % self.name
        public_id = self.publicId or ""
        system_id = self.systemId or ""
        return '<!DOCTYPE %s "%s" "%s">' % (self.name, public_id, system_id)

    # The XML form is the same text as the unicode form.
    toxml = __unicode__

    def hilite(self):
        """Return HTML markup showing only the doctype name."""
        template = '<code class="markup doctype">&lt;!DOCTYPE %s></code>'
        return template % self.name

    def cloneNode(self):
        return DocumentType(self.name, self.publicId, self.systemId)
 class TextNode(Node):
    """Nameless leaf node whose text is stored in ``value``."""
    type = 4

    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        return u'"%s"' % self.value

    def toxml(self):
        """Return the text with XML special characters escaped."""
        return escape(self.value)

    # Highlighted output is just the escaped text.
    hilite = toxml

    def cloneNode(self):
        return TextNode(self.value)
 class Element(Node):
    """Element node with an optional namespace and an attribute dict."""
    type = 5

    def __init__(self, name, namespace=None):
        Node.__init__(self, name)
        self.namespace = namespace
        self.attributes = {}

    def __unicode__(self):
        if self.namespace is None:
            return u"<%s>" % self.name
        else:
            return u"<%s %s>"%(prefixes[self.namespace], self.name)

    def toxml(self):
        """Serialize the element and its subtree; childless elements are
        written in self-closing form."""
        pieces = ['<' + self.name]
        if self.attributes:
            for name,value in self.attributes.iteritems():
                pieces.append(u' %s="%s"' % (name, escape(value,{'"':'&quot;'})))
        if self.childNodes:
            pieces.append('>')
            pieces.extend(child.toxml() for child in self.childNodes)
            pieces.append(u'</%s>' % self.name)
        else:
            pieces.append(u'/>')
        return ''.join(pieces)

    def hilite(self):
        """Return HTML markup that displays this element with syntax
        classes on names and attribute values."""
        result = '&lt;<code class="markup element-name">%s</code>' % self.name
        if self.attributes:
            for name, value in self.attributes.iteritems():
                result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
        # Always close the start tag.  Previously a childless non-void
        # element skipped this and rendered as "<div</div>".
        result += ">"
        if self.childNodes:
            for child in self.childNodes:
                result += child.hilite()
        elif self.name in voidElements:
            # Void elements have no end tag.
            return result
        return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name

    def printTree(self, indent):
        """Return an indented text dump of the element, its attributes and
        its subtree."""
        tree = '\n|%s%s' % (' '*indent, unicode(self))
        indent += 2
        if self.attributes:
            for name, value in self.attributes.iteritems():
                # Tuple attribute names are printed as their two parts
                # separated by a space.
                if isinstance(name, tuple):
                    name = "%s %s"%(name[0], name[1])
                tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def cloneNode(self):
        """Return a childless copy with the same name, namespace and
        attributes."""
        # __init__ always sets ``namespace``, so no hasattr() guard is
        # needed.
        newNode = Element(self.name, self.namespace)
        newNode.attributes.update(self.attributes)
        return newNode
 class CommentNode(Node):
    """Nameless node holding comment text in ``data``."""
    type = 6

    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data

    def __unicode__(self):
        # Tree-dump form: the text is padded with one space on each side.
        return "<!-- %s -->" % self.data

    def toxml(self):
        """Return the comment as markup; the text is inserted unescaped."""
        return "<!--%s-->" % self.data

    def hilite(self):
        """Return HTML markup displaying the comment, with its text
        escaped."""
        template = '<code class="markup comment">&lt;!--%s--></code>'
        return template % escape(self.data)

    def cloneNode(self):
        return CommentNode(self.data)
 class TreeBuilder(_base.TreeBuilder):
    """Tree builder wired to the node classes defined in this module."""
    # Node classes this builder is configured with.
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode
    fragmentClass = DocumentFragment

    def testSerializer(self, node):
        """Return the text dump produced by ``node.printTree()``."""
        return node.printTree()
--- a/src/html5lib/treebuilders/soup.py
+++ b/src/html5lib/treebuilders/soup.py
@ -1,228 +0,0 @@
 import warnings
 warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
 from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
 import _base
 from html5lib.constants import namespaces, DataLossWarning
 class AttrList(object):
    """Dict-style view over the attributes of a wrapped element.

    ``attrs`` is a dict snapshot of ``element.attrs`` taken at construction.
    Writes go to the element and are mirrored into the snapshot so that
    later reads through this object see them.
    """

    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs."""
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        # A stray no-op expression (leftover debug output) was removed here.
        self.element[name] = value
        # Keep the snapshot in step with the element.
        self.attrs[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
 class Element(_base.Node):
    """Tree-builder node wrapping a BeautifulSoup object.

    ``element`` is the wrapped object, ``soup`` the owning soup and
    ``namespace`` the namespace recorded for this node.
    """

    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        """Return the position of ``refNode.element`` in this element's
        contents, or None when it is not there.  *node* is unused."""
        # Finds a node by identity rather than equality
        for index, child in enumerate(self.element.contents):
            if child is refNode.element:
                return index
        return None

    def appendChild(self, node):
        """Append *node*, merging it into a trailing text node if both are
        NavigableStrings."""
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] =  value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert *data* as text, at the end or before *insertBefore*."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert *node* before *refNode*.

        Text is merged into the node just before the insertion point when
        both are NavigableStrings.  Raises ValueError when *refNode* is not
        a child of this element.
        """
        index = self._nodeIndex(node, refNode)
        if index is None:
            raise ValueError("refNode is not a child of this node")
        # ``index > 0`` matters: at index 0 there is no previous sibling, and
        # contents[index-1] would wrap around to the *last* child.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach *node* from its parent's contents and clear its parent
        links."""
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this element, in order, under *newParent*."""
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Return a childless copy with the same name, namespace and
        attributes."""
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, defaulting to the HTML namespace."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
 class TextNode(Element):
    """Element subclass wrapping a text-like object; it has no name and
    does not set ``namespace``."""

    def __init__(self, element, soup):
        # Deliberately bypasses Element.__init__: the base node is created
        # with a name of None.
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        """Cloning text nodes is not supported."""
        raise NotImplementedError
 class TreeBuilder(_base.TreeBuilder):
    """Tree builder that assembles a BeautifulSoup document."""

    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        """Start a fresh, empty soup and return it wrapped as the document
        node."""
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Put a Declaration built from the doctype *token* at the front of
        the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            declaration = '%s PUBLIC "%s" "%s"' % (name, publicId, systemId or "")
        elif systemId:
            declaration = '%s SYSTEM "%s"' % (name, systemId)
        else:
            declaration = name
        self.soup.insert(0, Declaration(declaration))

    def elementClass(self, name, namespace):
        """Create a wrapped Tag called *name*; warns if *namespace* is not
        None."""
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        tag = Tag(self.soup, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Start a fresh soup named "[document_fragment]" and return it
        wrapped."""
        fragment_soup = BeautifulSoup("")
        fragment_soup.name = "[document_fragment]"
        self.soup = fragment_soup
        return Element(fragment_soup, fragment_soup, None)

    def appendChild(self, node):
        """Append the wrapped element of *node* to the end of the soup."""
        end = len(self.soup.contents)
        self.soup.insert(end, node.element)

    def testSerializer(self, element):
        # Delegates to the module-level testSerializer function.
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element
 def testSerializer(element):
    """Return a text dump of *element* and its subtree.

    Each node contributes one "|"-prefixed line (the document or fragment
    root line has no prefix), and children are indented two spaces more
    than their parent.
    """
    import re
    doctype_regexp = re.compile(
        r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?')
    rv = []

    def serializeElement(element, indent=0):
        pad = ' ' * indent
        if isinstance(element, Declaration):
            m = doctype_regexp.match(element.string)
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            # The system id sits in a different group depending on whether
            # the PUBLIC or the SYSTEM form matched.
            if publicId is None:
                systemId = m.group('systemId2')
            else:
                systemId = m.group('systemId1') or ""

            if publicId is None and systemId is None:
                rv.append("|%s<!DOCTYPE %s>" % (pad, name))
            else:
                rv.append('|%s<!DOCTYPE %s "%s" "%s">' %
                          (pad, name, publicId or "", systemId or ""))
        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")
            else:
                rv.append("#document")
        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->" % (pad, element.string))
        elif isinstance(element, unicode):
            rv.append('|%s"%s"' % (pad, element))
        else:
            rv.append("|%s<%s>" % (pad, element.name))
            if element.attrs:
                attr_pad = ' ' * (indent + 2)
                for name, value in element.attrs:
                    rv.append('|%s%s="%s"' % (attr_pad, name, value))

        if hasattr(element, "contents"):
            child_indent = indent + 2
            for child in element.contents:
                serializeElement(child, child_indent)

    serializeElement(element, 0)
    return "\n".join(rv)
--- a/src/html5lib/treewalkers/init.py
+++ b/src/html5lib/treewalkers/init.py
@ -8,23 +8,27 @@ implements a 'serialize' method taking a tree as sole argument and
 returning an iterator generating tokens.
 """
 from __future__ import absolute_import, division, unicode_literals
 import sys
 from ..utils import default_etree
 treeWalkerCache = {}
 def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support
    treeType - the name of the tree type required (case-insensitive). Supported
-               values are "simpletree", "dom", "etree" and "beautifulsoup"
+               values are:
               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
                "dom" - The xml.dom.minidom DOM implementation
                "pulldom" - The xml.dom.pulldom event stream
                "etree" - A generic walker for tree implementations exposing an
                          elementtree-like interface (known to work with
                          ElementTree, cElementTree and lxml.etree).
                "lxml" - Optimized walker for lxml.etree
                "beautifulsoup" - Beautiful soup (if installed)
                "genshi" - a Genshi stream
    implementation - (Currently applies to the "etree" tree type only). A module
@ -33,20 +37,21 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
-        if treeType in ("dom", "pulldom", "simpletree"):
+        if treeType in ("dom", "pulldom"):
-            mod = __import__(treeType, globals())
+            name = "%s.%s" % (__name__, treeType)
            __import__(name)
            mod = sys.modules[name]
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
-            import genshistream
+            from . import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "lxml":
-            import lxmletree
+            from . import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
-            import etree
+            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
--- a/src/html5lib/treewalkers/_base.py
+++ b/src/html5lib/treewalkers/_base.py
@ -1,8 +1,40 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type, string_types
 import gettext
 _ = gettext.gettext
-from html5lib.constants import voidElements, spaceCharacters
+from xml.dom import Node
-spaceCharacters = u"".join(spaceCharacters)
+
 DOCUMENT = Node.DOCUMENT_NODE
 DOCTYPE = Node.DOCUMENT_TYPE_NODE
 TEXT = Node.TEXT_NODE
 ELEMENT = Node.ELEMENT_NODE
 COMMENT = Node.COMMENT_NODE
 ENTITY = Node.ENTITY_NODE
 UNKNOWN = "<#UNKNOWN#>"
 from ..constants import voidElements, spaceCharacters
 spaceCharacters = "".join(spaceCharacters)
 def to_text(s, blank_if_none=True):
    """Coerce *s* to ``text_type``.

    ``None`` becomes the empty string, or stays ``None`` when
    *blank_if_none* is false.  Values that are already ``text_type`` are
    returned unchanged; anything else is passed through ``text_type()``.
    """
    if s is None:
        return "" if blank_if_none else None
    if isinstance(s, text_type):
        return s
    return text_type(s)
 def is_text_or_none(string):
    """Return True when *string* is None or an instance of
    ``string_types``."""
    if string is None:
        return True
    return isinstance(string, string_types)
 class TreeWalker(object):
    def __init__(self, tree):
@ -14,34 +46,48 @@ class TreeWalker(object):
    def error(self, msg):
        return {"type": "SerializeError", "data": msg}
    def normalizeAttrs(self, attrs):
        if not attrs:
            attrs = []
        elif hasattr(attrs, 'items'):
            attrs = attrs.items()
        return [(unicode(name),unicode(value)) for name,value in attrs]
    def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        yield {"type": "EmptyTag", "name": unicode(name), 
+        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-               "namespace":unicode(namespace),
+        assert isinstance(name, string_types), type(name)
-               "data": self.normalizeAttrs(attrs)}
+        assert all((namespace is None or isinstance(namespace, string_types)) and
                   isinstance(name, string_types) and
                   isinstance(value, string_types)
                   for (namespace, name), value in attrs.items())
        yield {"type": "EmptyTag", "name": to_text(name, False),
               "namespace": to_text(namespace),
               "data": attrs}
        if hasChildren:
            yield self.error(_("Void element has children"))
    def startTag(self, namespace, name, attrs):
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(name)
        assert all((namespace is None or isinstance(namespace, string_types)) and
                   isinstance(name, string_types) and
                   isinstance(value, string_types)
                   for (namespace, name), value in attrs.items())
        return {"type": "StartTag",
-                "name": unicode(name),
+                "name": text_type(name),
-                "namespace":unicode(namespace),
+                "namespace": to_text(namespace),
-                "data": self.normalizeAttrs(attrs)}
+                "data": dict(((to_text(namespace, False), to_text(name)),
                              to_text(value, False))
                             for (namespace, name), value in attrs.items())}
    def endTag(self, namespace, name):
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
        assert isinstance(name, string_types), type(namespace)
        return {"type": "EndTag",
-                "name": unicode(name),
+                "name": to_text(name, False),
-                "namespace":unicode(namespace),
+                "namespace": to_text(namespace),
-                "data": []}
+                "data": {}}
    def text(self, data):
-        data = unicode(data)
+        assert isinstance(data, string_types), type(data)
        data = to_text(data)
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
@ -55,41 +101,29 @@ class TreeWalker(object):
            yield {"type": "SpaceCharacters", "data": right}
    def comment(self, data):
-        return {"type": "Comment", "data": unicode(data)}
+        assert isinstance(data, string_types), type(data)
        return {"type": "Comment", "data": text_type(data)}
    def doctype(self, name, publicId=None, systemId=None, correct=True):
        assert is_text_or_none(name), type(name)
        assert is_text_or_none(publicId), type(publicId)
        assert is_text_or_none(systemId), type(systemId)
        return {"type": "Doctype",
-                "name": name is not None and unicode(name) or u"",
+                "name": to_text(name),
-                "publicId": publicId,
+                "publicId": to_text(publicId),
-                "systemId": systemId,
+                "systemId": to_text(systemId),
-                "correct": correct}
+                "correct": to_text(correct)}
    def entity(self, name):
        assert isinstance(name, string_types), type(name)
        return {"type": "Entity", "name": text_type(name)}
    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)
 class RecursiveTreeWalker(TreeWalker):
    """Tree walker in which subclasses supply ``walkChildren`` to descend
    into a node."""

    def walkChildren(self, node):
        """Yield the tokens for the children of *node*; subclasses must
        override."""
        # Was the undefined name ``NodeImplementedError``, which produced a
        # NameError instead of the intended exception.
        raise NotImplementedError

    def element(self, node, namespace, name, attrs, hasChildren):
        """Yield the tokens for one element.

        Names listed in ``voidElements`` produce the ``emptyTag`` tokens;
        any other element produces a start tag, the tokens of its children
        when *hasChildren* is true, and an end tag.
        """
        if name in voidElements:
            for token in self.emptyTag(namespace, name, attrs, hasChildren):
                yield token
        else:
            # startTag/endTag take the namespace as their first argument;
            # it was previously omitted from both calls.
            yield self.startTag(namespace, name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(namespace, name)
 from xml.dom import Node
 DOCUMENT = Node.DOCUMENT_NODE
 DOCTYPE = Node.DOCUMENT_TYPE_NODE
 TEXT = Node.TEXT_NODE
 ELEMENT = Node.ELEMENT_NODE
 COMMENT = Node.COMMENT_NODE
 UNKNOWN = "<#UNKNOWN#>"
 class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
@ -110,7 +144,6 @@ class NonRecursiveTreeWalker(TreeWalker):
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False
            endTag = None
            if type == DOCTYPE:
                yield self.doctype(*details)
@ -127,12 +160,14 @@ class NonRecursiveTreeWalker(TreeWalker):
                        yield token
                    hasChildren = False
                else:
                    endTag = name
                    yield self.startTag(namespace, name, attributes)
            elif type == COMMENT:
                yield self.comment(details[0])
            elif type == ENTITY:
                yield self.entity(details[0])
            elif type == DOCUMENT:
                hasChildren = True
--- a/src/html5lib/treewalkers/dom.py
+++ b/src/html5lib/treewalkers/dom.py
@ -1,10 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 from xml.dom import Node
 import gettext
 _ = gettext.gettext
-import _base
+from . import _base
-from html5lib.constants import voidElements
+
 class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
@ -15,8 +17,15 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
            return _base.TEXT, node.nodeValue
        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                if attr.namespaceURI:
                    attrs[(attr.namespaceURI, attr.localName)] = attr.value
                else:
                    attrs[(None, attr.name)] = attr.value
            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
-                    node.attributes.items(), node.hasChildNodes)
+                    attrs, node.hasChildNodes())
        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue
--- a/src/html5lib/treewalkers/etree.py
+++ b/src/html5lib/treewalkers/etree.py
@ -1,30 +1,28 @@
 from __future__ import absolute_import, division, unicode_literals
 try:
    from collections import OrderedDict
 except ImportError:
    try:
        from ordereddict import OrderedDict
    except ImportError:
        OrderedDict = dict
 import gettext
 _ = gettext.gettext
 import new
 import copy
 import re
-import _base
+from six import text_type
-from html5lib.constants import voidElements
+
 from . import _base
 from ..utils import moduleFactoryFactory
 tag_regexp = re.compile("{([^}]*)}(.*)")
 moduleCache = {}
 def getETreeModule(ElementTreeImplementation):
    """Return the walker module for *ElementTreeImplementation*, building
    it on first use.

    Modules are cached in ``moduleCache`` under a name derived from the
    implementation's ``__name__``; a new module is filled with whatever
    ``getETreeBuilder`` returns for that implementation.
    """
    name = "_" + ElementTreeImplementation.__name__ + "builder"
    try:
        return moduleCache[name]
    except KeyError:
        pass
    mod = new.module(name)
    mod.__dict__.update(getETreeBuilder(ElementTreeImplementation))
    moduleCache[name] = mod
    return mod
 def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation
    ElementTreeCommentType = ElementTree.Comment("asd").tag
    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
@ -51,17 +49,18 @@ def getETreeBuilder(ElementTreeImplementation):
            if not(hasattr(node, "tag")):
                node = node.getroot()
-            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
+            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
                return (_base.DOCUMENT,)
            elif node.tag == "<!DOCTYPE>":
                return (_base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))
-            elif type(node.tag) == type(ElementTree.Comment):
+            elif node.tag == ElementTreeCommentType:
                return _base.COMMENT, node.text
            else:
                assert type(node.tag) == text_type, type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
@ -69,8 +68,15 @@ def getETreeBuilder(ElementTreeImplementation):
                else:
                    namespace = None
                    tag = node.tag
                attrs = OrderedDict()
                for name, value in list(node.attrib.items()):
                    match = tag_regexp.match(name)
                    if match:
                        attrs[(match.group(1), match.group(2))] = value
                    else:
                        attrs[(None, name)] = value
                return (_base.ELEMENT, namespace, tag,
-                        node.attrib.items(), len(node) or node.text)
+                        attrs, len(node) or node.text)
        def getFirstChild(self, node):
            if isinstance(node, tuple):
@ -128,3 +134,5 @@ def getETreeBuilder(ElementTreeImplementation):
                    return parent, list(parents[-1]).index(parent), parents, None
    return locals()
 getETreeModule = moduleFactoryFactory(getETreeBuilder)
--- a/src/html5lib/treewalkers/genshistream.py
+++ b/src/html5lib/treewalkers/genshistream.py
@ -1,50 +1,49 @@
 from __future__ import absolute_import, division, unicode_literals
 from genshi.core import QName
 from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
 from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
 from genshi.output import NamespaceFlattener
-import _base
+from . import _base
 from ..constants import voidElements, namespaces
 from html5lib.constants import voidElements
 class TreeWalker(_base.TreeWalker):
    def __iter__(self):
-        depth = 0
+        # Buffer the events so we can pass in the following one
        ignore_until = None
        previous = None
        for event in self.tree:
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                for token in self.tokens(previous, event):
                    yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        # Don't forget the final event!
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
            for token in self.tokens(previous, None):
                yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
    def tokens(self, event, next):
        kind, data, pos = event
        if kind == START:
-            tag, attrib = data
+            tag, attribs = data
            name = tag.localname
            namespace = tag.namespace
-            if tag in voidElements:
+            converted_attribs = {}
-                for token in self.emptyTag(namespace, name, list(attrib),
+            for k, v in attribs:
                if isinstance(k, QName):
                    converted_attribs[(k.namespace, k.localname)] = v
                else:
                    converted_attribs[(None, k)] = v
            if namespace == namespaces["html"] and name in voidElements:
                for token in self.emptyTag(namespace, name, converted_attribs,
                                           not next or next[0] != END
                                           or next[1] != tag):
                    yield token
            else:
-                yield self.startTag(namespace, name, list(attrib))
+                yield self.startTag(namespace, name, converted_attribs)
        elif kind == END:
            name = data.localname
@ -62,7 +61,7 @@ class TreeWalker(_base.TreeWalker):
        elif kind == DOCTYPE:
            yield self.doctype(*data)
-        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
+        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
                      START_CDATA, END_CDATA, PI):
            pass
--- a/src/html5lib/treewalkers/lxmletree.py
+++ b/src/html5lib/treewalkers/lxmletree.py
@ -1,22 +1,35 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 from lxml import etree
-from html5lib.treebuilders.etree import tag_regexp
+from ..treebuilders.etree import tag_regexp
 from gettext import gettext
 _ = gettext
-import _base
+from . import _base
 from .. import ihatexml
 def ensure_str(s):
    """Return *s* as text.
    ``None`` and values that are already ``text_type`` are returned
    unchanged; anything else is decoded as strict UTF-8.
    """
    if s is None or isinstance(s, text_type):
        return s
    return s.decode("utf-8", "strict")
 from html5lib.constants import voidElements
 from html5lib import ihatexml
 class Root(object):
    def __init__(self, et):
        self.elementtree = et
        self.children = []
        if et.docinfo.internalDTD:
-            self.children.append(Doctype(self, et.docinfo.root_name, 
+            self.children.append(Doctype(self,
-                                         et.docinfo.public_id, 
+                                         ensure_str(et.docinfo.root_name),
-                                         et.docinfo.system_url))
+                                         ensure_str(et.docinfo.public_id),
                                         ensure_str(et.docinfo.system_url)))
        root = et.getroot()
        node = root
@ -38,6 +51,7 @@ class Root(object):
    def __len__(self):
        return 1
 class Doctype(object):
    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
@ -51,6 +65,7 @@ class Doctype(object):
    def getnext(self):
        return self.root_node.children[1]
 class FragmentRoot(Root):
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
@ -59,19 +74,23 @@ class FragmentRoot(Root):
    def getnext(self):
        return None
 class FragmentWrapper(object):
    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
-            self.text = self.obj.text
+            self.text = ensure_str(self.obj.text)
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
-            self.tail = self.obj.tail
+            self.tail = ensure_str(self.obj.tail)
        else:
            self.tail = None
-        self.isstring = isinstance(obj, basestring)
+        self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
        # Support for bytes here is Py2
        if self.isstring:
            self.obj = ensure_str(self.obj)
    def __getattr__(self, name):
        return getattr(self.obj, name)
@ -87,7 +106,7 @@ class FragmentWrapper(object):
    def __getitem__(self, key):
        return self.obj[key]
-    def __nonzero__(self):
+    def __bool__(self):
        return bool(self.obj)
    def getparent(self):
@ -96,6 +115,9 @@ class FragmentWrapper(object):
    def __str__(self):
        return str(self.obj)
    def __unicode__(self):
        return str(self.obj)
    def __len__(self):
        return len(self.obj)
@ -108,11 +130,12 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = ihatexml.InfosetFilter()
    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
-            return _base.TEXT, getattr(node, key)
+            return _base.TEXT, ensure_str(getattr(node, key))
        elif isinstance(node, Root):
            return (_base.DOCUMENT,)
@ -121,23 +144,33 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id
        elif isinstance(node, FragmentWrapper) and node.isstring:
-            return _base.TEXT, node
+            return _base.TEXT, node.obj
        elif node.tag == etree.Comment:
-            return _base.COMMENT, node.text
+            return _base.COMMENT, ensure_str(node.text)
        elif node.tag == etree.Entity:
            return _base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;
        else:
            # This is assumed to be an ordinary element
-            match = tag_regexp.match(node.tag)
+            match = tag_regexp.match(ensure_str(node.tag))
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
-                tag = node.tag
+                tag = ensure_str(node.tag)
            attrs = {}
            for name, value in list(node.attrib.items()):
                name = ensure_str(name)
                value = ensure_str(value)
                match = tag_regexp.match(name)
                if match:
                    attrs[(match.group(1), match.group(2))] = value
                else:
                    attrs[(None, name)] = value
            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
-                    [(self.filter.fromXmlName(name), value) for 
+                    attrs, len(node) > 0 or node.text)
                     name,value in node.attrib.iteritems()], 
                     len(node) > 0 or node.text)
    def getFirstChild(self, node):
        assert not isinstance(node, tuple), _("Text nodes have no children")
@ -162,7 +195,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
            else:  # tail
                return node.getnext()
-        return node.tail and (node, "tail") or node.getnext()
+        return (node, "tail") if node.tail else node.getnext()
    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
--- a/src/html5lib/treewalkers/pulldom.py
+++ b/src/html5lib/treewalkers/pulldom.py
@ -1,9 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
    COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
-import _base
+from . import _base
 from ..constants import voidElements
 from html5lib.constants import voidElements
 class TreeWalker(_base.TreeWalker):
    def __iter__(self):
@ -30,14 +33,18 @@ class TreeWalker(_base.TreeWalker):
        if type == START_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            attrs = {}
            for attr in list(node.attributes.keys()):
                attr = node.getAttributeNode(attr)
                attrs[(attr.namespaceURI, attr.localName)] = attr.value
            if name in voidElements:
                for token in self.emptyTag(namespace,
                                           name,
-                                           node.attributes.items(), 
+                                           attrs,
                                           not next or next[1] is not node):
                    yield token
            else:
-                yield self.startTag(namespace, name, node.attributes.items())
+                yield self.startTag(namespace, name, attrs)
        elif type == END_ELEMENT:
            name = node.nodeName
--- a/src/html5lib/treewalkers/simpletree.py
+++ b/src/html5lib/treewalkers/simpletree.py
@ -1,72 +0,0 @@
 import gettext
 _ = gettext.gettext
 import _base
 class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:
    1. The parent Node (Element, Document or DocumentFragment)
    2. The child index of the current node in its parent's children list
    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """
    def getNodeDetails(self, node):
        """Return the ``_base`` detail tuple for *node*.
        *node* is either a real node (the root) or a
        ``(parent, idx, parents)`` tuple, which is resolved to
        ``parent.childNodes[idx]`` first. The node kind is read from the
        numeric ``node.type``.
        """
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]
        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)
        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId
        elif node.type == 4: # TextNode
            return _base.TEXT, node.value
        elif node.type == 5: # Element
            return (_base.ELEMENT, node.namespace, node.name, 
                    node.attributes.items(), node.hasContent())
        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data
        else:
            # This used to read ``_node.UNKNOWN``; ``_node`` is not defined
            # in this module, so reaching this branch raised NameError
            # instead of reporting the unknown node type.
            return _base.UNKNOWN, node.type
    def getFirstChild(self, node):
        """Return the tuple for the first child of *node*.
        When *node* is itself a tuple, its ``(parent, idx)`` pair is pushed
        onto the shared ancestor stack before descending; for the root a
        fresh stack is started.
        """
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []
        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)
    def getNextSibling(self, node):
        """Return the tuple for the sibling after *node*, or None if it is the last child."""
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None
    def getParentNode(self, node):
        """Return the tuple for the parent of *node*, popping it off the
        ancestor stack, or None when the stack is empty."""
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None
--- a/src/html5lib/treewalkers/soup.py
+++ b/src/html5lib/treewalkers/soup.py
@ -1,59 +0,0 @@
 import re
 import gettext
 _ = gettext.gettext
 from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
 from html5lib.constants import namespaces
 import _base
 class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker that reads node details and navigation links from
    BeautifulSoup objects."""
    # Splits a doctype declaration into its name and optional
    # PUBLIC/SYSTEM identifiers.
    doctype_regexp = re.compile(
        r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
    def getNodeDetails(self, node):
        """Return the ``_base`` detail tuple for *node*.
        The isinstance checks run in a fixed order (document, declaration,
        comment, text, tag); anything else is reported as UNKNOWN together
        with its class name.
        """
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)
        if isinstance(node, Declaration): # DocumentType
            text = unicode(node.string)
            # Some BeautifulSoup/Python versions add the surrounding markup
            # during unicode conversion; strip it when present.
            if text.startswith('<!') and text.endswith('>'):
                text = text[2:-1]
            # BeautifulSoup keeps the doctype as one string, so the separate
            # parts are recovered with a regexp. This is fragile: it should
            # hold for trees created by html5lib itself but may be wrong if
            # the tree has been modified.
            parsed = self.doctype_regexp.match(text)
            assert parsed is not None, "DOCTYPE did not match expected format"
            public_id = parsed.group('publicId')
            # The system id lives in a different group depending on whether
            # a PUBLIC identifier was matched.
            if public_id is None:
                system_id = parsed.group('systemId2')
            else:
                system_id = parsed.group('systemId1')
            return _base.DOCTYPE, parsed.group('name'), public_id or "", system_id or ""
        if isinstance(node, Comment):
            text = unicode(node.string)
            if text.startswith('<!--') and text.endswith('-->'):
                text = text[4:-3]
            return _base.COMMENT, text
        if isinstance(node, unicode): # TextNode
            return _base.TEXT, node
        if isinstance(node, Tag): # Element
            attributes = dict(node.attrs).items()
            return (_base.ELEMENT, namespaces["html"], node.name,
                    attributes, node.contents)
        return _base.UNKNOWN, node.__class__.__name__
    def getFirstChild(self, node):
        """Return the first entry of ``node.contents``."""
        children = node.contents
        return children[0]
    def getNextSibling(self, node):
        """Return ``node.nextSibling``."""
        sibling = node.nextSibling
        return sibling
    def getParentNode(self, node):
        """Return ``node.parent``."""
        owner = node.parent
        return owner
--- a/src/html5lib/trie/init.py
+++ b/src/html5lib/trie/init.py
@ -0,0 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 # Default to the Trie implementation from the sibling ``py`` module.
 from .py import Trie as PyTrie
 Trie = PyTrie
 # Switch the exported ``Trie`` to the one from the sibling ``datrie`` module
 # when that module imports cleanly; on ImportError the default above stays.
 try:
    from .datrie import Trie as DATrie
 except ImportError:
    pass
 else:
    Trie = DATrie
--- a/src/html5lib/trie/_base.py
+++ b/src/html5lib/trie/_base.py
@ -0,0 +1,37 @@
 from __future__ import absolute_import, division, unicode_literals
 from collections import Mapping
 class Trie(Mapping):
    """Abstract base class for tries"""
    def keys(self, prefix=None):
        """Return the set of keys, limited to those starting with *prefix*
        when one is given."""
        # Two-argument super(): the zero-argument form is Python 3 only and
        # raises TypeError on Python 2, which this module also supports.
        keys = super(Trie, self).keys()
        if prefix is None:
            return set(keys)
        # Python 2.6: no set comprehensions
        return set([x for x in keys if x.startswith(prefix)])
    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        for key in self.keys():
            if key.startswith(prefix):
                return True
        return False
    def longest_prefix(self, prefix):
        """Return the longest key that is a leading part of *prefix*.
        Raises KeyError(prefix) when neither *prefix* nor any shorter
        leading slice of it (down to the empty string) is a key.
        """
        if prefix in self:
            return prefix
        # Drop one more trailing character per step; the final slice is "".
        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]
        raise KeyError(prefix)
    def longest_prefix_item(self, prefix):
        """Return ``(key, value)`` for the key found by ``longest_prefix``."""
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
--- a/src/html5lib/trie/datrie.py
+++ b/src/html5lib/trie/datrie.py
@ -0,0 +1,44 @@
 from __future__ import absolute_import, division, unicode_literals
 from datrie import Trie as DATrie
 from six import text_type
 from ._base import Trie as ABCTrie
 class Trie(ABCTrie):
    """Trie whose entries are held in a ``datrie`` trie; lookups and prefix
    queries are forwarded to it."""
    def __init__(self, data):
        """Copy the items of the mapping *data* into a new datrie trie.
        Raises TypeError when a key is not ``text_type``.
        """
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            # Collect every character used by any key; the joined set is
            # what the datrie constructor is given.
            alphabet.update(key)
        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value
    def __contains__(self, key):
        """Membership test, forwarded to the underlying trie."""
        return key in self._data
    def __len__(self):
        """Number of entries in the underlying trie."""
        return len(self._data)
    def __iter__(self):
        """Iteration is not implemented for this trie."""
        raise NotImplementedError()
    def __getitem__(self, key):
        """Item lookup, forwarded to the underlying trie."""
        return self._data[key]
    def keys(self, prefix=None):
        """Forward ``keys(prefix)`` to the underlying trie."""
        return self._data.keys(prefix)
    def has_keys_with_prefix(self, prefix):
        """Forward ``has_keys_with_prefix(prefix)`` to the underlying trie."""
        return self._data.has_keys_with_prefix(prefix)
    def longest_prefix(self, prefix):
        """Forward ``longest_prefix(prefix)`` to the underlying trie."""
        return self._data.longest_prefix(prefix)
    def longest_prefix_item(self, prefix):
        """Forward ``longest_prefix_item(prefix)`` to the underlying trie."""
        return self._data.longest_prefix_item(prefix)
--- a/src/html5lib/trie/py.py
+++ b/src/html5lib/trie/py.py
@ -0,0 +1,67 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 from bisect import bisect_left
 from ._base import Trie as ABCTrie
 class Trie(ABCTrie):
    """Trie backed by the original mapping plus a sorted list of its keys;
    prefix queries locate their starting point with ``bisect_left``."""
    def __init__(self, data):
        """Wrap the mapping *data*.
        Raises TypeError when a key is not ``text_type``.
        """
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")
        self._data = data
        self._keys = sorted(data.keys())
        # Prefix of the last successful keys() query and the index range of
        # its matches in self._keys; used to narrow later searches.
        self._cachestr = ""
        self._cachepoints = (0, len(data))
    def __contains__(self, key):
        """Membership test against the wrapped mapping."""
        return key in self._data
    def __len__(self):
        """Number of entries in the wrapped mapping."""
        return len(self._data)
    def __iter__(self):
        """Iterate over the wrapped mapping's keys."""
        return iter(self._data)
    def __getitem__(self, key):
        """Item lookup in the wrapped mapping."""
        return self._data[key]
    def _first_candidate(self, prefix):
        """Return the index in the sorted key list where keys starting with
        *prefix* would begin, searching only the cached range when *prefix*
        extends the cached prefix."""
        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            return bisect_left(self._keys, prefix, lo, hi)
        return bisect_left(self._keys, prefix)
    def keys(self, prefix=None):
        """Return the set of keys, limited to those starting with *prefix*
        when a non-empty one is given."""
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)
        start = i = self._first_candidate(prefix)
        keys = set()
        if start == len(self._keys):
            return keys
        # Bound the scan by the list length: when the matching keys run up
        # to the last entry, indexing one past it raised IndexError.
        while i < len(self._keys) and self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1
        self._cachestr = prefix
        self._cachepoints = (start, i)
        return keys
    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        if prefix in self._data:
            return True
        i = self._first_candidate(prefix)
        if i == len(self._keys):
            return False
        return self._keys[i].startswith(prefix)
--- a/src/html5lib/utils.py
+++ b/src/html5lib/utils.py
@ -1,9 +1,16 @@
 from __future__ import absolute_import, division, unicode_literals
 from types import ModuleType
 try:
-    frozenset
+    import xml.etree.cElementTree as default_etree
-except NameError:
+except ImportError:
-    #Import from the sets module for python 2.3
+    import xml.etree.ElementTree as default_etree
-    from sets import Set as set
+
-    from sets import ImmutableSet as frozenset
+
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
           "surrogatePairToCodepoint", "moduleFactoryFactory"]
 class MethodDispatcher(dict):
    """Dict with 2 special properties:
@ -35,122 +42,41 @@ class MethodDispatcher(dict):
    def __getitem__(self, key):
        return dict.get(self, key, self.default)
 #Pure python implementation of deque taken from the ASPN Python Cookbook
 #Original code by Raymond Hettinger
-class deque(object):
+# Some utility functions to deal with weirdness around UCS2 vs UCS4
 # python builds
-    def __init__(self, iterable=(), maxsize=-1):
+def isSurrogatePair(data):
-        if not hasattr(self, 'data'):
+    return (len(data) == 2 and
-            self.left = self.right = 0
+            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
-            self.data = {}
+            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
        self.maxsize = maxsize
        self.extend(iterable)
    def append(self, x):
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()
-    def appendleft(self, x):
+def surrogatePairToCodepoint(data):
-        self.left -= 1        
+    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
-        self.data[self.left] = x
+                (ord(data[1]) - 0xDC00))
-        if self.maxsize != -1 and len(self) > self.maxsize:
+    return char_val
            self.pop()      
-    def pop(self):
+# Module Factory Factory (no, this isn't Java, I know)
-        if self.left == self.right:
+# Here to stop this being duplicated all over the place.
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]         
        return elem
    def popleft(self):
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem
-    def clear(self):
+def moduleFactoryFactory(factory):
-        self.data.clear()
+    moduleCache = {}
        self.left = self.right = 0
-    def extend(self, iterable):
+    def moduleFactory(baseModule, *args, **kwargs):
-        for elem in iterable:
+        if isinstance(ModuleType.__name__, type("")):
-            self.append(elem)
+            name = "_%s_factory" % baseModule.__name__
        else:
            name = b"_%s_factory" % baseModule.__name__
-    def extendleft(self, iterable):
+        if name in moduleCache:
-        for elem in iterable:
+            return moduleCache[name]
-            self.appendleft(elem)
+        else:
            mod = ModuleType(name)
            objs = factory(baseModule, *args, **kwargs)
            mod.__dict__.update(objs)
            moduleCache[name] = mod
            return mod
-    def rotate(self, n=1):
+    return moduleFactory
        if self:
            n %= len(self)
            for i in xrange(n):
                self.appendleft(self.pop())
    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            raise IndexError
    def __setitem__(self, i, value):
        if i < 0:
            i += len(self)        
        try:
            self.data[i + self.left] = value
        except KeyError:
            raise IndexError
    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        for j in xrange(self.left+i, self.right-1):
            data[j] = data[j+1]
        self.pop()
    def __len__(self):
        return self.right - self.left
    def __cmp__(self, other):
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))
    def __repr__(self, _track=[]):
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        r = 'deque(%r)' % (list(self),)
        _track.remove(id(self))
        return r
    def __getstate__(self):
        return (tuple(self),)
    def __setstate__(self, s):
        self.__init__(s[0])
    def __hash__(self):
        raise TypeError
    def __copy__(self):
        return self.__class__(self)
    def __deepcopy__(self, memo={}):
        from copy import deepcopy
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result