Merge in changes from upstream html5lib

2025-07-09 03:04:10 -04:00 · 2015-11-03 10:41:47 +05:30 · 2015-11-03 10:41:47 +05:30 · 892571a180
commit 892571a180
parent 700c77ddcd
11 changed files with 327 additions and 213 deletions
--- a/src/html5lib/init.py
+++ b/src/html5lib/init.py
@ -20,4 +20,4 @@ from .serializer import serialize

 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]
-__version__ = "0.999-dev"
+__version__ = "0.999999-dev"
--- a/src/html5lib/constants.py
+++ b/src/html5lib/constants.py
@ -1,292 +1,290 @@
 from __future__ import absolute_import, division, unicode_literals

 import string
-import gettext
-_ = gettext.gettext

 EOF = None

 E = {
    "null-character":
-        _("Null character in input stream, replaced with U+FFFD."),
+        "Null character in input stream, replaced with U+FFFD.",
    "invalid-codepoint":
-        _("Invalid codepoint in stream."),
+        "Invalid codepoint in stream.",
    "incorrectly-placed-solidus":
-        _("Solidus (/) incorrectly placed in tag."),
+        "Solidus (/) incorrectly placed in tag.",
    "incorrect-cr-newline-entity":
-        _("Incorrect CR newline entity, replaced with LF."),
+        "Incorrect CR newline entity, replaced with LF.",
    "illegal-windows-1252-entity":
-        _("Entity used with illegal number (windows-1252 reference)."),
+        "Entity used with illegal number (windows-1252 reference).",
    "cant-convert-numeric-entity":
-        _("Numeric entity couldn't be converted to character "
+        ("Numeric entity couldn't be converted to character "
          "(codepoint U+%(charAsInt)08x)."),
    "illegal-codepoint-for-numeric-entity":
-        _("Numeric entity represents an illegal codepoint: "
+        ("Numeric entity represents an illegal codepoint: "
          "U+%(charAsInt)08x."),
    "numeric-entity-without-semicolon":
-        _("Numeric entity didn't end with ';'."),
+        "Numeric entity didn't end with ';'.",
    "expected-numeric-entity-but-got-eof":
-        _("Numeric entity expected. Got end of file instead."),
+        "Numeric entity expected. Got end of file instead.",
    "expected-numeric-entity":
-        _("Numeric entity expected but none found."),
+        "Numeric entity expected but none found.",
    "named-entity-without-semicolon":
-        _("Named entity didn't end with ';'."),
+        "Named entity didn't end with ';'.",
    "expected-named-entity":
-        _("Named entity expected. Got none."),
+        "Named entity expected. Got none.",
    "attributes-in-end-tag":
-        _("End tag contains unexpected attributes."),
+        "End tag contains unexpected attributes.",
    'self-closing-flag-on-end-tag':
-        _("End tag contains unexpected self-closing flag."),
+        "End tag contains unexpected self-closing flag.",
    "expected-tag-name-but-got-right-bracket":
-        _("Expected tag name. Got '>' instead."),
+        "Expected tag name. Got '>' instead.",
    "expected-tag-name-but-got-question-mark":
-        _("Expected tag name. Got '?' instead. (HTML doesn't "
+        ("Expected tag name. Got '?' instead. (HTML doesn't "
          "support processing instructions.)"),
    "expected-tag-name":
-        _("Expected tag name. Got something else instead"),
+        "Expected tag name. Got something else instead",
    "expected-closing-tag-but-got-right-bracket":
-        _("Expected closing tag. Got '>' instead. Ignoring '</>'."),
+        "Expected closing tag. Got '>' instead. Ignoring '</>'.",
    "expected-closing-tag-but-got-eof":
-        _("Expected closing tag. Unexpected end of file."),
+        "Expected closing tag. Unexpected end of file.",
    "expected-closing-tag-but-got-char":
-        _("Expected closing tag. Unexpected character '%(data)s' found."),
+        "Expected closing tag. Unexpected character '%(data)s' found.",
    "eof-in-tag-name":
-        _("Unexpected end of file in the tag name."),
+        "Unexpected end of file in the tag name.",
    "expected-attribute-name-but-got-eof":
-        _("Unexpected end of file. Expected attribute name instead."),
+        "Unexpected end of file. Expected attribute name instead.",
    "eof-in-attribute-name":
-        _("Unexpected end of file in attribute name."),
+        "Unexpected end of file in attribute name.",
    "invalid-character-in-attribute-name":
-        _("Invalid character in attribute name"),
+        "Invalid character in attribute name",
    "duplicate-attribute":
-        _("Dropped duplicate attribute on tag."),
+        "Dropped duplicate attribute on tag.",
    "expected-end-of-tag-name-but-got-eof":
-        _("Unexpected end of file. Expected = or end of tag."),
+        "Unexpected end of file. Expected = or end of tag.",
    "expected-attribute-value-but-got-eof":
-        _("Unexpected end of file. Expected attribute value."),
+        "Unexpected end of file. Expected attribute value.",
    "expected-attribute-value-but-got-right-bracket":
-        _("Expected attribute value. Got '>' instead."),
+        "Expected attribute value. Got '>' instead.",
    'equals-in-unquoted-attribute-value':
-        _("Unexpected = in unquoted attribute"),
+        "Unexpected = in unquoted attribute",
    'unexpected-character-in-unquoted-attribute-value':
-        _("Unexpected character in unquoted attribute"),
+        "Unexpected character in unquoted attribute",
    "invalid-character-after-attribute-name":
-        _("Unexpected character after attribute name."),
+        "Unexpected character after attribute name.",
    "unexpected-character-after-attribute-value":
-        _("Unexpected character after attribute value."),
+        "Unexpected character after attribute value.",
    "eof-in-attribute-value-double-quote":
-        _("Unexpected end of file in attribute value (\")."),
+        "Unexpected end of file in attribute value (\").",
    "eof-in-attribute-value-single-quote":
-        _("Unexpected end of file in attribute value (')."),
+        "Unexpected end of file in attribute value (').",
    "eof-in-attribute-value-no-quotes":
-        _("Unexpected end of file in attribute value."),
+        "Unexpected end of file in attribute value.",
    "unexpected-EOF-after-solidus-in-tag":
-        _("Unexpected end of file in tag. Expected >"),
+        "Unexpected end of file in tag. Expected >",
    "unexpected-character-after-solidus-in-tag":
-        _("Unexpected character after / in tag. Expected >"),
+        "Unexpected character after / in tag. Expected >",
    "expected-dashes-or-doctype":
-        _("Expected '--' or 'DOCTYPE'. Not found."),
+        "Expected '--' or 'DOCTYPE'. Not found.",
    "unexpected-bang-after-double-dash-in-comment":
-        _("Unexpected ! after -- in comment"),
+        "Unexpected ! after -- in comment",
    "unexpected-space-after-double-dash-in-comment":
-        _("Unexpected space after -- in comment"),
+        "Unexpected space after -- in comment",
    "incorrect-comment":
-        _("Incorrect comment."),
+        "Incorrect comment.",
    "eof-in-comment":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-dash":
-        _("Unexpected end of file in comment (-)"),
+        "Unexpected end of file in comment (-)",
    "unexpected-dash-after-double-dash-in-comment":
-        _("Unexpected '-' after '--' found in comment."),
+        "Unexpected '-' after '--' found in comment.",
    "eof-in-comment-double-dash":
-        _("Unexpected end of file in comment (--)."),
+        "Unexpected end of file in comment (--).",
    "eof-in-comment-end-space-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "eof-in-comment-end-bang-state":
-        _("Unexpected end of file in comment."),
+        "Unexpected end of file in comment.",
    "unexpected-char-in-comment":
-        _("Unexpected character in comment found."),
+        "Unexpected character in comment found.",
    "need-space-after-doctype":
-        _("No space after literal string 'DOCTYPE'."),
+        "No space after literal string 'DOCTYPE'.",
    "expected-doctype-name-but-got-right-bracket":
-        _("Unexpected > character. Expected DOCTYPE name."),
+        "Unexpected > character. Expected DOCTYPE name.",
    "expected-doctype-name-but-got-eof":
-        _("Unexpected end of file. Expected DOCTYPE name."),
+        "Unexpected end of file. Expected DOCTYPE name.",
    "eof-in-doctype-name":
-        _("Unexpected end of file in DOCTYPE name."),
+        "Unexpected end of file in DOCTYPE name.",
    "eof-in-doctype":
-        _("Unexpected end of file in DOCTYPE."),
+        "Unexpected end of file in DOCTYPE.",
    "expected-space-or-right-bracket-in-doctype":
-        _("Expected space or '>'. Got '%(data)s'"),
+        "Expected space or '>'. Got '%(data)s'",
    "unexpected-end-of-doctype":
-        _("Unexpected end of DOCTYPE."),
+        "Unexpected end of DOCTYPE.",
    "unexpected-char-in-doctype":
-        _("Unexpected character in DOCTYPE."),
+        "Unexpected character in DOCTYPE.",
    "eof-in-innerhtml":
-        _("XXX innerHTML EOF"),
+        "XXX innerHTML EOF",
    "unexpected-doctype":
-        _("Unexpected DOCTYPE. Ignored."),
+        "Unexpected DOCTYPE. Ignored.",
    "non-html-root":
-        _("html needs to be the first start tag."),
+        "html needs to be the first start tag.",
    "expected-doctype-but-got-eof":
-        _("Unexpected End of file. Expected DOCTYPE."),
+        "Unexpected End of file. Expected DOCTYPE.",
    "unknown-doctype":
-        _("Erroneous DOCTYPE."),
+        "Erroneous DOCTYPE.",
    "expected-doctype-but-got-chars":
-        _("Unexpected non-space characters. Expected DOCTYPE."),
+        "Unexpected non-space characters. Expected DOCTYPE.",
    "expected-doctype-but-got-start-tag":
-        _("Unexpected start tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected start tag (%(name)s). Expected DOCTYPE.",
    "expected-doctype-but-got-end-tag":
-        _("Unexpected end tag (%(name)s). Expected DOCTYPE."),
+        "Unexpected end tag (%(name)s). Expected DOCTYPE.",
    "end-tag-after-implied-root":
-        _("Unexpected end tag (%(name)s) after the (implied) root element."),
+        "Unexpected end tag (%(name)s) after the (implied) root element.",
    "expected-named-closing-tag-but-got-eof":
-        _("Unexpected end of file. Expected end tag (%(name)s)."),
+        "Unexpected end of file. Expected end tag (%(name)s).",
    "two-heads-are-not-better-than-one":
-        _("Unexpected start tag head in existing head. Ignored."),
+        "Unexpected start tag head in existing head. Ignored.",
    "unexpected-end-tag":
-        _("Unexpected end tag (%(name)s). Ignored."),
+        "Unexpected end tag (%(name)s). Ignored.",
    "unexpected-start-tag-out-of-my-head":
-        _("Unexpected start tag (%(name)s) that can be in head. Moved."),
+        "Unexpected start tag (%(name)s) that can be in head. Moved.",
    "unexpected-start-tag":
-        _("Unexpected start tag (%(name)s)."),
+        "Unexpected start tag (%(name)s).",
    "missing-end-tag":
-        _("Missing end tag (%(name)s)."),
+        "Missing end tag (%(name)s).",
    "missing-end-tags":
-        _("Missing end tags (%(name)s)."),
+        "Missing end tags (%(name)s).",
    "unexpected-start-tag-implies-end-tag":
-        _("Unexpected start tag (%(startName)s) "
+        ("Unexpected start tag (%(startName)s) "
          "implies end tag (%(endName)s)."),
    "unexpected-start-tag-treated-as":
-        _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
    "deprecated-tag":
-        _("Unexpected start tag %(name)s. Don't use it!"),
+        "Unexpected start tag %(name)s. Don't use it!",
    "unexpected-start-tag-ignored":
-        _("Unexpected start tag %(name)s. Ignored."),
+        "Unexpected start tag %(name)s. Ignored.",
    "expected-one-end-tag-but-got-another":
-        _("Unexpected end tag (%(gotName)s). "
+        ("Unexpected end tag (%(gotName)s). "
          "Missing end tag (%(expectedName)s)."),
    "end-tag-too-early":
-        _("End tag (%(name)s) seen too early. Expected other end tag."),
+        "End tag (%(name)s) seen too early. Expected other end tag.",
    "end-tag-too-early-named":
-        _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+        "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
    "end-tag-too-early-ignored":
-        _("End tag (%(name)s) seen too early. Ignored."),
+        "End tag (%(name)s) seen too early. Ignored.",
    "adoption-agency-1.1":
-        _("End tag (%(name)s) violates step 1, "
+        ("End tag (%(name)s) violates step 1, "
          "paragraph 1 of the adoption agency algorithm."),
    "adoption-agency-1.2":
-        _("End tag (%(name)s) violates step 1, "
+        ("End tag (%(name)s) violates step 1, "
          "paragraph 2 of the adoption agency algorithm."),
    "adoption-agency-1.3":
-        _("End tag (%(name)s) violates step 1, "
+        ("End tag (%(name)s) violates step 1, "
          "paragraph 3 of the adoption agency algorithm."),
    "adoption-agency-4.4":
-        _("End tag (%(name)s) violates step 4, "
+        ("End tag (%(name)s) violates step 4, "
          "paragraph 4 of the adoption agency algorithm."),
    "unexpected-end-tag-treated-as":
-        _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+        "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
    "no-end-tag":
-        _("This element (%(name)s) has no end tag."),
+        "This element (%(name)s) has no end tag.",
    "unexpected-implied-end-tag-in-table":
-        _("Unexpected implied end tag (%(name)s) in the table phase."),
+        "Unexpected implied end tag (%(name)s) in the table phase.",
    "unexpected-implied-end-tag-in-table-body":
-        _("Unexpected implied end tag (%(name)s) in the table body phase."),
+        "Unexpected implied end tag (%(name)s) in the table body phase.",
    "unexpected-char-implies-table-voodoo":
-        _("Unexpected non-space characters in "
+        ("Unexpected non-space characters in "
          "table context caused voodoo mode."),
    "unexpected-hidden-input-in-table":
-        _("Unexpected input with type hidden in table context."),
+        "Unexpected input with type hidden in table context.",
    "unexpected-form-in-table":
-        _("Unexpected form in table context."),
+        "Unexpected form in table context.",
    "unexpected-start-tag-implies-table-voodoo":
-        _("Unexpected start tag (%(name)s) in "
+        ("Unexpected start tag (%(name)s) in "
          "table context caused voodoo mode."),
    "unexpected-end-tag-implies-table-voodoo":
-        _("Unexpected end tag (%(name)s) in "
+        ("Unexpected end tag (%(name)s) in "
          "table context caused voodoo mode."),
    "unexpected-cell-in-table-body":
-        _("Unexpected table cell start tag (%(name)s) "
+        ("Unexpected table cell start tag (%(name)s) "
          "in the table body phase."),
    "unexpected-cell-end-tag":
-        _("Got table cell end tag (%(name)s) "
+        ("Got table cell end tag (%(name)s) "
          "while required end tags are missing."),
    "unexpected-end-tag-in-table-body":
-        _("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
    "unexpected-implied-end-tag-in-table-row":
-        _("Unexpected implied end tag (%(name)s) in the table row phase."),
+        "Unexpected implied end tag (%(name)s) in the table row phase.",
    "unexpected-end-tag-in-table-row":
-        _("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
    "unexpected-select-in-select":
-        _("Unexpected select start tag in the select phase "
+        ("Unexpected select start tag in the select phase "
          "treated as select end tag."),
    "unexpected-input-in-select":
-        _("Unexpected input start tag in the select phase."),
+        "Unexpected input start tag in the select phase.",
    "unexpected-start-tag-in-select":
-        _("Unexpected start tag token (%(name)s in the select phase. "
+        ("Unexpected start tag token (%(name)s in the select phase. "
          "Ignored."),
    "unexpected-end-tag-in-select":
-        _("Unexpected end tag (%(name)s) in the select phase. Ignored."),
+        "Unexpected end tag (%(name)s) in the select phase. Ignored.",
    "unexpected-table-element-start-tag-in-select-in-table":
-        _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+        "Unexpected table element start tag (%(name)s) in the select in table phase.",
    "unexpected-table-element-end-tag-in-select-in-table":
-        _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+        "Unexpected table element end tag (%(name)s) in the select in table phase.",
    "unexpected-char-after-body":
-        _("Unexpected non-space characters in the after body phase."),
+        "Unexpected non-space characters in the after body phase.",
    "unexpected-start-tag-after-body":
-        _("Unexpected start tag token (%(name)s)"
+        ("Unexpected start tag token (%(name)s)"
          " in the after body phase."),
    "unexpected-end-tag-after-body":
-        _("Unexpected end tag token (%(name)s)"
+        ("Unexpected end tag token (%(name)s)"
          " in the after body phase."),
    "unexpected-char-in-frameset":
-        _("Unexpected characters in the frameset phase. Characters ignored."),
+        "Unexpected characters in the frameset phase. Characters ignored.",
    "unexpected-start-tag-in-frameset":
-        _("Unexpected start tag token (%(name)s)"
+        ("Unexpected start tag token (%(name)s)"
          " in the frameset phase. Ignored."),
    "unexpected-frameset-in-frameset-innerhtml":
-        _("Unexpected end tag token (frameset) "
+        ("Unexpected end tag token (frameset) "
          "in the frameset phase (innerHTML)."),
    "unexpected-end-tag-in-frameset":
-        _("Unexpected end tag token (%(name)s)"
+        ("Unexpected end tag token (%(name)s)"
          " in the frameset phase. Ignored."),
    "unexpected-char-after-frameset":
-        _("Unexpected non-space characters in the "
+        ("Unexpected non-space characters in the "
          "after frameset phase. Ignored."),
    "unexpected-start-tag-after-frameset":
-        _("Unexpected start tag (%(name)s)"
+        ("Unexpected start tag (%(name)s)"
          " in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-frameset":
-        _("Unexpected end tag (%(name)s)"
+        ("Unexpected end tag (%(name)s)"
          " in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-body-innerhtml":
-        _("Unexpected end tag after body(innerHtml)"),
+        "Unexpected end tag after body(innerHtml)",
    "expected-eof-but-got-char":
-        _("Unexpected non-space characters. Expected end of file."),
+        "Unexpected non-space characters. Expected end of file.",
    "expected-eof-but-got-start-tag":
-        _("Unexpected start tag (%(name)s)"
+        ("Unexpected start tag (%(name)s)"
          ". Expected end of file."),
    "expected-eof-but-got-end-tag":
-        _("Unexpected end tag (%(name)s)"
+        ("Unexpected end tag (%(name)s)"
          ". Expected end of file."),
    "eof-in-table":
-        _("Unexpected end of file. Expected table content."),
+        "Unexpected end of file. Expected table content.",
    "eof-in-select":
-        _("Unexpected end of file. Expected select content."),
+        "Unexpected end of file. Expected select content.",
    "eof-in-frameset":
-        _("Unexpected end of file. Expected frameset content."),
+        "Unexpected end of file. Expected frameset content.",
    "eof-in-script-in-script":
-        _("Unexpected end of file. Expected script content."),
+        "Unexpected end of file. Expected script content.",
    "eof-in-foreign-lands":
-        _("Unexpected end of file. Expected foreign content"),
+        "Unexpected end of file. Expected foreign content",
    "non-void-element-with-trailing-solidus":
-        _("Trailing solidus not allowed on element %(name)s"),
+        "Trailing solidus not allowed on element %(name)s",
    "unexpected-html-element-in-foreign-content":
-        _("Element %(name)s not allowed in a non-html context"),
+        "Element %(name)s not allowed in a non-html context",
    "unexpected-end-tag-before-html":
-        _("Unexpected end tag (%(name)s) before html."),
+        "Unexpected end tag (%(name)s) before html.",
    "XXX-undefined-error":
-        _("Undefined error (this sucks and should be fixed)"),
+        "Undefined error (this sucks and should be fixed)",
 }

 namespaces = {
--- a/src/html5lib/filters/lint.py
+++ b/src/html5lib/filters/lint.py
@ -1,8 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals

-from gettext import gettext
-_ = gettext
-
 from . import _base
 from ..constants import cdataElements, rcdataElements, voidElements

@ -23,24 +20,24 @@ class Filter(_base.Filter):
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
+                    raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                if not name:
-                    raise LintError(_("Empty tag name"))
+                    raise LintError("Empty tag name")
                if type == "StartTag" and name in voidElements:
-                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
+                    raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
+                    raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
                if type == "StartTag":
                    open_elements.append(name)
                for name, value in token["data"]:
                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
+                        raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
                    if not name:
-                        raise LintError(_("Empty attribute name"))
+                        raise LintError("Empty attribute name")
                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
+                        raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
@ -51,43 +48,43 @@ class Filter(_base.Filter):
            elif type == "EndTag":
                name = token["name"]
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                if not name:
-                    raise LintError(_("Empty tag name"))
+                    raise LintError("Empty tag name")
                if name in voidElements:
-                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
+                    raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
                start_name = open_elements.pop()
                if start_name != name:
-                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
+                    raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
                contentModelFlag = "PCDATA"

            elif type == "Comment":
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Comment not in PCDATA content model flag"))
+                    raise LintError("Comment not in PCDATA content model flag")

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
+                    raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
                if not data:
-                    raise LintError(_("%(type)s token with empty data") % {"type": type})
+                    raise LintError("%(type)s token with empty data" % {"type": type})
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
-                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
+                        raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})

            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
+                    raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
                if not isinstance(name, str):
-                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                # XXX: what to do with token["data"] ?

            elif type in ("ParseError", "SerializeError"):
                pass

            else:
-                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
+                raise LintError("Unknown token type: %(type)s" % {"type": type})

            yield token
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@ -11,7 +11,7 @@ from .treebuilders._base import Marker

 from . import utils
 from .constants import (
-    spaceCharacters, asciiUpper2Lower, specialElements, headingElements,
+    spaceCharacters, asciiUpper2Lower, specialElements, headingElements, E,
    cdataElements, rcdataElements, tokenTypes, tagTokenTypes, ReparseException, namespaces,
    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
    adjustForeignAttributes as adjustForeignAttributesMap, adjustSVGAttributes,
@ -141,6 +141,17 @@ class HTMLParser(object):

        self.framesetOK = True

+    @property
+    def documentEncoding(self):
+        """The name of the character encoding
+        that was used to decode the input stream,
+        or :obj:`None` if that is not determined yet.
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.charEncoding[0]
+
    def isHTMLIntegrationPoint(self, element):
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
@ -204,8 +215,8 @@ class HTMLParser(object):
                    elif type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

-            if (type == StartTagToken and token["selfClosing"]
-                    and not token["selfClosingAcknowledged"]):
+            if (type == StartTagToken and token["selfClosing"] and
+                    not token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": token["name"]})

@ -257,7 +268,7 @@ class HTMLParser(object):
        # XXX The idea is to make errorcode mandatory.
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
-            raise ParseError
+            raise ParseError(E[errorcode] % datavars)

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """
@ -449,8 +460,8 @@ def getPhases(debug):
            if publicId != "":
                publicId = publicId.translate(asciiUpper2Lower)

-            if (not correct or token["name"] != "html"
-                or publicId.startswith(
+            if (not correct or token["name"] != "html" or
+                    publicId.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
@ -505,21 +516,21 @@ def getPhases(debug):
                     "-//w3c//dtd w3 html//",
                     "-//w3o//dtd w3 html 3.0//",
                     "-//webtechs//dtd mozilla html 2.0//",
-                     "-//webtechs//dtd mozilla html//"))
-                or publicId in
+                     "-//webtechs//dtd mozilla html//")) or
+                     publicId in
                    ("-//w3o//dtd w3 html strict 3.0//en//",
                     "-/w3c/dtd html 4.0 transitional/en",
-                     "html")
-                or publicId.startswith(
+                     "html") or
+                    publicId.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
-                    systemId is None
-                    or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+                    systemId is None or
+                    systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
                self.parser.compatMode = "quirks"
            elif (publicId.startswith(
                    ("-//w3c//dtd xhtml 1.0 frameset//",
-                     "-//w3c//dtd xhtml 1.0 transitional//"))
-                  or publicId.startswith(
+                     "-//w3c//dtd xhtml 1.0 transitional//")) or
+                  publicId.startswith(
                      ("-//w3c//dtd html 4.01 frameset//",
                       "-//w3c//dtd html 4.01 transitional//")) and
                  systemId is not None):
@ -817,7 +828,7 @@ def getPhases(debug):
            self.startTagHandler = utils.MethodDispatcher([
                ("html", self.startTagHtml),
                (("base", "basefont", "bgsound", "command", "link", "meta",
-                  "noframes", "script", "style", "title"),
+                  "script", "style", "title"),
                 self.startTagProcessInHead),
                ("body", self.startTagBody),
                ("frameset", self.startTagFrameset),
@ -916,8 +927,8 @@ def getPhases(debug):
            data = token["data"]
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
            if (data.startswith("\n") and
-                self.tree.openElements[-1].name in ("pre", "listing", "textarea")
-                    and not self.tree.openElements[-1].hasContent()):
+                self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
+                    not self.tree.openElements[-1].hasContent()):
                data = data[1:]
            if data:
                self.tree.reconstructActiveFormattingElements()
@ -944,8 +955,8 @@ def getPhases(debug):

        def startTagBody(self, token):
            self.parser.parseError("unexpected-start-tag", {"name": "body"})
-            if (len(self.tree.openElements) == 1
-                    or self.tree.openElements[1].name != "body"):
+            if (len(self.tree.openElements) == 1 or
+                    self.tree.openElements[1].name != "body"):
                assert self.parser.innerHTML
            else:
                self.parser.framesetOK = False
@ -1143,8 +1154,7 @@ def getPhases(debug):
            attributes["name"] = "isindex"
            self.processStartTag(self.impliedTagToken("input", "StartTag",
                                                 attributes=attributes,
-                                                 selfClosing=
-                                                 token["selfClosing"]))
+                                                 selfClosing=token["selfClosing"]))
            self.processEndTag(self.impliedTagToken("label"))
            self.processStartTag(self.impliedTagToken("hr", "StartTag"))
            self.processEndTag(self.impliedTagToken("form"))
--- a/src/html5lib/sanitizer.py
+++ b/src/html5lib/sanitizer.py
@ -1,12 +1,31 @@
 from __future__ import absolute_import, division, unicode_literals

 import re
+import sys
 from xml.sax.saxutils import escape, unescape
+if sys.version_info[0] == 2:
+    from urlparse import urlparse
+else:
+    from urllib.parse import urlparse

 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes


+content_type_rgx = re.compile(r'''
+                               ^
+                               # Match a content type <application>/<type>
+                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                               # Match any character set and encoding
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
+                               $
+                               ''',
+                              re.VERBOSE)
+
+
 class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

@ -100,8 +119,8 @@ class HTMLSanitizerMixin(object):
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
                      'y1', 'y2', 'zoomAndPan']

-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
-                       'xlink:href', 'xml:base']
+    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
+                       'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']

    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
                               'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
@ -138,7 +157,9 @@ class HTMLSanitizerMixin(object):
    acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                            'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                            'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
@ -147,6 +168,7 @@ class HTMLSanitizerMixin(object):
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@ -189,10 +211,21 @@ class HTMLSanitizerMixin(object):
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
+                try:
+                    uri = urlparse.urlparse(val_unescaped)
+                except ValueError:
+                    uri = None
                    del attrs[attr]
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = content_type_rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
@ -245,7 +278,7 @@ class HTMLSanitizerMixin(object):
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
-                    if not keyword in self.acceptable_css_keywords and \
+                    if keyword not in self.acceptable_css_keywords and \
                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
                        break
                else:
--- a/src/html5lib/serializer/htmlserializer.py
+++ b/src/html5lib/serializer/htmlserializer.py
@ -4,9 +4,6 @@ try:
 except NameError:
    text_type = str

-import gettext
-_ = gettext.gettext
-
 try:
    from functools import reduce
 except ImportError:
@ -211,7 +208,7 @@ class HTMLSerializer(object):
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
-                            self.serializeError(_("System identifer contains both single and double quote characters"))
+                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
@ -223,7 +220,7 @@ class HTMLSerializer(object):
            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
-                        self.serializeError(_("Unexpected </ in CDATA"))
+                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))
@ -234,7 +231,7 @@ class HTMLSerializer(object):
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
+                    self.serializeError("Unexpected child element of a CDATA element")
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
@ -282,20 +279,20 @@ class HTMLSerializer(object):
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
-                    self.serializeError(_("Unexpected child element of a CDATA element"))
+                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
-                    self.serializeError(_("Comment contains --"))
+                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if not key in entities:
-                    self.serializeError(_("Entity %s not recognized" % name))
+                    self.serializeError("Entity %s not recognized" % name)
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
--- a/src/html5lib/treewalkers/init.py
+++ b/src/html5lib/treewalkers/init.py
@ -10,8 +10,12 @@ returning an iterator generating tokens.

 from __future__ import absolute_import, division, unicode_literals

+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
+           "pulldom"]
+
 import sys

+from .. import constants
 from ..utils import default_etree

 treeWalkerCache = {}
@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    pendingCharacters = []
+    for token in tokens:
+        type = token["type"]
+        if type in ("Characters", "SpaceCharacters"):
+            pendingCharacters.append(token["data"])
+        else:
+            if pendingCharacters:
+                yield {"type": "Characters", "data": "".join(pendingCharacters)}
+                pendingCharacters = []
+            yield token
+    if pendingCharacters:
+        yield {"type": "Characters", "data": "".join(pendingCharacters)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers"""
+    output = []
+    indent = 0
+    for token in concatenateCharacterTokens(walker):
+        type = token["type"]
+        if type in ("StartTag", "EmptyTag"):
+            # tag name
+            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
+                if token["namespace"] in constants.prefixes:
+                    ns = constants.prefixes[token["namespace"]]
+                else:
+                    ns = token["namespace"]
+                name = "%s %s" % (ns, token["name"])
+            else:
+                name = token["name"]
+            output.append("%s<%s>" % (" " * indent, name))
+            indent += 2
+            # attributes (sorted for consistent ordering)
+            attrs = token["data"]
+            for (namespace, localname), value in sorted(attrs.items()):
+                if namespace:
+                    if namespace in constants.prefixes:
+                        ns = constants.prefixes[namespace]
+                    else:
+                        ns = namespace
+                    name = "%s %s" % (ns, localname)
+                else:
+                    name = localname
+                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
+            # self-closing
+            if type == "EmptyTag":
+                indent -= 2
+
+        elif type == "EndTag":
+            indent -= 2
+
+        elif type == "Comment":
+            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
+
+        elif type == "Doctype":
+            if token["name"]:
+                if token["publicId"]:
+                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["publicId"],
+                                   token["systemId"] if token["systemId"] else ""))
+                elif token["systemId"]:
+                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["systemId"]))
+                else:
+                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
+                                                       token["name"]))
+            else:
+                output.append("%s<!DOCTYPE >" % (" " * indent,))
+
+        elif type == "Characters":
+            output.append("%s\"%s\"" % (" " * indent, token["data"]))
+
+        elif type == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % type)
+
+    return "\n".join(output)
--- a/src/html5lib/treewalkers/_base.py
+++ b/src/html5lib/treewalkers/_base.py
@ -6,8 +6,8 @@ except NameError:
    text_type = str
    string_types = str,

-import gettext
-_ = gettext.gettext
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+           "TreeWalker", "NonRecursiveTreeWalker"]

 from xml.dom import Node

@ -63,7 +63,7 @@ class TreeWalker(object):
               "namespace": to_text(namespace),
               "data": attrs}
        if hasChildren:
-            yield self.error(_("Void element has children"))
+            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        assert namespace is None or isinstance(namespace, string_types), type(namespace)
@ -127,7 +127,7 @@ class TreeWalker(object):
        return {"type": "Entity", "name": text_type(name)}

    def unknown(self, nodeType):
-        return self.error(_("Unknown node type: ") + nodeType)
+        return self.error("Unknown node type: " + nodeType)


 class NonRecursiveTreeWalker(TreeWalker):
--- a/src/html5lib/treewalkers/dom.py
+++ b/src/html5lib/treewalkers/dom.py
@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals

 from xml.dom import Node

-import gettext
-_ = gettext.gettext
-
 from . import _base


--- a/src/html5lib/treewalkers/etree.py
+++ b/src/html5lib/treewalkers/etree.py
@ -7,15 +7,14 @@ except ImportError:
        from ordereddict import OrderedDict
    except ImportError:
        OrderedDict = dict
-import gettext
-_ = gettext.gettext

 import re

 try:
-    text_type = unicode
+    unicode
+    string_types = basestring,
 except NameError:
-    text_type = str
+    string_types = str,

 from . import _base
 from ..utils import moduleFactoryFactory
@ -63,7 +62,7 @@ def getETreeBuilder(ElementTreeImplementation):
                return _base.COMMENT, node.text

            else:
-                assert type(node.tag) == text_type, type(node.tag)
+                assert isinstance(node.tag, string_types), type(node.tag)
                # This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
--- a/src/html5lib/treewalkers/lxmletree.py
+++ b/src/html5lib/treewalkers/lxmletree.py
@ -7,9 +7,6 @@ except NameError:
 from lxml import etree
 from ..treebuilders.etree import tag_regexp

-from gettext import gettext
-_ = gettext
-
 from . import _base

 from .. import ihatexml
@ -90,10 +87,6 @@ class FragmentWrapper(object):
            self.tail = ensure_str(self.obj.tail)
        else:
            self.tail = None
-        self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
-        # Support for bytes here is Py2
-        if self.isstring:
-            self.obj = ensure_str(self.obj)

    def __getattr__(self, name):
        return getattr(self.obj, name)
@ -137,7 +130,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            return _base.TEXT, ensure_str(getattr(node, key))

        elif isinstance(node, Root):
@ -146,7 +139,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
        elif isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

-        elif isinstance(node, FragmentWrapper) and node.isstring:
+        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
            return _base.TEXT, node.obj

        elif node.tag == etree.Comment:
@ -176,7 +169,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
-        assert not isinstance(node, tuple), _("Text nodes have no children")
+        assert not isinstance(node, tuple), "Text nodes have no children"

        assert len(node) or node.text, "Node has no children"
        if node.text:
@ -187,7 +180,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNextSibling(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
@ -203,7 +196,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
    def getParentNode(self, node):
        if isinstance(node, tuple):  # Text node
            node, key = node
-            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
            if key == "text":
                return node
            # else: fallback to "normal" processing