diff --git a/src/html5lib/__init__.py b/src/html5lib/__init__.py
index 0a43c066b6..a77228056c 100644
--- a/src/html5lib/__init__.py
+++ b/src/html5lib/__init__.py
@@ -20,4 +20,4 @@ from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
-__version__ = "0.999-dev"
+__version__ = "0.999999-dev"
diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py
index 6d5ccb176b..8ee29ffb7c 100644
--- a/src/html5lib/constants.py
+++ b/src/html5lib/constants.py
@@ -1,292 +1,290 @@
from __future__ import absolute_import, division, unicode_literals
import string
-import gettext
-_ = gettext.gettext
EOF = None
E = {
"null-character":
- _("Null character in input stream, replaced with U+FFFD."),
+ "Null character in input stream, replaced with U+FFFD.",
"invalid-codepoint":
- _("Invalid codepoint in stream."),
+ "Invalid codepoint in stream.",
"incorrectly-placed-solidus":
- _("Solidus (/) incorrectly placed in tag."),
+ "Solidus (/) incorrectly placed in tag.",
"incorrect-cr-newline-entity":
- _("Incorrect CR newline entity, replaced with LF."),
+ "Incorrect CR newline entity, replaced with LF.",
"illegal-windows-1252-entity":
- _("Entity used with illegal number (windows-1252 reference)."),
+ "Entity used with illegal number (windows-1252 reference).",
"cant-convert-numeric-entity":
- _("Numeric entity couldn't be converted to character "
+ ("Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x)."),
"illegal-codepoint-for-numeric-entity":
- _("Numeric entity represents an illegal codepoint: "
+ ("Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x."),
"numeric-entity-without-semicolon":
- _("Numeric entity didn't end with ';'."),
+ "Numeric entity didn't end with ';'.",
"expected-numeric-entity-but-got-eof":
- _("Numeric entity expected. Got end of file instead."),
+ "Numeric entity expected. Got end of file instead.",
"expected-numeric-entity":
- _("Numeric entity expected but none found."),
+ "Numeric entity expected but none found.",
"named-entity-without-semicolon":
- _("Named entity didn't end with ';'."),
+ "Named entity didn't end with ';'.",
"expected-named-entity":
- _("Named entity expected. Got none."),
+ "Named entity expected. Got none.",
"attributes-in-end-tag":
- _("End tag contains unexpected attributes."),
+ "End tag contains unexpected attributes.",
'self-closing-flag-on-end-tag':
- _("End tag contains unexpected self-closing flag."),
+ "End tag contains unexpected self-closing flag.",
"expected-tag-name-but-got-right-bracket":
- _("Expected tag name. Got '>' instead."),
+ "Expected tag name. Got '>' instead.",
"expected-tag-name-but-got-question-mark":
- _("Expected tag name. Got '?' instead. (HTML doesn't "
+ ("Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)"),
"expected-tag-name":
- _("Expected tag name. Got something else instead"),
+ "Expected tag name. Got something else instead",
"expected-closing-tag-but-got-right-bracket":
- _("Expected closing tag. Got '>' instead. Ignoring '>'."),
+ "Expected closing tag. Got '>' instead. Ignoring '>'.",
"expected-closing-tag-but-got-eof":
- _("Expected closing tag. Unexpected end of file."),
+ "Expected closing tag. Unexpected end of file.",
"expected-closing-tag-but-got-char":
- _("Expected closing tag. Unexpected character '%(data)s' found."),
+ "Expected closing tag. Unexpected character '%(data)s' found.",
"eof-in-tag-name":
- _("Unexpected end of file in the tag name."),
+ "Unexpected end of file in the tag name.",
"expected-attribute-name-but-got-eof":
- _("Unexpected end of file. Expected attribute name instead."),
+ "Unexpected end of file. Expected attribute name instead.",
"eof-in-attribute-name":
- _("Unexpected end of file in attribute name."),
+ "Unexpected end of file in attribute name.",
"invalid-character-in-attribute-name":
- _("Invalid character in attribute name"),
+ "Invalid character in attribute name",
"duplicate-attribute":
- _("Dropped duplicate attribute on tag."),
+ "Dropped duplicate attribute on tag.",
"expected-end-of-tag-name-but-got-eof":
- _("Unexpected end of file. Expected = or end of tag."),
+ "Unexpected end of file. Expected = or end of tag.",
"expected-attribute-value-but-got-eof":
- _("Unexpected end of file. Expected attribute value."),
+ "Unexpected end of file. Expected attribute value.",
"expected-attribute-value-but-got-right-bracket":
- _("Expected attribute value. Got '>' instead."),
+ "Expected attribute value. Got '>' instead.",
'equals-in-unquoted-attribute-value':
- _("Unexpected = in unquoted attribute"),
+ "Unexpected = in unquoted attribute",
'unexpected-character-in-unquoted-attribute-value':
- _("Unexpected character in unquoted attribute"),
+ "Unexpected character in unquoted attribute",
"invalid-character-after-attribute-name":
- _("Unexpected character after attribute name."),
+ "Unexpected character after attribute name.",
"unexpected-character-after-attribute-value":
- _("Unexpected character after attribute value."),
+ "Unexpected character after attribute value.",
"eof-in-attribute-value-double-quote":
- _("Unexpected end of file in attribute value (\")."),
+ "Unexpected end of file in attribute value (\").",
"eof-in-attribute-value-single-quote":
- _("Unexpected end of file in attribute value (')."),
+ "Unexpected end of file in attribute value (').",
"eof-in-attribute-value-no-quotes":
- _("Unexpected end of file in attribute value."),
+ "Unexpected end of file in attribute value.",
"unexpected-EOF-after-solidus-in-tag":
- _("Unexpected end of file in tag. Expected >"),
+ "Unexpected end of file in tag. Expected >",
"unexpected-character-after-solidus-in-tag":
- _("Unexpected character after / in tag. Expected >"),
+ "Unexpected character after / in tag. Expected >",
"expected-dashes-or-doctype":
- _("Expected '--' or 'DOCTYPE'. Not found."),
+ "Expected '--' or 'DOCTYPE'. Not found.",
"unexpected-bang-after-double-dash-in-comment":
- _("Unexpected ! after -- in comment"),
+ "Unexpected ! after -- in comment",
"unexpected-space-after-double-dash-in-comment":
- _("Unexpected space after -- in comment"),
+ "Unexpected space after -- in comment",
"incorrect-comment":
- _("Incorrect comment."),
+ "Incorrect comment.",
"eof-in-comment":
- _("Unexpected end of file in comment."),
+ "Unexpected end of file in comment.",
"eof-in-comment-end-dash":
- _("Unexpected end of file in comment (-)"),
+ "Unexpected end of file in comment (-)",
"unexpected-dash-after-double-dash-in-comment":
- _("Unexpected '-' after '--' found in comment."),
+ "Unexpected '-' after '--' found in comment.",
"eof-in-comment-double-dash":
- _("Unexpected end of file in comment (--)."),
+ "Unexpected end of file in comment (--).",
"eof-in-comment-end-space-state":
- _("Unexpected end of file in comment."),
+ "Unexpected end of file in comment.",
"eof-in-comment-end-bang-state":
- _("Unexpected end of file in comment."),
+ "Unexpected end of file in comment.",
"unexpected-char-in-comment":
- _("Unexpected character in comment found."),
+ "Unexpected character in comment found.",
"need-space-after-doctype":
- _("No space after literal string 'DOCTYPE'."),
+ "No space after literal string 'DOCTYPE'.",
"expected-doctype-name-but-got-right-bracket":
- _("Unexpected > character. Expected DOCTYPE name."),
+ "Unexpected > character. Expected DOCTYPE name.",
"expected-doctype-name-but-got-eof":
- _("Unexpected end of file. Expected DOCTYPE name."),
+ "Unexpected end of file. Expected DOCTYPE name.",
"eof-in-doctype-name":
- _("Unexpected end of file in DOCTYPE name."),
+ "Unexpected end of file in DOCTYPE name.",
"eof-in-doctype":
- _("Unexpected end of file in DOCTYPE."),
+ "Unexpected end of file in DOCTYPE.",
"expected-space-or-right-bracket-in-doctype":
- _("Expected space or '>'. Got '%(data)s'"),
+ "Expected space or '>'. Got '%(data)s'",
"unexpected-end-of-doctype":
- _("Unexpected end of DOCTYPE."),
+ "Unexpected end of DOCTYPE.",
"unexpected-char-in-doctype":
- _("Unexpected character in DOCTYPE."),
+ "Unexpected character in DOCTYPE.",
"eof-in-innerhtml":
- _("XXX innerHTML EOF"),
+ "XXX innerHTML EOF",
"unexpected-doctype":
- _("Unexpected DOCTYPE. Ignored."),
+ "Unexpected DOCTYPE. Ignored.",
"non-html-root":
- _("html needs to be the first start tag."),
+ "html needs to be the first start tag.",
"expected-doctype-but-got-eof":
- _("Unexpected End of file. Expected DOCTYPE."),
+ "Unexpected End of file. Expected DOCTYPE.",
"unknown-doctype":
- _("Erroneous DOCTYPE."),
+ "Erroneous DOCTYPE.",
"expected-doctype-but-got-chars":
- _("Unexpected non-space characters. Expected DOCTYPE."),
+ "Unexpected non-space characters. Expected DOCTYPE.",
"expected-doctype-but-got-start-tag":
- _("Unexpected start tag (%(name)s). Expected DOCTYPE."),
+ "Unexpected start tag (%(name)s). Expected DOCTYPE.",
"expected-doctype-but-got-end-tag":
- _("Unexpected end tag (%(name)s). Expected DOCTYPE."),
+ "Unexpected end tag (%(name)s). Expected DOCTYPE.",
"end-tag-after-implied-root":
- _("Unexpected end tag (%(name)s) after the (implied) root element."),
+ "Unexpected end tag (%(name)s) after the (implied) root element.",
"expected-named-closing-tag-but-got-eof":
- _("Unexpected end of file. Expected end tag (%(name)s)."),
+ "Unexpected end of file. Expected end tag (%(name)s).",
"two-heads-are-not-better-than-one":
- _("Unexpected start tag head in existing head. Ignored."),
+ "Unexpected start tag head in existing head. Ignored.",
"unexpected-end-tag":
- _("Unexpected end tag (%(name)s). Ignored."),
+ "Unexpected end tag (%(name)s). Ignored.",
"unexpected-start-tag-out-of-my-head":
- _("Unexpected start tag (%(name)s) that can be in head. Moved."),
+ "Unexpected start tag (%(name)s) that can be in head. Moved.",
"unexpected-start-tag":
- _("Unexpected start tag (%(name)s)."),
+ "Unexpected start tag (%(name)s).",
"missing-end-tag":
- _("Missing end tag (%(name)s)."),
+ "Missing end tag (%(name)s).",
"missing-end-tags":
- _("Missing end tags (%(name)s)."),
+ "Missing end tags (%(name)s).",
"unexpected-start-tag-implies-end-tag":
- _("Unexpected start tag (%(startName)s) "
+ ("Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s)."),
"unexpected-start-tag-treated-as":
- _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+ "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
"deprecated-tag":
- _("Unexpected start tag %(name)s. Don't use it!"),
+ "Unexpected start tag %(name)s. Don't use it!",
"unexpected-start-tag-ignored":
- _("Unexpected start tag %(name)s. Ignored."),
+ "Unexpected start tag %(name)s. Ignored.",
"expected-one-end-tag-but-got-another":
- _("Unexpected end tag (%(gotName)s). "
+ ("Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s)."),
"end-tag-too-early":
- _("End tag (%(name)s) seen too early. Expected other end tag."),
+ "End tag (%(name)s) seen too early. Expected other end tag.",
"end-tag-too-early-named":
- _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+ "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
"end-tag-too-early-ignored":
- _("End tag (%(name)s) seen too early. Ignored."),
+ "End tag (%(name)s) seen too early. Ignored.",
"adoption-agency-1.1":
- _("End tag (%(name)s) violates step 1, "
+ ("End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm."),
"adoption-agency-1.2":
- _("End tag (%(name)s) violates step 1, "
+ ("End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm."),
"adoption-agency-1.3":
- _("End tag (%(name)s) violates step 1, "
+ ("End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm."),
"adoption-agency-4.4":
- _("End tag (%(name)s) violates step 4, "
+ ("End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm."),
"unexpected-end-tag-treated-as":
- _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+ "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
"no-end-tag":
- _("This element (%(name)s) has no end tag."),
+ "This element (%(name)s) has no end tag.",
"unexpected-implied-end-tag-in-table":
- _("Unexpected implied end tag (%(name)s) in the table phase."),
+ "Unexpected implied end tag (%(name)s) in the table phase.",
"unexpected-implied-end-tag-in-table-body":
- _("Unexpected implied end tag (%(name)s) in the table body phase."),
+ "Unexpected implied end tag (%(name)s) in the table body phase.",
"unexpected-char-implies-table-voodoo":
- _("Unexpected non-space characters in "
+ ("Unexpected non-space characters in "
"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
- _("Unexpected input with type hidden in table context."),
+ "Unexpected input with type hidden in table context.",
"unexpected-form-in-table":
- _("Unexpected form in table context."),
+ "Unexpected form in table context.",
"unexpected-start-tag-implies-table-voodoo":
- _("Unexpected start tag (%(name)s) in "
+ ("Unexpected start tag (%(name)s) in "
"table context caused voodoo mode."),
"unexpected-end-tag-implies-table-voodoo":
- _("Unexpected end tag (%(name)s) in "
+ ("Unexpected end tag (%(name)s) in "
"table context caused voodoo mode."),
"unexpected-cell-in-table-body":
- _("Unexpected table cell start tag (%(name)s) "
+ ("Unexpected table cell start tag (%(name)s) "
"in the table body phase."),
"unexpected-cell-end-tag":
- _("Got table cell end tag (%(name)s) "
+ ("Got table cell end tag (%(name)s) "
"while required end tags are missing."),
"unexpected-end-tag-in-table-body":
- _("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+ "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
"unexpected-implied-end-tag-in-table-row":
- _("Unexpected implied end tag (%(name)s) in the table row phase."),
+ "Unexpected implied end tag (%(name)s) in the table row phase.",
"unexpected-end-tag-in-table-row":
- _("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+ "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
"unexpected-select-in-select":
- _("Unexpected select start tag in the select phase "
+ ("Unexpected select start tag in the select phase "
"treated as select end tag."),
"unexpected-input-in-select":
- _("Unexpected input start tag in the select phase."),
+ "Unexpected input start tag in the select phase.",
"unexpected-start-tag-in-select":
- _("Unexpected start tag token (%(name)s in the select phase. "
+ ("Unexpected start tag token (%(name)s) in the select phase. "
"Ignored."),
"unexpected-end-tag-in-select":
- _("Unexpected end tag (%(name)s) in the select phase. Ignored."),
+ "Unexpected end tag (%(name)s) in the select phase. Ignored.",
"unexpected-table-element-start-tag-in-select-in-table":
- _("Unexpected table element start tag (%(name)s) in the select in table phase."),
+ "Unexpected table element start tag (%(name)s) in the select in table phase.",
"unexpected-table-element-end-tag-in-select-in-table":
- _("Unexpected table element end tag (%(name)s) in the select in table phase."),
+ "Unexpected table element end tag (%(name)s) in the select in table phase.",
"unexpected-char-after-body":
- _("Unexpected non-space characters in the after body phase."),
+ "Unexpected non-space characters in the after body phase.",
"unexpected-start-tag-after-body":
- _("Unexpected start tag token (%(name)s)"
+ ("Unexpected start tag token (%(name)s)"
" in the after body phase."),
"unexpected-end-tag-after-body":
- _("Unexpected end tag token (%(name)s)"
+ ("Unexpected end tag token (%(name)s)"
" in the after body phase."),
"unexpected-char-in-frameset":
- _("Unexpected characters in the frameset phase. Characters ignored."),
+ "Unexpected characters in the frameset phase. Characters ignored.",
"unexpected-start-tag-in-frameset":
- _("Unexpected start tag token (%(name)s)"
+ ("Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored."),
"unexpected-frameset-in-frameset-innerhtml":
- _("Unexpected end tag token (frameset) "
+ ("Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML)."),
"unexpected-end-tag-in-frameset":
- _("Unexpected end tag token (%(name)s)"
+ ("Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored."),
"unexpected-char-after-frameset":
- _("Unexpected non-space characters in the "
+ ("Unexpected non-space characters in the "
"after frameset phase. Ignored."),
"unexpected-start-tag-after-frameset":
- _("Unexpected start tag (%(name)s)"
+ ("Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-frameset":
- _("Unexpected end tag (%(name)s)"
+ ("Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
- _("Unexpected end tag after body(innerHtml)"),
+ "Unexpected end tag after body(innerHtml)",
"expected-eof-but-got-char":
- _("Unexpected non-space characters. Expected end of file."),
+ "Unexpected non-space characters. Expected end of file.",
"expected-eof-but-got-start-tag":
- _("Unexpected start tag (%(name)s)"
+ ("Unexpected start tag (%(name)s)"
". Expected end of file."),
"expected-eof-but-got-end-tag":
- _("Unexpected end tag (%(name)s)"
+ ("Unexpected end tag (%(name)s)"
". Expected end of file."),
"eof-in-table":
- _("Unexpected end of file. Expected table content."),
+ "Unexpected end of file. Expected table content.",
"eof-in-select":
- _("Unexpected end of file. Expected select content."),
+ "Unexpected end of file. Expected select content.",
"eof-in-frameset":
- _("Unexpected end of file. Expected frameset content."),
+ "Unexpected end of file. Expected frameset content.",
"eof-in-script-in-script":
- _("Unexpected end of file. Expected script content."),
+ "Unexpected end of file. Expected script content.",
"eof-in-foreign-lands":
- _("Unexpected end of file. Expected foreign content"),
+ "Unexpected end of file. Expected foreign content",
"non-void-element-with-trailing-solidus":
- _("Trailing solidus not allowed on element %(name)s"),
+ "Trailing solidus not allowed on element %(name)s",
"unexpected-html-element-in-foreign-content":
- _("Element %(name)s not allowed in a non-html context"),
+ "Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html":
- _("Unexpected end tag (%(name)s) before html."),
+ "Unexpected end tag (%(name)s) before html.",
"XXX-undefined-error":
- _("Undefined error (this sucks and should be fixed)"),
+ "Undefined error (this sucks and should be fixed)",
}
namespaces = {
diff --git a/src/html5lib/filters/lint.py b/src/html5lib/filters/lint.py
index 7cc99a4ba7..8884696dc5 100644
--- a/src/html5lib/filters/lint.py
+++ b/src/html5lib/filters/lint.py
@@ -1,8 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
-from gettext import gettext
-_ = gettext
-
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
@@ -23,24 +20,24 @@ class Filter(_base.Filter):
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
- raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
+ raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
if not isinstance(name, str):
- raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+ raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
- raise LintError(_("Empty tag name"))
+ raise LintError("Empty tag name")
if type == "StartTag" and name in voidElements:
- raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
+ raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
elif type == "EmptyTag" and name not in voidElements:
- raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
+ raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, str):
- raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
+ raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
if not name:
- raise LintError(_("Empty attribute name"))
+ raise LintError("Empty attribute name")
if not isinstance(value, str):
- raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
+ raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
@@ -51,43 +48,43 @@ class Filter(_base.Filter):
elif type == "EndTag":
name = token["name"]
if not isinstance(name, str):
- raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+ raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
- raise LintError(_("Empty tag name"))
+ raise LintError("Empty tag name")
if name in voidElements:
- raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
+ raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
start_name = open_elements.pop()
if start_name != name:
- raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
+ raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
- raise LintError(_("Comment not in PCDATA content model flag"))
+ raise LintError("Comment not in PCDATA content model flag")
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, str):
- raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
+ raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
if not data:
- raise LintError(_("%(type)s token with empty data") % {"type": type})
+ raise LintError("%(type)s token with empty data" % {"type": type})
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
- raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
+ raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
- raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
+ raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
if not isinstance(name, str):
- raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
+ raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
- raise LintError(_("Unknown token type: %(type)s") % {"type": type})
+ raise LintError("Unknown token type: %(type)s" % {"type": type})
yield token
diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py
index 494abf3e2f..c453552a59 100644
--- a/src/html5lib/html5parser.py
+++ b/src/html5lib/html5parser.py
@@ -11,7 +11,7 @@ from .treebuilders._base import Marker
from . import utils
from .constants import (
- spaceCharacters, asciiUpper2Lower, specialElements, headingElements,
+ spaceCharacters, asciiUpper2Lower, specialElements, headingElements, E,
cdataElements, rcdataElements, tokenTypes, tagTokenTypes, ReparseException, namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap, adjustSVGAttributes,
@@ -141,6 +141,17 @@ class HTMLParser(object):
self.framesetOK = True
+ @property
+ def documentEncoding(self):
+     """Name of the character encoding used to decode the input stream.
+
+     Evaluates to :obj:`None` while the parser has no ``tokenizer``
+     attribute, i.e. while the encoding is not determined yet.
+     """
+     if hasattr(self, 'tokenizer'):
+         return self.tokenizer.stream.charEncoding[0]
+     return None
+
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
@@ -204,8 +215,8 @@ class HTMLParser(object):
elif type == DoctypeToken:
new_token = phase.processDoctype(new_token)
- if (type == StartTagToken and token["selfClosing"]
- and not token["selfClosingAcknowledged"]):
+ if (type == StartTagToken and token["selfClosing"] and
+ not token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus",
{"name": token["name"]})
@@ -257,7 +268,7 @@ class HTMLParser(object):
# XXX The idea is to make errorcode mandatory.
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
- raise ParseError
+ raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@@ -449,8 +460,8 @@ def getPhases(debug):
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
- if (not correct or token["name"] != "html"
- or publicId.startswith(
+ if (not correct or token["name"] != "html" or
+ publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
@@ -505,21 +516,21 @@ def getPhases(debug):
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
- "-//webtechs//dtd mozilla html//"))
- or publicId in
+ "-//webtechs//dtd mozilla html//")) or
+ publicId in
("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
- "html")
- or publicId.startswith(
+ "html") or
+ publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
- systemId is None
- or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ systemId is None or
+ systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
- "-//w3c//dtd xhtml 1.0 transitional//"))
- or publicId.startswith(
+ "-//w3c//dtd xhtml 1.0 transitional//")) or
+ publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
@@ -817,7 +828,7 @@ def getPhases(debug):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
- "noframes", "script", "style", "title"),
+ "script", "style", "title"),
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
@@ -916,8 +927,8 @@ def getPhases(debug):
data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and
- self.tree.openElements[-1].name in ("pre", "listing", "textarea")
- and not self.tree.openElements[-1].hasContent()):
+ self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
+ not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.reconstructActiveFormattingElements()
@@ -944,8 +955,8 @@ def getPhases(debug):
def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"})
- if (len(self.tree.openElements) == 1
- or self.tree.openElements[1].name != "body"):
+ if (len(self.tree.openElements) == 1 or
+ self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
@@ -1143,8 +1154,7 @@ def getPhases(debug):
attributes["name"] = "isindex"
self.processStartTag(self.impliedTagToken("input", "StartTag",
attributes=attributes,
- selfClosing=
- token["selfClosing"]))
+ selfClosing=token["selfClosing"]))
self.processEndTag(self.impliedTagToken("label"))
self.processStartTag(self.impliedTagToken("hr", "StartTag"))
self.processEndTag(self.impliedTagToken("form"))
diff --git a/src/html5lib/sanitizer.py b/src/html5lib/sanitizer.py
index 71dc5212c1..5a05eb1213 100644
--- a/src/html5lib/sanitizer.py
+++ b/src/html5lib/sanitizer.py
@@ -1,12 +1,31 @@
from __future__ import absolute_import, division, unicode_literals
import re
+import sys
from xml.sax.saxutils import escape, unescape
+if sys.version_info[0] == 2:
+ from urlparse import urlparse
+else:
+ from urllib.parse import urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
+content_type_rgx = re.compile(r'''
+    ^
+    # Match a content type <application>/<type>, captured as "content_type"
+    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+    # Match any character set and encoding
+    (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+    |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+    # Assume the rest is data
+    ,.*
+    $
+    ''',
+    re.VERBOSE)
+
+
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
@@ -100,8 +119,8 @@ class HTMLSanitizerMixin(object):
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
- attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
- 'xlink:href', 'xml:base']
+ attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
+                    'dynsrc', 'lowsrc', 'ping', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
@@ -138,7 +157,9 @@ class HTMLSanitizerMixin(object):
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
- 'ssh', 'sftp', 'rtsp', 'afs']
+ 'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+ acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +168,7 @@ class HTMLSanitizerMixin(object):
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
+ allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +211,21 @@ class HTMLSanitizerMixin(object):
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
- if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
- (val_unescaped.split(':')[0] not in
- self.allowed_protocols)):
+ try:
+     uri = urlparse(val_unescaped)  # ``urlparse`` is imported as the function itself, not the module
+ except ValueError:
+     uri = None
del attrs[attr]
+ if uri and uri.scheme:  # values without a scheme are not checked here
+     if uri.scheme not in self.allowed_protocols:
+         del attrs[attr]
+     elif uri.scheme == 'data':  # elif: never delete attrs[attr] a second time (KeyError)
+         m = content_type_rgx.match(uri.path)
+         if not m:
+             del attrs[attr]
+         elif m.group('content_type') not in self.allowed_content_types:
+             del attrs[attr]
+
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
@@ -245,7 +278,7 @@ class HTMLSanitizerMixin(object):
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
- if not keyword in self.acceptable_css_keywords and \
+ if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break
else:
diff --git a/src/html5lib/serializer/htmlserializer.py b/src/html5lib/serializer/htmlserializer.py
index 39a25b79b0..427b42a9c6 100644
--- a/src/html5lib/serializer/htmlserializer.py
+++ b/src/html5lib/serializer/htmlserializer.py
@@ -4,9 +4,6 @@ try:
except NameError:
text_type = str
-import gettext
-_ = gettext.gettext
-
try:
from functools import reduce
except ImportError:
@@ -211,7 +208,7 @@ class HTMLSerializer(object):
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
- self.serializeError(_("System identifer contains both single and double quote characters"))
+ self.serializeError("System identifer contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
@@ -223,7 +220,7 @@ class HTMLSerializer(object):
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("") >= 0:
- self.serializeError(_("Unexpected in CDATA"))
+ self.serializeError("Unexpected in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
@@ -234,7 +231,7 @@ class HTMLSerializer(object):
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
- self.serializeError(_("Unexpected child element of a CDATA element"))
+ self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
@@ -282,20 +279,20 @@ class HTMLSerializer(object):
if name in rcdataElements:
in_cdata = False
elif in_cdata:
- self.serializeError(_("Unexpected child element of a CDATA element"))
+ self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
- self.serializeError(_("Comment contains --"))
+ self.serializeError("Comment contains --")
yield self.encodeStrict("" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
- self.serializeError(_("Entity %s not recognized" % name))
+ self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
diff --git a/src/html5lib/treewalkers/__init__.py b/src/html5lib/treewalkers/__init__.py
index 18124e75f3..20b91b114a 100644
--- a/src/html5lib/treewalkers/__init__.py
+++ b/src/html5lib/treewalkers/__init__.py
@@ -10,8 +10,12 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals
+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
+ "pulldom"]
+
import sys
+from .. import constants
from ..utils import default_etree
treeWalkerCache = {}
@@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    """Collapse every run of adjacent character tokens into one token.
+
+    Tokens whose type is "Characters" or "SpaceCharacters" are buffered
+    and emitted as a single "Characters" token holding the concatenated
+    data. Every other token is yielded unchanged, in its original order.
+    """
+    buffered = []
+    for token in tokens:
+        if token["type"] in ("Characters", "SpaceCharacters"):
+            buffered.append(token["data"])
+            continue
+        if buffered:
+            # A non-character token ends the current run: flush it first.
+            yield {"type": "Characters", "data": "".join(buffered)}
+            buffered = []
+        yield token
+    if buffered:
+        # Flush a run that reaches the end of the token stream.
+        yield {"type": "Characters", "data": "".join(buffered)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers.
+
+    Iterates over the tokens produced by ``walker`` (after merging adjacent
+    character tokens) and returns one string with one entry per line:
+
+    * start/empty tags as ``<name>``, prefixed with the namespace (or its
+      short prefix from ``constants.prefixes``) when the namespace is set
+      and is not the HTML namespace;
+    * attributes as ``name="value"``, sorted, one level deeper than the tag;
+    * comments, doctypes and quoted character data at the current depth.
+
+    Nesting depth is rendered with two spaces per level.
+
+    Raises ``ValueError`` for a token type it does not know.
+    """
+    output = []
+    indent = 0
+    for token in concatenateCharacterTokens(walker):
+        token_type = token["type"]
+        if token_type in ("StartTag", "EmptyTag"):
+            # tag name
+            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
+                if token["namespace"] in constants.prefixes:
+                    ns = constants.prefixes[token["namespace"]]
+                else:
+                    ns = token["namespace"]
+                name = "%s %s" % (ns, token["name"])
+            else:
+                name = token["name"]
+            output.append("%s<%s>" % (" " * indent, name))
+            indent += 2
+            # attributes (sorted for consistent ordering)
+            attrs = token["data"]
+            for (namespace, localname), value in sorted(attrs.items()):
+                if namespace:
+                    if namespace in constants.prefixes:
+                        ns = constants.prefixes[namespace]
+                    else:
+                        ns = namespace
+                    name = "%s %s" % (ns, localname)
+                else:
+                    name = localname
+                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
+            # self-closing: an empty tag has no children, so undo the indent
+            if token_type == "EmptyTag":
+                indent -= 2
+
+        elif token_type == "EndTag":
+            indent -= 2
+
+        elif token_type == "Comment":
+            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
+
+        elif token_type == "Doctype":
+            if token["name"]:
+                if token["publicId"]:
+                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["publicId"],
+                                   token["systemId"] if token["systemId"] else ""))
+                elif token["systemId"]:
+                    # No public identifier: print an empty one as a placeholder.
+                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["systemId"]))
+                else:
+                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
+                                                       token["name"]))
+            else:
+                output.append("%s<!DOCTYPE >" % (" " * indent,))
+
+        elif token_type == "Characters":
+            output.append("%s\"%s\"" % (" " * indent, token["data"]))
+
+        elif token_type == "SpaceCharacters":
+            # Raised explicitly (rather than ``assert False``) so this
+            # invariant check is not stripped when running with ``-O``.
+            raise AssertionError("concatenateCharacterTokens should have got rid of all Space tokens")
+
+        else:
+            raise ValueError("Unknown token type, %s" % token_type)
+
+    return "\n".join(output)
diff --git a/src/html5lib/treewalkers/_base.py b/src/html5lib/treewalkers/_base.py
index d36284bbbe..006dcd1656 100644
--- a/src/html5lib/treewalkers/_base.py
+++ b/src/html5lib/treewalkers/_base.py
@@ -6,8 +6,8 @@ except NameError:
text_type = str
string_types = str,
-import gettext
-_ = gettext.gettext
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+ "TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
@@ -63,7 +63,7 @@ class TreeWalker(object):
"namespace": to_text(namespace),
"data": attrs}
if hasChildren:
- yield self.error(_("Void element has children"))
+ yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
@@ -127,7 +127,7 @@ class TreeWalker(object):
return {"type": "Entity", "name": text_type(name)}
def unknown(self, nodeType):
- return self.error(_("Unknown node type: ") + nodeType)
+ return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker):
diff --git a/src/html5lib/treewalkers/dom.py b/src/html5lib/treewalkers/dom.py
index a01287a944..ac4dcf31bf 100644
--- a/src/html5lib/treewalkers/dom.py
+++ b/src/html5lib/treewalkers/dom.py
@@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
-import gettext
-_ = gettext.gettext
-
from . import _base
diff --git a/src/html5lib/treewalkers/etree.py b/src/html5lib/treewalkers/etree.py
index 3a33756b19..fa94810f10 100644
--- a/src/html5lib/treewalkers/etree.py
+++ b/src/html5lib/treewalkers/etree.py
@@ -7,15 +7,14 @@ except ImportError:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
-import gettext
-_ = gettext.gettext
import re
try:
- text_type = unicode
+ unicode
+ string_types = basestring,
except NameError:
- text_type = str
+ string_types = str,
from . import _base
from ..utils import moduleFactoryFactory
@@ -63,7 +62,7 @@ def getETreeBuilder(ElementTreeImplementation):
return _base.COMMENT, node.text
else:
- assert type(node.tag) == text_type, type(node.tag)
+ assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
diff --git a/src/html5lib/treewalkers/lxmletree.py b/src/html5lib/treewalkers/lxmletree.py
index 4c55f852df..b2f27b1819 100644
--- a/src/html5lib/treewalkers/lxmletree.py
+++ b/src/html5lib/treewalkers/lxmletree.py
@@ -7,9 +7,6 @@ except NameError:
from lxml import etree
from ..treebuilders.etree import tag_regexp
-from gettext import gettext
-_ = gettext
-
from . import _base
from .. import ihatexml
@@ -90,10 +87,6 @@ class FragmentWrapper(object):
self.tail = ensure_str(self.obj.tail)
else:
self.tail = None
- self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
- # Support for bytes here is Py2
- if self.isstring:
- self.obj = ensure_str(self.obj)
def __getattr__(self, name):
return getattr(self.obj, name)
@@ -137,7 +130,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
- assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+ assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return _base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
@@ -146,7 +139,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
- elif isinstance(node, FragmentWrapper) and node.isstring:
+ elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return _base.TEXT, node.obj
elif node.tag == etree.Comment:
@@ -176,7 +169,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
- assert not isinstance(node, tuple), _("Text nodes have no children")
+ assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
@@ -187,7 +180,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
- assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+ assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
@@ -203,7 +196,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
- assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+ assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing