diff --git a/src/html5lib/__init__.py b/src/html5lib/__init__.py index 0a43c066b6..a77228056c 100644 --- a/src/html5lib/__init__.py +++ b/src/html5lib/__init__.py @@ -20,4 +20,4 @@ from .serializer import serialize __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", "getTreeWalker", "serialize"] -__version__ = "0.999-dev" +__version__ = "0.999999-dev" diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py index 6d5ccb176b..8ee29ffb7c 100644 --- a/src/html5lib/constants.py +++ b/src/html5lib/constants.py @@ -1,292 +1,290 @@ from __future__ import absolute_import, division, unicode_literals import string -import gettext -_ = gettext.gettext EOF = None E = { "null-character": - _("Null character in input stream, replaced with U+FFFD."), + "Null character in input stream, replaced with U+FFFD.", "invalid-codepoint": - _("Invalid codepoint in stream."), + "Invalid codepoint in stream.", "incorrectly-placed-solidus": - _("Solidus (/) incorrectly placed in tag."), + "Solidus (/) incorrectly placed in tag.", "incorrect-cr-newline-entity": - _("Incorrect CR newline entity, replaced with LF."), + "Incorrect CR newline entity, replaced with LF.", "illegal-windows-1252-entity": - _("Entity used with illegal number (windows-1252 reference)."), + "Entity used with illegal number (windows-1252 reference).", "cant-convert-numeric-entity": - _("Numeric entity couldn't be converted to character " + ("Numeric entity couldn't be converted to character " "(codepoint U+%(charAsInt)08x)."), "illegal-codepoint-for-numeric-entity": - _("Numeric entity represents an illegal codepoint: " + ("Numeric entity represents an illegal codepoint: " "U+%(charAsInt)08x."), "numeric-entity-without-semicolon": - _("Numeric entity didn't end with ';'."), + "Numeric entity didn't end with ';'.", "expected-numeric-entity-but-got-eof": - _("Numeric entity expected. Got end of file instead."), + "Numeric entity expected. Got end of file instead.", "expected-numeric-entity": - _("Numeric entity expected but none found."), + "Numeric entity expected but none found.", "named-entity-without-semicolon": - _("Named entity didn't end with ';'."), + "Named entity didn't end with ';'.", "expected-named-entity": - _("Named entity expected. Got none."), + "Named entity expected. Got none.", "attributes-in-end-tag": - _("End tag contains unexpected attributes."), + "End tag contains unexpected attributes.", 'self-closing-flag-on-end-tag': - _("End tag contains unexpected self-closing flag."), + "End tag contains unexpected self-closing flag.", "expected-tag-name-but-got-right-bracket": - _("Expected tag name. Got '>' instead."), + "Expected tag name. Got '>' instead.", "expected-tag-name-but-got-question-mark": - _("Expected tag name. Got '?' instead. (HTML doesn't " + ("Expected tag name. Got '?' instead. (HTML doesn't " "support processing instructions.)"), "expected-tag-name": - _("Expected tag name. Got something else instead"), + "Expected tag name. Got something else instead", "expected-closing-tag-but-got-right-bracket": - _("Expected closing tag. Got '>' instead. Ignoring ''."), + "Expected closing tag. Got '>' instead. Ignoring ''.", "expected-closing-tag-but-got-eof": - _("Expected closing tag. Unexpected end of file."), + "Expected closing tag. Unexpected end of file.", "expected-closing-tag-but-got-char": - _("Expected closing tag. Unexpected character '%(data)s' found."), + "Expected closing tag. Unexpected character '%(data)s' found.", "eof-in-tag-name": - _("Unexpected end of file in the tag name."), + "Unexpected end of file in the tag name.", "expected-attribute-name-but-got-eof": - _("Unexpected end of file. Expected attribute name instead."), + "Unexpected end of file. Expected attribute name instead.", "eof-in-attribute-name": - _("Unexpected end of file in attribute name."), + "Unexpected end of file in attribute name.", "invalid-character-in-attribute-name": - _("Invalid character in attribute name"), + "Invalid character in attribute name", "duplicate-attribute": - _("Dropped duplicate attribute on tag."), + "Dropped duplicate attribute on tag.", "expected-end-of-tag-name-but-got-eof": - _("Unexpected end of file. Expected = or end of tag."), + "Unexpected end of file. Expected = or end of tag.", "expected-attribute-value-but-got-eof": - _("Unexpected end of file. Expected attribute value."), + "Unexpected end of file. Expected attribute value.", "expected-attribute-value-but-got-right-bracket": - _("Expected attribute value. Got '>' instead."), + "Expected attribute value. Got '>' instead.", 'equals-in-unquoted-attribute-value': - _("Unexpected = in unquoted attribute"), + "Unexpected = in unquoted attribute", 'unexpected-character-in-unquoted-attribute-value': - _("Unexpected character in unquoted attribute"), + "Unexpected character in unquoted attribute", "invalid-character-after-attribute-name": - _("Unexpected character after attribute name."), + "Unexpected character after attribute name.", "unexpected-character-after-attribute-value": - _("Unexpected character after attribute value."), + "Unexpected character after attribute value.", "eof-in-attribute-value-double-quote": - _("Unexpected end of file in attribute value (\")."), + "Unexpected end of file in attribute value (\").", "eof-in-attribute-value-single-quote": - _("Unexpected end of file in attribute value (')."), + "Unexpected end of file in attribute value (').", "eof-in-attribute-value-no-quotes": - _("Unexpected end of file in attribute value."), + "Unexpected end of file in attribute value.", "unexpected-EOF-after-solidus-in-tag": - _("Unexpected end of file in tag. Expected >"), + "Unexpected end of file in tag. Expected >", "unexpected-character-after-solidus-in-tag": - _("Unexpected character after / in tag. Expected >"), + "Unexpected character after / in tag. Expected >", "expected-dashes-or-doctype": - _("Expected '--' or 'DOCTYPE'. Not found."), + "Expected '--' or 'DOCTYPE'. Not found.", "unexpected-bang-after-double-dash-in-comment": - _("Unexpected ! after -- in comment"), + "Unexpected ! after -- in comment", "unexpected-space-after-double-dash-in-comment": - _("Unexpected space after -- in comment"), + "Unexpected space after -- in comment", "incorrect-comment": - _("Incorrect comment."), + "Incorrect comment.", "eof-in-comment": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-dash": - _("Unexpected end of file in comment (-)"), + "Unexpected end of file in comment (-)", "unexpected-dash-after-double-dash-in-comment": - _("Unexpected '-' after '--' found in comment."), + "Unexpected '-' after '--' found in comment.", "eof-in-comment-double-dash": - _("Unexpected end of file in comment (--)."), + "Unexpected end of file in comment (--).", "eof-in-comment-end-space-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-bang-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "unexpected-char-in-comment": - _("Unexpected character in comment found."), + "Unexpected character in comment found.", "need-space-after-doctype": - _("No space after literal string 'DOCTYPE'."), + "No space after literal string 'DOCTYPE'.", "expected-doctype-name-but-got-right-bracket": - _("Unexpected > character. Expected DOCTYPE name."), + "Unexpected > character. Expected DOCTYPE name.", "expected-doctype-name-but-got-eof": - _("Unexpected end of file. Expected DOCTYPE name."), + "Unexpected end of file. Expected DOCTYPE name.", "eof-in-doctype-name": - _("Unexpected end of file in DOCTYPE name."), + "Unexpected end of file in DOCTYPE name.", "eof-in-doctype": - _("Unexpected end of file in DOCTYPE."), + "Unexpected end of file in DOCTYPE.", "expected-space-or-right-bracket-in-doctype": - _("Expected space or '>'. Got '%(data)s'"), + "Expected space or '>'. Got '%(data)s'", "unexpected-end-of-doctype": - _("Unexpected end of DOCTYPE."), + "Unexpected end of DOCTYPE.", "unexpected-char-in-doctype": - _("Unexpected character in DOCTYPE."), + "Unexpected character in DOCTYPE.", "eof-in-innerhtml": - _("XXX innerHTML EOF"), + "XXX innerHTML EOF", "unexpected-doctype": - _("Unexpected DOCTYPE. Ignored."), + "Unexpected DOCTYPE. Ignored.", "non-html-root": - _("html needs to be the first start tag."), + "html needs to be the first start tag.", "expected-doctype-but-got-eof": - _("Unexpected End of file. Expected DOCTYPE."), + "Unexpected End of file. Expected DOCTYPE.", "unknown-doctype": - _("Erroneous DOCTYPE."), + "Erroneous DOCTYPE.", "expected-doctype-but-got-chars": - _("Unexpected non-space characters. Expected DOCTYPE."), + "Unexpected non-space characters. Expected DOCTYPE.", "expected-doctype-but-got-start-tag": - _("Unexpected start tag (%(name)s). Expected DOCTYPE."), + "Unexpected start tag (%(name)s). Expected DOCTYPE.", "expected-doctype-but-got-end-tag": - _("Unexpected end tag (%(name)s). Expected DOCTYPE."), + "Unexpected end tag (%(name)s). Expected DOCTYPE.", "end-tag-after-implied-root": - _("Unexpected end tag (%(name)s) after the (implied) root element."), + "Unexpected end tag (%(name)s) after the (implied) root element.", "expected-named-closing-tag-but-got-eof": - _("Unexpected end of file. Expected end tag (%(name)s)."), + "Unexpected end of file. Expected end tag (%(name)s).", "two-heads-are-not-better-than-one": - _("Unexpected start tag head in existing head. Ignored."), + "Unexpected start tag head in existing head. Ignored.", "unexpected-end-tag": - _("Unexpected end tag (%(name)s). Ignored."), + "Unexpected end tag (%(name)s). Ignored.", "unexpected-start-tag-out-of-my-head": - _("Unexpected start tag (%(name)s) that can be in head. Moved."), + "Unexpected start tag (%(name)s) that can be in head. Moved.", "unexpected-start-tag": - _("Unexpected start tag (%(name)s)."), + "Unexpected start tag (%(name)s).", "missing-end-tag": - _("Missing end tag (%(name)s)."), + "Missing end tag (%(name)s).", "missing-end-tags": - _("Missing end tags (%(name)s)."), + "Missing end tags (%(name)s).", "unexpected-start-tag-implies-end-tag": - _("Unexpected start tag (%(startName)s) " + ("Unexpected start tag (%(startName)s) " "implies end tag (%(endName)s)."), "unexpected-start-tag-treated-as": - _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected start tag (%(originalName)s). Treated as %(newName)s.", "deprecated-tag": - _("Unexpected start tag %(name)s. Don't use it!"), + "Unexpected start tag %(name)s. Don't use it!", "unexpected-start-tag-ignored": - _("Unexpected start tag %(name)s. Ignored."), + "Unexpected start tag %(name)s. Ignored.", "expected-one-end-tag-but-got-another": - _("Unexpected end tag (%(gotName)s). " + ("Unexpected end tag (%(gotName)s). " "Missing end tag (%(expectedName)s)."), "end-tag-too-early": - _("End tag (%(name)s) seen too early. Expected other end tag."), + "End tag (%(name)s) seen too early. Expected other end tag.", "end-tag-too-early-named": - _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).", "end-tag-too-early-ignored": - _("End tag (%(name)s) seen too early. Ignored."), + "End tag (%(name)s) seen too early. Ignored.", "adoption-agency-1.1": - _("End tag (%(name)s) violates step 1, " + ("End tag (%(name)s) violates step 1, " "paragraph 1 of the adoption agency algorithm."), "adoption-agency-1.2": - _("End tag (%(name)s) violates step 1, " + ("End tag (%(name)s) violates step 1, " "paragraph 2 of the adoption agency algorithm."), "adoption-agency-1.3": - _("End tag (%(name)s) violates step 1, " + ("End tag (%(name)s) violates step 1, " "paragraph 3 of the adoption agency algorithm."), "adoption-agency-4.4": - _("End tag (%(name)s) violates step 4, " + ("End tag (%(name)s) violates step 4, " "paragraph 4 of the adoption agency algorithm."), "unexpected-end-tag-treated-as": - _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected end tag (%(originalName)s). Treated as %(newName)s.", "no-end-tag": - _("This element (%(name)s) has no end tag."), + "This element (%(name)s) has no end tag.", "unexpected-implied-end-tag-in-table": - _("Unexpected implied end tag (%(name)s) in the table phase."), + "Unexpected implied end tag (%(name)s) in the table phase.", "unexpected-implied-end-tag-in-table-body": - _("Unexpected implied end tag (%(name)s) in the table body phase."), + "Unexpected implied end tag (%(name)s) in the table body phase.", "unexpected-char-implies-table-voodoo": - _("Unexpected non-space characters in " + ("Unexpected non-space characters in " "table context caused voodoo mode."), "unexpected-hidden-input-in-table": - _("Unexpected input with type hidden in table context."), + "Unexpected input with type hidden in table context.", "unexpected-form-in-table": - _("Unexpected form in table context."), + "Unexpected form in table context.", "unexpected-start-tag-implies-table-voodoo": - _("Unexpected start tag (%(name)s) in " + ("Unexpected start tag (%(name)s) in " "table context caused voodoo mode."), "unexpected-end-tag-implies-table-voodoo": - _("Unexpected end tag (%(name)s) in " + ("Unexpected end tag (%(name)s) in " "table context caused voodoo mode."), "unexpected-cell-in-table-body": - _("Unexpected table cell start tag (%(name)s) " + ("Unexpected table cell start tag (%(name)s) " "in the table body phase."), "unexpected-cell-end-tag": - _("Got table cell end tag (%(name)s) " + ("Got table cell end tag (%(name)s) " "while required end tags are missing."), "unexpected-end-tag-in-table-body": - _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "Unexpected end tag (%(name)s) in the table body phase. Ignored.", "unexpected-implied-end-tag-in-table-row": - _("Unexpected implied end tag (%(name)s) in the table row phase."), + "Unexpected implied end tag (%(name)s) in the table row phase.", "unexpected-end-tag-in-table-row": - _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "Unexpected end tag (%(name)s) in the table row phase. Ignored.", "unexpected-select-in-select": - _("Unexpected select start tag in the select phase " + ("Unexpected select start tag in the select phase " "treated as select end tag."), "unexpected-input-in-select": - _("Unexpected input start tag in the select phase."), + "Unexpected input start tag in the select phase.", "unexpected-start-tag-in-select": - _("Unexpected start tag token (%(name)s in the select phase. " + ("Unexpected start tag token (%(name)s in the select phase. " "Ignored."), "unexpected-end-tag-in-select": - _("Unexpected end tag (%(name)s) in the select phase. Ignored."), + "Unexpected end tag (%(name)s) in the select phase. Ignored.", "unexpected-table-element-start-tag-in-select-in-table": - _("Unexpected table element start tag (%(name)s) in the select in table phase."), + "Unexpected table element start tag (%(name)s) in the select in table phase.", "unexpected-table-element-end-tag-in-select-in-table": - _("Unexpected table element end tag (%(name)s) in the select in table phase."), + "Unexpected table element end tag (%(name)s) in the select in table phase.", "unexpected-char-after-body": - _("Unexpected non-space characters in the after body phase."), + "Unexpected non-space characters in the after body phase.", "unexpected-start-tag-after-body": - _("Unexpected start tag token (%(name)s)" + ("Unexpected start tag token (%(name)s)" " in the after body phase."), "unexpected-end-tag-after-body": - _("Unexpected end tag token (%(name)s)" + ("Unexpected end tag token (%(name)s)" " in the after body phase."), "unexpected-char-in-frameset": - _("Unexpected characters in the frameset phase. Characters ignored."), + "Unexpected characters in the frameset phase. Characters ignored.", "unexpected-start-tag-in-frameset": - _("Unexpected start tag token (%(name)s)" + ("Unexpected start tag token (%(name)s)" " in the frameset phase. Ignored."), "unexpected-frameset-in-frameset-innerhtml": - _("Unexpected end tag token (frameset) " + ("Unexpected end tag token (frameset) " "in the frameset phase (innerHTML)."), "unexpected-end-tag-in-frameset": - _("Unexpected end tag token (%(name)s)" + ("Unexpected end tag token (%(name)s)" " in the frameset phase. Ignored."), "unexpected-char-after-frameset": - _("Unexpected non-space characters in the " + ("Unexpected non-space characters in the " "after frameset phase. Ignored."), "unexpected-start-tag-after-frameset": - _("Unexpected start tag (%(name)s)" + ("Unexpected start tag (%(name)s)" " in the after frameset phase. Ignored."), "unexpected-end-tag-after-frameset": - _("Unexpected end tag (%(name)s)" + ("Unexpected end tag (%(name)s)" " in the after frameset phase. Ignored."), "unexpected-end-tag-after-body-innerhtml": - _("Unexpected end tag after body(innerHtml)"), + "Unexpected end tag after body(innerHtml)", "expected-eof-but-got-char": - _("Unexpected non-space characters. Expected end of file."), + "Unexpected non-space characters. Expected end of file.", "expected-eof-but-got-start-tag": - _("Unexpected start tag (%(name)s)" + ("Unexpected start tag (%(name)s)" ". Expected end of file."), "expected-eof-but-got-end-tag": - _("Unexpected end tag (%(name)s)" + ("Unexpected end tag (%(name)s)" ". Expected end of file."), "eof-in-table": - _("Unexpected end of file. Expected table content."), + "Unexpected end of file. Expected table content.", "eof-in-select": - _("Unexpected end of file. Expected select content."), + "Unexpected end of file. Expected select content.", "eof-in-frameset": - _("Unexpected end of file. Expected frameset content."), + "Unexpected end of file. Expected frameset content.", "eof-in-script-in-script": - _("Unexpected end of file. Expected script content."), + "Unexpected end of file. Expected script content.", "eof-in-foreign-lands": - _("Unexpected end of file. Expected foreign content"), + "Unexpected end of file. Expected foreign content", "non-void-element-with-trailing-solidus": - _("Trailing solidus not allowed on element %(name)s"), + "Trailing solidus not allowed on element %(name)s", "unexpected-html-element-in-foreign-content": - _("Element %(name)s not allowed in a non-html context"), + "Element %(name)s not allowed in a non-html context", "unexpected-end-tag-before-html": - _("Unexpected end tag (%(name)s) before html."), + "Unexpected end tag (%(name)s) before html.", "XXX-undefined-error": - _("Undefined error (this sucks and should be fixed)"), + "Undefined error (this sucks and should be fixed)", } namespaces = { diff --git a/src/html5lib/filters/lint.py b/src/html5lib/filters/lint.py index 7cc99a4ba7..8884696dc5 100644 --- a/src/html5lib/filters/lint.py +++ b/src/html5lib/filters/lint.py @@ -1,8 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from gettext import gettext -_ = gettext - from . import _base from ..constants import cdataElements, rcdataElements, voidElements @@ -23,24 +20,24 @@ class Filter(_base.Filter): if type in ("StartTag", "EmptyTag"): name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name}) + raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if type == "StartTag" and name in voidElements: - raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) elif type == "EmptyTag" and name not in voidElements: - raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]}) + raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag": open_elements.append(name) for name, value in token["data"]: if not isinstance(name, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) if not name: - raise LintError(_("Empty attribute name")) + raise LintError("Empty attribute name") if not isinstance(value, str): - raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value}) + raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) if name in cdataElements: contentModelFlag = "CDATA" elif name in rcdataElements: @@ -51,43 +48,43 @@ class Filter(_base.Filter): elif type == "EndTag": name = token["name"] if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if name in voidElements: - raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) start_name = open_elements.pop() if start_name != name: - raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name}) + raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) contentModelFlag = "PCDATA" elif type == "Comment": if contentModelFlag != "PCDATA": - raise LintError(_("Comment not in PCDATA content model flag")) + raise LintError("Comment not in PCDATA content model flag") elif type in ("Characters", "SpaceCharacters"): data = token["data"] if not isinstance(data, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) if not data: - raise LintError(_("%(type)s token with empty data") % {"type": type}) + raise LintError("%(type)s token with empty data" % {"type": type}) if type == "SpaceCharacters": data = data.strip(spaceCharacters) if data: - raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data}) + raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) elif type == "Doctype": name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name}) + raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? elif type in ("ParseError", "SerializeError"): pass else: - raise LintError(_("Unknown token type: %(type)s") % {"type": type}) + raise LintError("Unknown token type: %(type)s" % {"type": type}) yield token diff --git a/src/html5lib/html5parser.py b/src/html5lib/html5parser.py index 494abf3e2f..c453552a59 100644 --- a/src/html5lib/html5parser.py +++ b/src/html5lib/html5parser.py @@ -11,7 +11,7 @@ from .treebuilders._base import Marker from . import utils from .constants import ( - spaceCharacters, asciiUpper2Lower, specialElements, headingElements, + spaceCharacters, asciiUpper2Lower, specialElements, headingElements, E, cdataElements, rcdataElements, tokenTypes, tagTokenTypes, ReparseException, namespaces, htmlIntegrationPointElements, mathmlTextIntegrationPointElements, adjustForeignAttributes as adjustForeignAttributesMap, adjustSVGAttributes, @@ -141,6 +141,17 @@ class HTMLParser(object): self.framesetOK = True + @property + def documentEncoding(self): + """The name of the character encoding + that was used to decode the input stream, + or :obj:`None` if that is not determined yet. + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.charEncoding[0] + def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): @@ -204,8 +215,8 @@ class HTMLParser(object): elif type == DoctypeToken: new_token = phase.processDoctype(new_token) - if (type == StartTagToken and token["selfClosing"] - and not token["selfClosingAcknowledged"]): + if (type == StartTagToken and token["selfClosing"] and + not token["selfClosingAcknowledged"]): self.parseError("non-void-element-with-trailing-solidus", {"name": token["name"]}) @@ -257,7 +268,7 @@ class HTMLParser(object): # XXX The idea is to make errorcode mandatory. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: - raise ParseError + raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ @@ -449,8 +460,8 @@ def getPhases(debug): if publicId != "": publicId = publicId.translate(asciiUpper2Lower) - if (not correct or token["name"] != "html" - or publicId.startswith( + if (not correct or token["name"] != "html" or + publicId.startswith( ("+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//", @@ -505,21 +516,21 @@ def getPhases(debug): "-//w3c//dtd w3 html//", "-//w3o//dtd w3 html 3.0//", "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) - or publicId in + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", - "html") - or publicId.startswith( + "html") or + publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and - systemId is None - or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) - or publicId.startswith( + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId is not None): @@ -817,7 +828,7 @@ def getPhases(debug): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", - "noframes", "script", "style", "title"), + "script", "style", "title"), self.startTagProcessInHead), ("body", self.startTagBody), ("frameset", self.startTagFrameset), @@ -916,8 +927,8 @@ def getPhases(debug): data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "listing", "textarea") - and not self.tree.openElements[-1].hasContent()): + self.tree.openElements[-1].name in ("pre", "listing", "textarea") and + not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() @@ -944,8 +955,8 @@ def getPhases(debug): def startTagBody(self, token): self.parser.parseError("unexpected-start-tag", {"name": "body"}) - if (len(self.tree.openElements) == 1 - or self.tree.openElements[1].name != "body"): + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: self.parser.framesetOK = False @@ -1143,8 +1154,7 @@ def getPhases(debug): attributes["name"] = "isindex" self.processStartTag(self.impliedTagToken("input", "StartTag", attributes=attributes, - selfClosing= - token["selfClosing"])) + selfClosing=token["selfClosing"])) self.processEndTag(self.impliedTagToken("label")) self.processStartTag(self.impliedTagToken("hr", "StartTag")) self.processEndTag(self.impliedTagToken("form")) diff --git a/src/html5lib/sanitizer.py b/src/html5lib/sanitizer.py index 71dc5212c1..5a05eb1213 100644 --- a/src/html5lib/sanitizer.py +++ b/src/html5lib/sanitizer.py @@ -1,12 +1,31 @@ from __future__ import absolute_import, division, unicode_literals import re +import sys from xml.sax.saxutils import escape, unescape +if sys.version_info[0] == 2: + from urlparse import urlparse +else: + from urllib.parse import urlparse from .tokenizer import HTMLTokenizer from .constants import tokenTypes +content_type_rgx = re.compile(r''' + ^ + # Match a content type / + (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) + # Match any character set and encoding + (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?) + |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?) + # Assume the rest is data + ,.* + $ + ''', + re.VERBOSE) + + class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -100,8 +119,8 @@ class HTMLSanitizerMixin(object): 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] - attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', - 'xlink:href', 'xml:base'] + attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc', + 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base'] svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', @@ -138,7 +157,9 @@ class HTMLSanitizerMixin(object): acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs'] + 'ssh', 'sftp', 'rtsp', 'afs', 'data'] + + acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain'] # subclasses may define their own versions of these constants allowed_elements = acceptable_elements + mathml_elements + svg_elements @@ -147,6 +168,7 @@ class HTMLSanitizerMixin(object): allowed_css_keywords = acceptable_css_keywords allowed_svg_properties = acceptable_svg_properties allowed_protocols = acceptable_protocols + allowed_content_types = acceptable_content_types # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style @@ -189,10 +211,21 @@ class HTMLSanitizerMixin(object): unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and - (val_unescaped.split(':')[0] not in - self.allowed_protocols)): + try: + uri = urlparse.urlparse(val_unescaped) + except ValueError: + uri = None del attrs[attr] + if uri and uri.scheme: + if uri.scheme not in self.allowed_protocols: + del attrs[attr] + if uri.scheme == 'data': + m = content_type_rgx.match(uri.path) + if not m: + del attrs[attr] + elif m.group('content_type') not in self.allowed_content_types: + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', @@ -245,7 +278,7 @@ class HTMLSanitizerMixin(object): elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: for keyword in value.split(): - if not keyword in self.acceptable_css_keywords and \ + if keyword not in self.acceptable_css_keywords and \ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): break else: diff --git a/src/html5lib/serializer/htmlserializer.py b/src/html5lib/serializer/htmlserializer.py index 39a25b79b0..427b42a9c6 100644 --- a/src/html5lib/serializer/htmlserializer.py +++ b/src/html5lib/serializer/htmlserializer.py @@ -4,9 +4,6 @@ try: except NameError: text_type = str -import gettext -_ = gettext.gettext - try: from functools import reduce except ImportError: @@ -211,7 +208,7 @@ class HTMLSerializer(object): if token["systemId"]: if token["systemId"].find('"') >= 0: if token["systemId"].find("'") >= 0: - self.serializeError(_("System identifer contains both single and double quote characters")) + self.serializeError("System identifer contains both single and double quote characters") quote_char = "'" else: quote_char = '"' @@ -223,7 +220,7 @@ class HTMLSerializer(object): elif type in ("Characters", "SpaceCharacters"): if type == "SpaceCharacters" or in_cdata: if in_cdata and token["data"].find("= 0: - self.serializeError(_("Unexpected " % name) elif type == "Comment": data = token["data"] if data.find("--") >= 0: - self.serializeError(_("Comment contains --")) + self.serializeError("Comment contains --") yield self.encodeStrict("" % token["data"]) elif type == "Entity": name = token["name"] key = name + ";" if not key in entities: - self.serializeError(_("Entity %s not recognized" % name)) + self.serializeError("Entity %s not recognized" % name) if self.resolve_entities and key not in xmlEntities: data = entities[key] else: diff --git a/src/html5lib/treewalkers/__init__.py b/src/html5lib/treewalkers/__init__.py index 18124e75f3..20b91b114a 100644 --- a/src/html5lib/treewalkers/__init__.py +++ b/src/html5lib/treewalkers/__init__.py @@ -10,8 +10,12 @@ returning an iterator generating tokens. from __future__ import absolute_import, division, unicode_literals +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree", + "pulldom"] + import sys +from .. import constants from ..utils import default_etree treeWalkerCache = {} @@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs): # XXX: NEVER cache here, caching is done in the etree submodule return etree.getETreeModule(implementation, **kwargs).TreeWalker return treeWalkerCache.get(treeType) + + +def concatenateCharacterTokens(tokens): + pendingCharacters = [] + for token in tokens: + type = token["type"] + if type in ("Characters", "SpaceCharacters"): + pendingCharacters.append(token["data"]) + else: + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + pendingCharacters = [] + yield token + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + + +def pprint(walker): + """Pretty printer for tree walkers""" + output = [] + indent = 0 + for token in concatenateCharacterTokens(walker): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + # tag name + if token["namespace"] and token["namespace"] != constants.namespaces["html"]: + if token["namespace"] in constants.prefixes: + ns = constants.prefixes[token["namespace"]] + else: + ns = token["namespace"] + name = "%s %s" % (ns, token["name"]) + else: + name = token["name"] + output.append("%s<%s>" % (" " * indent, name)) + indent += 2 + # attributes (sorted for consistent ordering) + attrs = token["data"] + for (namespace, localname), value in sorted(attrs.items()): + if namespace: + if namespace in constants.prefixes: + ns = constants.prefixes[namespace] + else: + ns = namespace + name = "%s %s" % (ns, localname) + else: + name = localname + output.append("%s%s=\"%s\"" % (" " * indent, name, value)) + # self-closing + if type == "EmptyTag": + indent -= 2 + + elif type == "EndTag": + indent -= 2 + + elif type == "Comment": + output.append("%s" % (" " * indent, token["data"])) + + elif type == "Doctype": + if token["name"]: + if token["publicId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["publicId"], + token["systemId"] if token["systemId"] else "")) + elif token["systemId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["systemId"])) + else: + output.append("%s" % (" " * indent, + token["name"])) + else: + output.append("%s" % (" " * indent,)) + + elif type == "Characters": + output.append("%s\"%s\"" % (" " * indent, token["data"])) + + elif type == "SpaceCharacters": + assert False, "concatenateCharacterTokens should have got rid of all Space tokens" + + else: + raise ValueError("Unknown token type, %s" % type) + + return "\n".join(output) diff --git a/src/html5lib/treewalkers/_base.py b/src/html5lib/treewalkers/_base.py index d36284bbbe..006dcd1656 100644 --- a/src/html5lib/treewalkers/_base.py +++ b/src/html5lib/treewalkers/_base.py @@ -6,8 +6,8 @@ except NameError: text_type = str string_types = str, -import gettext -_ = gettext.gettext +__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", + "TreeWalker", "NonRecursiveTreeWalker"] from xml.dom import Node @@ -63,7 +63,7 @@ class TreeWalker(object): "namespace": to_text(namespace), "data": attrs} if hasChildren: - yield self.error(_("Void element has children")) + yield self.error("Void element has children") def startTag(self, namespace, name, attrs): assert namespace is None or isinstance(namespace, string_types), type(namespace) @@ -127,7 +127,7 @@ class TreeWalker(object): return {"type": "Entity", "name": text_type(name)} def unknown(self, nodeType): - return self.error(_("Unknown node type: ") + nodeType) + return self.error("Unknown node type: " + nodeType) class NonRecursiveTreeWalker(TreeWalker): diff --git a/src/html5lib/treewalkers/dom.py b/src/html5lib/treewalkers/dom.py index a01287a944..ac4dcf31bf 100644 --- a/src/html5lib/treewalkers/dom.py +++ b/src/html5lib/treewalkers/dom.py @@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals from xml.dom import Node -import gettext -_ = gettext.gettext - from . import _base diff --git a/src/html5lib/treewalkers/etree.py b/src/html5lib/treewalkers/etree.py index 3a33756b19..fa94810f10 100644 --- a/src/html5lib/treewalkers/etree.py +++ b/src/html5lib/treewalkers/etree.py @@ -7,15 +7,14 @@ except ImportError: from ordereddict import OrderedDict except ImportError: OrderedDict = dict -import gettext -_ = gettext.gettext import re try: - text_type = unicode + unicode + string_types = basestring, except NameError: - text_type = str + string_types = str, from . import _base from ..utils import moduleFactoryFactory @@ -63,7 +62,7 @@ def getETreeBuilder(ElementTreeImplementation): return _base.COMMENT, node.text else: - assert type(node.tag) == text_type, type(node.tag) + assert isinstance(node.tag, string_types), type(node.tag) # This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: diff --git a/src/html5lib/treewalkers/lxmletree.py b/src/html5lib/treewalkers/lxmletree.py index 4c55f852df..b2f27b1819 100644 --- a/src/html5lib/treewalkers/lxmletree.py +++ b/src/html5lib/treewalkers/lxmletree.py @@ -7,9 +7,6 @@ except NameError: from lxml import etree from ..treebuilders.etree import tag_regexp -from gettext import gettext -_ = gettext - from . import _base from .. import ihatexml @@ -90,10 +87,6 @@ class FragmentWrapper(object): self.tail = ensure_str(self.obj.tail) else: self.tail = None - self.isstring = isinstance(obj, str) or isinstance(obj, bytes) - # Support for bytes here is Py2 - if self.isstring: - self.obj = ensure_str(self.obj) def __getattr__(self, name): return getattr(self.obj, name) @@ -137,7 +130,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key return _base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): @@ -146,7 +139,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): elif isinstance(node, Doctype): return _base.DOCTYPE, node.name, node.public_id, node.system_id - elif isinstance(node, FragmentWrapper) and node.isstring: + elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): return _base.TEXT, node.obj elif node.tag == etree.Comment: @@ -176,7 +169,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): attrs, len(node) > 0 or node.text) def getFirstChild(self, node): - assert not isinstance(node, tuple), _("Text nodes have no children") + assert not isinstance(node, tuple), "Text nodes have no children" assert len(node) or node.text, "Node has no children" if node.text: @@ -187,7 +180,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getNextSibling(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": # XXX: we cannot use a "bool(node) and node[0] or None" construct here # because node[0] might evaluate to False if it has no child element @@ -203,7 +196,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): def getParentNode(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": return node # else: fallback to "normal" processing