try: frozenset except NameError: # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset try: any except: # Implement 'any' for python 2.4 and previous def any(iterable): for element in iterable: if element: return True return False import sys import inputstream import tokenizer import treebuilders from treebuilders._base import Marker from treebuilders import simpletree import utils from constants import spaceCharacters, asciiUpper2Lower from constants import scopingElements, formattingElements, specialElements from constants import headingElements, tableInsertModeElements from constants import cdataElements, rcdataElements, voidElements from constants import tokenTypes, ReparseException, namespaces def parse(doc, treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parse(doc, encoding=encoding) def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None, namespaceHTMLElements=True): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parseFragment(doc, container=container, encoding=encoding) class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" def __init__(self, tree = simpletree.TreeBuilder, tokenizer = tokenizer.HTMLTokenizer, strict = False, namespaceHTMLElements = True): """ strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be returned. Built in treebuilders can be accessed through html5lib.treebuilders.getTreeBuilder(treeType) tokenizer - a class that provides a stream of tokens to the treebuilder. This may be replaced for e.g. a sanitizer which converts some tags to text """ # Raise an exception on the first error encountered self.strict = strict self.tree = tree(namespaceHTMLElements) self.tokenizer_class = tokenizer self.errors = [] self.phases = { "initial": InitialPhase(self, self.tree), "beforeHtml": BeforeHtmlPhase(self, self.tree), "beforeHead": BeforeHeadPhase(self, self.tree), "inHead": InHeadPhase(self, self.tree), # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree), "afterHead": AfterHeadPhase(self, self.tree), "inBody": InBodyPhase(self, self.tree), "text": TextPhase(self, self.tree), "inTable": InTablePhase(self, self.tree), "inTableText": InTableTextPhase(self, self.tree), "inCaption": InCaptionPhase(self, self.tree), "inColumnGroup": InColumnGroupPhase(self, self.tree), "inTableBody": InTableBodyPhase(self, self.tree), "inRow": InRowPhase(self, self.tree), "inCell": InCellPhase(self, self.tree), "inSelect": InSelectPhase(self, self.tree), "inSelectInTable": InSelectInTablePhase(self, self.tree), "inForeignContent": InForeignContentPhase(self, self.tree), "afterBody": AfterBodyPhase(self, self.tree), "inFrameset": InFramesetPhase(self, self.tree), "afterFrameset": AfterFramesetPhase(self, self.tree), "afterAfterBody": AfterAfterBodyPhase(self, self.tree), "afterAfterFrameset": AfterAfterFramesetPhase(self, self.tree), # XXX after after frameset } def _parse(self, stream, innerHTML=False, container="div", encoding=None, parseMeta=True, useChardet=True, **kwargs): self.innerHTMLMode = innerHTML self.container = container self.tokenizer = self.tokenizer_class(stream, encoding=encoding, parseMeta=parseMeta, useChardet=useChardet, **kwargs) self.reset() while True: try: self.mainLoop() break except ReparseException, e: self.reset() def reset(self): self.tree.reset() self.firstStartTag = False self.errors = [] # "quirks" / "limited quirks" / "no quirks" self.compatMode = "no quirks" if self.innerHTMLMode: self.innerHTML = self.container.lower() if self.innerHTML in cdataElements: self.tokenizer.state = self.tokenizer.rcdataState elif self.innerHTML in rcdataElements: self.tokenizer.state = self.tokenizer.rawtextState elif self.innerHTML == 'plaintext': self.tokenizer.state = self.tokenizer.plaintextState else: # state already is data state # self.tokenizer.state = self.tokenizer.dataState pass self.phase = self.phases["beforeHtml"] self.phase.insertHtmlElement() self.resetInsertionMode() else: self.innerHTML = False self.phase = self.phases["initial"] self.lastPhase = None self.secondaryPhase = None self.beforeRCDataPhase = None self.framesetOK = True def mainLoop(self): (CharactersToken, SpaceCharactersToken, StartTagToken, EndTagToken, CommentToken, DoctypeToken) = (tokenTypes["Characters"], tokenTypes["SpaceCharacters"], tokenTypes["StartTag"], tokenTypes["EndTag"], tokenTypes["Comment"], tokenTypes["Doctype"]) CharactersToken = tokenTypes["Characters"] SpaceCharactersToken = tokenTypes["SpaceCharacters"] StartTagToken = tokenTypes["StartTag"] EndTagToken = tokenTypes["EndTag"] CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] for token in self.normalizedTokens(): type = token["type"] if type == CharactersToken: self.phase.processCharacters(token) elif type == SpaceCharactersToken: self.phase.processSpaceCharacters(token) elif type == StartTagToken: self.selfClosingAcknowledged = False self.phase.processStartTag(token) if (token["selfClosing"] and not self.selfClosingAcknowledged): self.parseError("non-void-element-with-trailing-solidus", {"name":token["name"]}) elif type == EndTagToken: self.phase.processEndTag(token) elif type == CommentToken: self.phase.processComment(token) elif type == DoctypeToken: self.phase.processDoctype(token) else: self.parseError(token["data"], token.get("datavars", {})) # When the loop finishes it's EOF self.phase.processEOF() def normalizedTokens(self): for token in self.tokenizer: yield self.normalizeToken(token) def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): """Parse a HTML document into a well-formed tree stream - a filelike object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) """ self._parse(stream, innerHTML=False, encoding=encoding, parseMeta=parseMeta, useChardet=useChardet) return self.tree.getDocument() def parseFragment(self, stream, container="div", encoding=None, parseMeta=False, useChardet=True): """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property if set to None, default to 'div' stream - a filelike object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) """ self._parse(stream, True, container=container, encoding=encoding) return self.tree.getFragment() def parseError(self, errorcode="XXX-undefined-error", datavars={}): # XXX The idea is to make errorcode mandatory. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ if token["type"] == tokenTypes["StartTag"]: token["data"] = dict(token["data"][::-1]) return token def adjustMathMLAttributes(self, token): replacements = {"definitionurl":"definitionURL"} for k,v in replacements.iteritems(): if k in token["data"]: token["data"][v] = token["data"][k] del token["data"][k] def adjustSVGAttributes(self, token): replacements = { "attributename" : "attributeName", "attributetype" : "attributeType", "basefrequency" : "baseFrequency", "baseprofile" : "baseProfile", "calcmode" : "calcMode", "clippathunits" : "clipPathUnits", "contentscripttype" : "contentScriptType", "contentstyletype" : "contentStyleType", "diffuseconstant" : "diffuseConstant", "edgemode" : "edgeMode", "externalresourcesrequired" : "externalResourcesRequired", "filterres" : "filterRes", "filterunits" : "filterUnits", "glyphref" : "glyphRef", "gradienttransform" : "gradientTransform", "gradientunits" : "gradientUnits", "kernelmatrix" : "kernelMatrix", "kernelunitlength" : "kernelUnitLength", "keypoints" : "keyPoints", "keysplines" : "keySplines", "keytimes" : "keyTimes", "lengthadjust" : "lengthAdjust", "limitingconeangle" : "limitingConeAngle", "markerheight" : "markerHeight", "markerunits" : "markerUnits", "markerwidth" : "markerWidth", "maskcontentunits" : "maskContentUnits", "maskunits" : "maskUnits", "numoctaves" : "numOctaves", "pathlength" : "pathLength", "patterncontentunits" : "patternContentUnits", "patterntransform" : "patternTransform", "patternunits" : "patternUnits", "pointsatx" : "pointsAtX", "pointsaty" : "pointsAtY", "pointsatz" : "pointsAtZ", "preservealpha" : "preserveAlpha", "preserveaspectratio" : "preserveAspectRatio", "primitiveunits" : "primitiveUnits", "refx" : "refX", "refy" : "refY", "repeatcount" : "repeatCount", "repeatdur" : "repeatDur", "requiredextensions" : "requiredExtensions", "requiredfeatures" : "requiredFeatures", "specularconstant" : "specularConstant", "specularexponent" : "specularExponent", "spreadmethod" : "spreadMethod", "startoffset" : "startOffset", "stddeviation" : "stdDeviation", "stitchtiles" : "stitchTiles", "surfacescale" : "surfaceScale", "systemlanguage" : "systemLanguage", "tablevalues" : "tableValues", "targetx" : "targetX", "targety" : "targetY", "textlength" : "textLength", "viewbox" : "viewBox", "viewtarget" : "viewTarget", "xchannelselector" : "xChannelSelector", "ychannelselector" : "yChannelSelector", "zoomandpan" : "zoomAndPan" } for originalName in token["data"].keys(): if originalName in replacements: svgName = replacements[originalName] token["data"][svgName] = token["data"][originalName] del token["data"][originalName] def adjustForeignAttributes(self, token): replacements = { "xlink:actuate":("xlink", "actuate", namespaces["xlink"]), "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]), "xlink:href":("xlink", "href", namespaces["xlink"]), "xlink:role":("xlink", "role", namespaces["xlink"]), "xlink:show":("xlink", "show", namespaces["xlink"]), "xlink:title":("xlink", "title", namespaces["xlink"]), "xlink:type":("xlink", "type", namespaces["xlink"]), "xml:base":("xml", "base", namespaces["xml"]), "xml:lang":("xml", "lang", namespaces["xml"]), "xml:space":("xml", "space", namespaces["xml"]), "xmlns":(None, "xmlns", namespaces["xmlns"]), "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"]) } for originalName in token["data"].iterkeys(): if originalName in replacements: foreignName = replacements[originalName] token["data"][foreignName] = token["data"][originalName] del token["data"][originalName] def resetInsertionMode(self): # The name of this method is mostly historical. (It's also used in the # specification.) last = False newModes = { "select":"inSelect", "td":"inCell", "th":"inCell", "tr":"inRow", "tbody":"inTableBody", "thead":"inTableBody", "tfoot":"inTableBody", "caption":"inCaption", "colgroup":"inColumnGroup", "table":"inTable", "head":"inBody", "body":"inBody", "frameset":"inFrameset" } for node in self.tree.openElements[::-1]: nodeName = node.name if node == self.tree.openElements[0]: last = True if nodeName not in ['td', 'th']: # XXX assert self.innerHTML nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case if nodeName in ("select", "colgroup", "head", "frameset"): # XXX assert self.innerHTML if nodeName in newModes: self.phase = self.phases[newModes[nodeName]] break elif node.namespace in (namespaces["mathml"], namespaces["svg"]): self.phase = self.phases["inForeignContent"] self.secondaryPhase = self.phases["inBody"] break elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] else: self.phase = self.phases["afterHead"] break elif last: self.phase = self.phases["inBody"] break def parseRCDataRawtext(self, token, contentType): """Generic RCDATA/RAWTEXT Parsing algorithm contentType - RCDATA or RAWTEXT """ assert contentType in ("RAWTEXT", "RCDATA") element = self.tree.insertElement(token) if contentType == "RAWTEXT": self.tokenizer.state = self.tokenizer.rawtextState else: self.tokenizer.state = self.tokenizer.rcdataState self.originalPhase = self.phase self.phase = self.phases["text"] class Phase(object): """Base class for helper object that implements each phase of processing """ # Order should be (they can be omitted): # * EOF # * Comment # * Doctype # * SpaceCharacters # * Characters # * StartTag # - startTag* methods # * EndTag # - endTag* methods def __init__(self, parser, tree): self.parser = parser self.tree = tree def processEOF(self): raise NotImplementedError def processComment(self, token): # For most phases the following is correct. Where it's not it will be # overridden. self.tree.insertComment(token, self.tree.openElements[-1]) def processDoctype(self, token): self.parser.parseError("unexpected-doctype") def processCharacters(self, token): self.tree.insertText(token["data"]) def processSpaceCharacters(self, token): self.tree.insertText(token["data"]) def processStartTag(self, token): self.startTagHandler[token["name"]](token) def startTagHtml(self, token): if self.parser.firstStartTag == False and token["name"] == "html": self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in token["data"].iteritems(): if attr not in self.tree.openElements[0].attributes: self.tree.openElements[0].attributes[attr] = value self.parser.firstStartTag = False def processEndTag(self, token): self.endTagHandler[token["name"]](token) class InitialPhase(Phase): def processSpaceCharacters(self, token): pass def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] correct = token["correct"] if (name != "html" or publicId != None or systemId != None and systemId != "about:legacy-compat"): self.parser.parseError("unknown-doctype") if publicId is None: publicId = "" self.tree.insertDoctype(token) if publicId != "": publicId = publicId.translate(asciiUpper2Lower) if (not correct or token["name"] != "html" or publicId.startswith( ("+//silmaril//dtd html pro v0r11 19970101//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//", "-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.1e//", "-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2//", "-//ietf//dtd html 3//", "-//ietf//dtd html level 0//", "-//ietf//dtd html level 1//", "-//ietf//dtd html level 2//", "-//ietf//dtd html level 3//", "-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict//", "-//ietf//dtd html//", "-//metrius//dtd metrius presentational//", "-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 tables//", "-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd strict html//", "-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", "-//spyglass//dtd html 2.0 extended//", "-//sq//dtd html 2.0 hotmetal + extensions//", "-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava strict html//", "-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 970421//", "-//w3c//dtd w3 html//", "-//w3o//dtd w3 html 3.0//", "-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html//")) or publicId in ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", "html") or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId == None or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" elif (publicId.startswith( ("-//w3c//dtd xhtml 1.0 frameset//", "-//w3c//dtd xhtml 1.0 transitional//")) or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and systemId != None): self.parser.compatMode = "limited quirks" self.parser.phase = self.parser.phases["beforeHtml"] def anythingElse(self): self.parser.compatMode = "quirks" self.parser.phase = self.parser.phases["beforeHtml"] def processCharacters(self, token): self.parser.parseError("expected-doctype-but-got-chars") self.anythingElse() self.parser.phase.processCharacters(token) def processStartTag(self, token): self.parser.parseError("expected-doctype-but-got-start-tag", {"name": token["name"]}) self.anythingElse() self.parser.phase.processStartTag(token) def processEndTag(self, token): self.parser.parseError("expected-doctype-but-got-end-tag", {"name": token["name"]}) self.anythingElse() self.parser.phase.processEndTag(token) def processEOF(self): self.parser.parseError("expected-doctype-but-got-eof") self.anythingElse() self.parser.phase.processEOF() class BeforeHtmlPhase(Phase): # helper methods def insertHtmlElement(self): self.tree.insertRoot(impliedTagToken("html", "StartTag")) self.parser.phase = self.parser.phases["beforeHead"] # other def processEOF(self): self.insertHtmlElement() self.parser.phase.processEOF() def processComment(self, token): self.tree.insertComment(token, self.tree.document) def processSpaceCharacters(self, token): pass def processCharacters(self, token): self.insertHtmlElement() self.parser.phase.processCharacters(token) def processStartTag(self, token): if token["name"] == "html": self.parser.firstStartTag = True self.insertHtmlElement() self.parser.phase.processStartTag(token) def processEndTag(self, token): if token["name"] not in ("head", "body", "html", "br"): self.parser.parseError("unexpected-end-tag-before-html", {"name": token["name"]}) else: self.insertHtmlElement() self.parser.phase.processEndTag(token) class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ (("head", "body", "html", "br"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther def processEOF(self): self.startTagHead(impliedTagToken("head", "StartTag")) self.parser.phase.processEOF() def processSpaceCharacters(self, token): pass def processCharacters(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) self.parser.phase.processCharacters(token) def startTagHtml(self, token): self.parser.phases["inBody"].processStartTag(token) def startTagHead(self, token): self.tree.insertElement(token) self.tree.headPointer = self.tree.openElements[-1] self.parser.phase = self.parser.phases["inHead"] def startTagOther(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) self.parser.phase.processStartTag(token) def endTagImplyHead(self, token): self.startTagHead(impliedTagToken("head", "StartTag")) self.parser.phase.processEndTag(token) def endTagOther(self, token): self.parser.parseError("end-tag-after-implied-root", {"name": token["name"]}) class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), ("script", self.startTagScript), (("base", "link", "command"), self.startTagBaseLinkCommand), ("meta", self.startTagMeta), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self. endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), (("br", "html", "body"), self.endTagHtmlBodyBr) ]) self.endTagHandler.default = self.endTagOther # helper def appendToHead(self, element): if self.tree.headPointer is not None: self.tree.headPointer.appendChild(element) else: assert self.parser.innerHTML self.tree.openElementsw[-1].appendChild(element) # the real thing def processEOF (self): self.anythingElse() self.parser.phase.processEOF() def processCharacters(self, token): self.anythingElse() self.parser.phase.processCharacters(token) def startTagHtml(self, token): self.parser.phases["inBody"].processStartTag(token) def startTagHead(self, token): self.parser.parseError("two-heads-are-not-better-than-one") def startTagBaseLinkCommand(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True def startTagMeta(self, token): self.tree.insertElement(token) self.tree.openElements.pop() token["selfClosingAcknowledged"] = True attributes = token["data"] if self.parser.tokenizer.stream.charEncoding[1] == "tentative": if "charset" in attributes: self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) elif "content" in attributes: # Encoding it as UTF-8 here is a hack, as really we should pass # the abstract Unicode string, and just use the # ContentAttrParser on that, but using UTF-8 allows all chars # to be encoded and as a ASCII-superset works. data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) parser = inputstream.ContentAttrParser(data) codec = parser.parse() self.parser.tokenizer.stream.changeEncoding(codec) def startTagTitle(self, token): self.parser.parseRCDataRawtext(token, "RCDATA") def startTagNoScriptNoFramesStyle(self, token): #Need to decide whether to implement the scripting-disabled case self.parser.parseRCDataRawtext(token, "RAWTEXT") def startTagScript(self, token): self.tree.insertElement(token) self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState self.parser.originalPhase = self.parser.phase self.parser.phase = self.parser.phases["text"] def startTagOther(self, token): self.anythingElse() self.parser.phase.processStartTag(token) def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s"%node.name self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): self.anythingElse() self.parser.phase.processEndTag(token) def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to # implement this phase. # # class InHeadNoScriptPhase(Phase): class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("body", self.startTagBody), ("frameset", self.startTagFrameset), (("base", "link", "meta", "noframes", "script", "style", "title"), self.startTagFromHead), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), self.endTagHtmlBodyBr)]) self.endTagHandler.default = self.endTagOther def processEOF(self): self.anythingElse() self.parser.phase.processEOF() def processCharacters(self, token): self.anythingElse() self.parser.phase.processCharacters(token) def startTagBody(self, token): self.parser.framesetOK = False self.tree.insertElement(token) self.parser.phase = self.parser.phases["inBody"] def startTagFrameset(self, token): self.tree.insertElement(token) self.parser.phase = self.parser.phases["inFrameset"] def startTagFromHead(self, token): self.parser.parseError("unexpected-start-tag-out-of-my-head", {"name": token["name"]}) self.tree.openElements.append(self.tree.headPointer) self.parser.phases["inHead"].processStartTag(token) for node in self.tree.openElements[::-1]: if node.name == "head": self.tree.openElements.remove(node) break def startTagHead(self, token): self.parser.parseError("unexpected-start-tag", {"name":token["name"]}) def startTagOther(self, token): self.anythingElse() self.parser.phase.processStartTag(token) def endTagHtmlBodyBr(self, token): self.anythingElse() self.parser.phase.processEndTag(token) def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name":token["name"]}) def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # the really-really-really-very crazy mode def __init__(self, parser, tree): Phase.__init__(self, parser, tree) #Keep a ref to this for special handling of whitespace in
        self.processSpaceCharactersNonPre = self.processSpaceCharacters

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("base", "command", "link", "meta", "noframes", "script", "style", 
              "title"), self.startTagProcessInHead),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("address", "article", "aside", "blockquote", "center", "datagrid",
              "details", "dir", "div", "dl", "fieldset", "figure",
              "footer", "header", "hgroup", "menu", "nav", "ol", "p",
              "section", "ul"),
              self.startTagCloseP),
            (("pre", "listing"), self.startTagPreListing),
            ("form", self.startTagForm),
            (("li", "dd", "dt"), self.startTagListItem),
            ("plaintext",self.startTagPlaintext),
            (headingElements, self.startTagHeading),
            ("a", self.startTagA),
            (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 
              "strong", "tt", "u"),self.startTagFormatting),
            ("nobr", self.startTagNobr),
            ("button", self.startTagButton),
            (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
            ("xmp", self.startTagXmp),
            ("table", self.startTagTable),
            (("area", "basefont", "bgsound", "br", "embed", "img", "input",
              "keygen", "spacer", "wbr"), self.startTagVoidFormatting),
            (("param", "source"), self.startTagParamSource),
            ("hr", self.startTagHr),
            ("image", self.startTagImage),
            ("isindex", self.startTagIsIndex),
            ("textarea", self.startTagTextarea),
            ("iframe", self.startTagIFrame),
            (("noembed", "noframes", "noscript"), self.startTagRawtext),
            ("select", self.startTagSelect),
            (("rp", "rt"), self.startTagRpRt),
            (("option", "optgroup"), self.startTagOpt),
            (("math"), self.startTagMath),
            (("svg"), self.startTagSvg),
            (("caption", "col", "colgroup", "frame", "head",
              "tbody", "td", "tfoot", "th", "thead",
              "tr"), self.startTagMisplaced)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("body",self.endTagBody),
            ("html",self.endTagHtml),
            (("address", "article", "aside", "blockquote", "center", "datagrid",
              "details", "dir", "div", "dl", "fieldset", "figure",
              "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", 
              "section", "ul"), self.endTagBlock),
            ("form", self.endTagForm),
            ("p",self.endTagP),
            (("dd", "dt", "li"), self.endTagListItem),
            (headingElements, self.endTagHeading),
            (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
              "strike", "strong", "tt", "u"), self.endTagFormatting),
            (("applet", "button", "marquee", "object"), self.endTagAppletButtonMarqueeObject),
            ("br", self.endTagBr),
            ])
        self.endTagHandler.default = self.endTagOther

    # helper
    def addFormattingElement(self, token):
        self.tree.insertElement(token)
        self.tree.activeFormattingElements.append(
            self.tree.openElements[-1])

    # the real deal
    def processEOF(self):
        allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                      "tfoot", "th", "thead", "tr", "body",
                                      "html"))
        for node in self.tree.openElements[::-1]:
            if node.name not in allowed_elements:
                self.parser.parseError("expected-closing-tag-but-got-eof")
                break
        #Stop parsing
    
    def processSpaceCharactersDropNewline(self, token):
        # Sometimes (start of 
, , and