mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add support for line numbers to the HTML 5 parser
This commit is contained in:
parent
0d1c917281
commit
9503652a4b
@ -256,8 +256,9 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
documentClass = Document
|
documentClass = Document
|
||||||
doctypeClass = DocType
|
doctypeClass = DocType
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements=True):
|
def __init__(self, namespaceHTMLElements=True, linenumber_attribute=None):
|
||||||
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
||||||
|
self.linenumber_attribute = linenumber_attribute
|
||||||
self.lxml_context = create_lxml_context()
|
self.lxml_context = create_lxml_context()
|
||||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
||||||
self.proxy_cache = []
|
self.proxy_cache = []
|
||||||
@ -304,6 +305,20 @@ class TreeBuilder(BaseTreeBuilder):
|
|||||||
elem.name = token_name
|
elem.name = token_name
|
||||||
elem.namespace = elem.nsmap[elem.prefix]
|
elem.namespace = elem.nsmap[elem.prefix]
|
||||||
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
elem.nameTuple = (elem.nsmap[elem.prefix], elem.name)
|
||||||
|
position = token.get('position', None)
|
||||||
|
if position is not None:
|
||||||
|
# Unfortunately, libxml2 can only store line numbers up to 65535
|
||||||
|
# (unsigned short). If you really need to work around this, use the
|
||||||
|
# patch here:
|
||||||
|
# https://bug325533.bugzilla-attachments.gnome.org/attachment.cgi?id=56951
|
||||||
|
# (replacing int with size_t) and patching lxml correspondingly to
|
||||||
|
# get rid of the OverflowError
|
||||||
|
try:
|
||||||
|
elem.sourceline = position[0][0]
|
||||||
|
except OverflowError:
|
||||||
|
elem.sourceline = 65535
|
||||||
|
if self.linenumber_attribute is not None:
|
||||||
|
elem.set(self.linenumber_attribute, str(position[0][0]))
|
||||||
return elem
|
return elem
|
||||||
|
|
||||||
def insertElementNormal(self, token):
|
def insertElementNormal(self, token):
|
||||||
@ -367,8 +382,9 @@ def process_namespace_free_attribs(attrs):
|
|||||||
|
|
||||||
class NoNamespaceTreeBuilder(TreeBuilder):
|
class NoNamespaceTreeBuilder(TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements=False):
|
def __init__(self, namespaceHTMLElements=False, linenumber_attribute=None):
|
||||||
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
BaseTreeBuilder.__init__(self, namespaceHTMLElements)
|
||||||
|
self.linenumber_attribute = linenumber_attribute
|
||||||
self.lxml_context = create_lxml_context()
|
self.lxml_context = create_lxml_context()
|
||||||
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
self.elementClass = partial(ElementFactory, context=self.lxml_context)
|
||||||
self.proxy_cache = []
|
self.proxy_cache = []
|
||||||
@ -387,6 +403,14 @@ class NoNamespaceTreeBuilder(TreeBuilder):
|
|||||||
elem.name = elem.tag
|
elem.name = elem.tag
|
||||||
elem.namespace = token.get('namespace', self.defaultNamespace)
|
elem.namespace = token.get('namespace', self.defaultNamespace)
|
||||||
elem.nameTuple = (elem.namespace or html_ns, elem.name)
|
elem.nameTuple = (elem.namespace or html_ns, elem.name)
|
||||||
|
position = token.get('position', None)
|
||||||
|
if position is not None:
|
||||||
|
try:
|
||||||
|
elem.sourceline = position[0][0]
|
||||||
|
except OverflowError:
|
||||||
|
elem.sourceline = 65535
|
||||||
|
if self.linenumber_attribute is not None:
|
||||||
|
elem.set(self.linenumber_attribute, str(position[0][0]))
|
||||||
return elem
|
return elem
|
||||||
|
|
||||||
def apply_html_attributes(self, attrs):
|
def apply_html_attributes(self, attrs):
|
||||||
@ -401,6 +425,7 @@ class NoNamespaceTreeBuilder(TreeBuilder):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
html.set(to_xml_name(k), v)
|
html.set(to_xml_name(k), v)
|
||||||
|
|
||||||
|
# Input Stream {{{
|
||||||
_regex_cache = {}
|
_regex_cache = {}
|
||||||
|
|
||||||
class FastStream(object):
|
class FastStream(object):
|
||||||
@ -414,7 +439,7 @@ class FastStream(object):
|
|||||||
self.charEncoding = ("utf-8", "certain")
|
self.charEncoding = ("utf-8", "certain")
|
||||||
self.track_position = track_position
|
self.track_position = track_position
|
||||||
if track_position:
|
if track_position:
|
||||||
self.new_lines = tuple(m.start() for m in re.finditer(r'\n', raw))
|
self.new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.pos = 0
|
self.pos = 0
|
||||||
@ -451,17 +476,24 @@ class FastStream(object):
|
|||||||
def position(self):
|
def position(self):
|
||||||
if not self.track_position:
|
if not self.track_position:
|
||||||
return (-1, -1)
|
return (-1, -1)
|
||||||
lnum = bisect(self.new_lines, self.pos)
|
pos = self.pos
|
||||||
if lnum == 0:
|
lnum = bisect(self.new_lines, pos)
|
||||||
return (1, self.pos)
|
# lnum is the line from which the next char() will come, therefore the
|
||||||
return (lnum, self.pos - self.new_lines[lnum - 1])
|
# current char is a \n and \n is given the line number of the line it
|
||||||
|
# creates.
|
||||||
|
try:
|
||||||
|
offset = self.new_lines[lnum - 1] - pos
|
||||||
|
except IndexError:
|
||||||
|
offset = pos
|
||||||
|
return (lnum + 1, offset)
|
||||||
|
# }}}
|
||||||
|
|
||||||
if len("\U0010FFFF") == 1: # UCS4 build
|
if len("\U0010FFFF") == 1: # UCS4 build
|
||||||
replace_chars = re.compile("[\uD800-\uDFFF]")
|
replace_chars = re.compile("[\uD800-\uDFFF]")
|
||||||
else:
|
else:
|
||||||
replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
replace_chars = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||||
|
|
||||||
def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True):
|
def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None):
|
||||||
if isinstance(raw, bytes):
|
if isinstance(raw, bytes):
|
||||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||||
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
raw = fix_self_closing_cdata_tags(raw) # TODO: Handle this in the parser
|
||||||
@ -471,10 +503,10 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=Tr
|
|||||||
|
|
||||||
stream_class = partial(FastStream, track_position=line_numbers)
|
stream_class = partial(FastStream, track_position=line_numbers)
|
||||||
stream = stream_class(raw)
|
stream = stream_class(raw)
|
||||||
builder = NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder
|
builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
parser = HTMLParser(tree=builder, namespaceHTMLElements=not discard_namespaces)
|
parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter('ignore', category=DataLossWarning)
|
warnings.simplefilter('ignore', category=DataLossWarning)
|
||||||
try:
|
try:
|
||||||
@ -495,8 +527,8 @@ def parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=Tr
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
# root = parse('\n<html><head><title>a\n</title><p> \n<b>b', discard_namespaces=False)
|
root = parse('\n<html><head><title>a\n</title><p> \n<b>b', discard_namespaces=False)
|
||||||
root = parse('\n<html><p><svg viewbox="0 0 0 0"><image xlink:href="xxx"/><b></svg> \n<b>xxx', discard_namespaces=False)
|
# root = parse('\n<html><p><svg viewbox="0 0 0 0"><image xlink:href="xxx"/><b></svg> \n<b>xxx', discard_namespaces=False)
|
||||||
print (etree.tostring(root, encoding='utf-8'))
|
print (etree.tostring(root, encoding='utf-8'))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
@ -141,4 +141,13 @@ class ParsingTests(BaseTest):
|
|||||||
test(self, parse)
|
test(self, parse)
|
||||||
|
|
||||||
root = parse('<html><p><svg><image /><b></svg> \n<b>xxx', discard_namespaces=True)
|
root = parse('<html><p><svg><image /><b></svg> \n<b>xxx', discard_namespaces=True)
|
||||||
|
self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
|
||||||
self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')
|
self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')
|
||||||
|
|
||||||
|
for ds in (False, True):
|
||||||
|
src = '\n<html>\n<p>\n<svg><image />\n<b></svg> '
|
||||||
|
root = parse(src, discard_namespaces=ds)
|
||||||
|
for tag, lnum in {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}.iteritems():
|
||||||
|
elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
|
||||||
|
self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))
|
||||||
|
|
||||||
|
@ -37,6 +37,7 @@ def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
|
|||||||
|
|
||||||
def method_decorator_metaclass(function):
|
def method_decorator_metaclass(function):
|
||||||
class Decorated(type):
|
class Decorated(type):
|
||||||
|
|
||||||
def __new__(meta, classname, bases, classDict):
|
def __new__(meta, classname, bases, classDict):
|
||||||
for attributeName, attribute in classDict.items():
|
for attributeName, attribute in classDict.items():
|
||||||
if isinstance(attribute, types.FunctionType):
|
if isinstance(attribute, types.FunctionType):
|
||||||
@ -48,11 +49,12 @@ def method_decorator_metaclass(function):
|
|||||||
|
|
||||||
|
|
||||||
class HTMLParser(object):
|
class HTMLParser(object):
|
||||||
|
|
||||||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
"""HTML parser. Generates a tree structure from a stream of (possibly
|
||||||
malformed) HTML"""
|
malformed) HTML"""
|
||||||
|
|
||||||
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
|
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
|
||||||
strict=False, namespaceHTMLElements=True, debug=False):
|
strict=False, namespaceHTMLElements=True, debug=False, track_positions=False):
|
||||||
"""
|
"""
|
||||||
strict - raise an exception when a parse error is encountered
|
strict - raise an exception when a parse error is encountered
|
||||||
|
|
||||||
@ -67,6 +69,7 @@ class HTMLParser(object):
|
|||||||
|
|
||||||
# Raise an exception on the first error encountered
|
# Raise an exception on the first error encountered
|
||||||
self.strict = strict
|
self.strict = strict
|
||||||
|
self.track_positions = track_positions
|
||||||
|
|
||||||
if tree is None:
|
if tree is None:
|
||||||
tree = treebuilders.getTreeBuilder("etree")
|
tree = treebuilders.getTreeBuilder("etree")
|
||||||
@ -85,6 +88,7 @@ class HTMLParser(object):
|
|||||||
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
|
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
|
||||||
parseMeta=parseMeta,
|
parseMeta=parseMeta,
|
||||||
useChardet=useChardet,
|
useChardet=useChardet,
|
||||||
|
track_positions=self.track_positions,
|
||||||
parser=self, **kwargs)
|
parser=self, **kwargs)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@ -406,6 +410,15 @@ class HTMLParser(object):
|
|||||||
|
|
||||||
self.phase = self.phases["text"]
|
self.phase = self.phases["text"]
|
||||||
|
|
||||||
|
def impliedTagToken(self, name, type="EndTag", attributes=None,
|
||||||
|
selfClosing=False):
|
||||||
|
if attributes is None:
|
||||||
|
attributes = {}
|
||||||
|
ans = {"type": tokenTypes[type], "name": name, "data": attributes,
|
||||||
|
"selfClosing": selfClosing}
|
||||||
|
if self.track_positions:
|
||||||
|
ans['position'] = (self.tokenizer.stream.position(), True)
|
||||||
|
return ans
|
||||||
|
|
||||||
def getPhases(debug):
|
def getPhases(debug):
|
||||||
def log(function):
|
def log(function):
|
||||||
@ -440,12 +453,14 @@ def getPhases(debug):
|
|||||||
return type
|
return type
|
||||||
|
|
||||||
class Phase(with_metaclass(getMetaclass(debug, log))):
|
class Phase(with_metaclass(getMetaclass(debug, log))):
|
||||||
|
|
||||||
"""Base class for helper object that implements each phase of processing
|
"""Base class for helper object that implements each phase of processing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
self.tree = tree
|
self.tree = tree
|
||||||
|
self.impliedTagToken = parser.impliedTagToken
|
||||||
|
|
||||||
def processEOF(self):
|
def processEOF(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@ -479,6 +494,7 @@ def getPhases(debug):
|
|||||||
return self.endTagHandler[token["name"]](token)
|
return self.endTagHandler[token["name"]](token)
|
||||||
|
|
||||||
class InitialPhase(Phase):
|
class InitialPhase(Phase):
|
||||||
|
|
||||||
def processSpaceCharacters(self, token):
|
def processSpaceCharacters(self, token):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -609,8 +625,9 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class BeforeHtmlPhase(Phase):
|
class BeforeHtmlPhase(Phase):
|
||||||
# helper methods
|
# helper methods
|
||||||
|
|
||||||
def insertHtmlElement(self):
|
def insertHtmlElement(self):
|
||||||
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
|
self.tree.insertRoot(self.impliedTagToken("html", "StartTag"))
|
||||||
self.parser.phase = self.parser.phases["beforeHead"]
|
self.parser.phase = self.parser.phases["beforeHead"]
|
||||||
|
|
||||||
# other
|
# other
|
||||||
@ -643,6 +660,7 @@ def getPhases(debug):
|
|||||||
return token
|
return token
|
||||||
|
|
||||||
class BeforeHeadPhase(Phase):
|
class BeforeHeadPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -658,14 +676,14 @@ def getPhases(debug):
|
|||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
||||||
def processEOF(self):
|
def processEOF(self):
|
||||||
self.startTagHead(impliedTagToken("head", "StartTag"))
|
self.startTagHead(self.impliedTagToken("head", "StartTag"))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def processSpaceCharacters(self, token):
|
def processSpaceCharacters(self, token):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def processCharacters(self, token):
|
def processCharacters(self, token):
|
||||||
self.startTagHead(impliedTagToken("head", "StartTag"))
|
self.startTagHead(self.impliedTagToken("head", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def startTagHtml(self, token):
|
def startTagHtml(self, token):
|
||||||
@ -677,11 +695,11 @@ def getPhases(debug):
|
|||||||
self.parser.phase = self.parser.phases["inHead"]
|
self.parser.phase = self.parser.phases["inHead"]
|
||||||
|
|
||||||
def startTagOther(self, token):
|
def startTagOther(self, token):
|
||||||
self.startTagHead(impliedTagToken("head", "StartTag"))
|
self.startTagHead(self.impliedTagToken("head", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def endTagImplyHead(self, token):
|
def endTagImplyHead(self, token):
|
||||||
self.startTagHead(impliedTagToken("head", "StartTag"))
|
self.startTagHead(self.impliedTagToken("head", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def endTagOther(self, token):
|
def endTagOther(self, token):
|
||||||
@ -689,6 +707,7 @@ def getPhases(debug):
|
|||||||
{"name": token["name"]})
|
{"name": token["name"]})
|
||||||
|
|
||||||
class InHeadPhase(Phase):
|
class InHeadPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -781,13 +800,14 @@ def getPhases(debug):
|
|||||||
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
||||||
|
|
||||||
def anythingElse(self):
|
def anythingElse(self):
|
||||||
self.endTagHead(impliedTagToken("head"))
|
self.endTagHead(self.impliedTagToken("head"))
|
||||||
|
|
||||||
# XXX If we implement a parser for which scripting is disabled we need to
|
# XXX If we implement a parser for which scripting is disabled we need to
|
||||||
# implement this phase.
|
# implement this phase.
|
||||||
#
|
#
|
||||||
# class InHeadNoScriptPhase(Phase):
|
# class InHeadNoScriptPhase(Phase):
|
||||||
class AfterHeadPhase(Phase):
|
class AfterHeadPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -850,13 +870,14 @@ def getPhases(debug):
|
|||||||
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
||||||
|
|
||||||
def anythingElse(self):
|
def anythingElse(self):
|
||||||
self.tree.insertElement(impliedTagToken("body", "StartTag"))
|
self.tree.insertElement(self.impliedTagToken("body", "StartTag"))
|
||||||
self.parser.phase = self.parser.phases["inBody"]
|
self.parser.phase = self.parser.phases["inBody"]
|
||||||
self.parser.framesetOK = True
|
self.parser.framesetOK = True
|
||||||
|
|
||||||
class InBodyPhase(Phase):
|
class InBodyPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
|
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
|
||||||
# the really-really-really-very crazy mode
|
# the really-really-really-very crazy mode
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -1027,12 +1048,12 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagCloseP(self, token):
|
def startTagCloseP(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
|
|
||||||
def startTagPreListing(self, token):
|
def startTagPreListing(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
self.parser.framesetOK = False
|
self.parser.framesetOK = False
|
||||||
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
||||||
@ -1042,7 +1063,7 @@ def getPhases(debug):
|
|||||||
self.parser.parseError("unexpected-start-tag", {"name": "form"})
|
self.parser.parseError("unexpected-start-tag", {"name": "form"})
|
||||||
else:
|
else:
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
self.tree.formPointer = self.tree.openElements[-1]
|
self.tree.formPointer = self.tree.openElements[-1]
|
||||||
|
|
||||||
@ -1056,7 +1077,7 @@ def getPhases(debug):
|
|||||||
for node in reversed(self.tree.openElements):
|
for node in reversed(self.tree.openElements):
|
||||||
if node.name in stopNames:
|
if node.name in stopNames:
|
||||||
self.parser.phase.processEndTag(
|
self.parser.phase.processEndTag(
|
||||||
impliedTagToken(node.name, "EndTag"))
|
self.impliedTagToken(node.name, "EndTag"))
|
||||||
break
|
break
|
||||||
if (node.nameTuple in specialElements and
|
if (node.nameTuple in specialElements and
|
||||||
node.name not in ("address", "div", "p")):
|
node.name not in ("address", "div", "p")):
|
||||||
@ -1064,19 +1085,19 @@ def getPhases(debug):
|
|||||||
|
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.parser.phase.processEndTag(
|
self.parser.phase.processEndTag(
|
||||||
impliedTagToken("p", "EndTag"))
|
self.impliedTagToken("p", "EndTag"))
|
||||||
|
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
|
|
||||||
def startTagPlaintext(self, token):
|
def startTagPlaintext(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
|
self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
|
||||||
|
|
||||||
def startTagHeading(self, token):
|
def startTagHeading(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
if self.tree.openElements[-1].name in headingElements:
|
if self.tree.openElements[-1].name in headingElements:
|
||||||
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
|
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
|
||||||
self.tree.openElements.pop()
|
self.tree.openElements.pop()
|
||||||
@ -1087,7 +1108,7 @@ def getPhases(debug):
|
|||||||
if afeAElement is not False:
|
if afeAElement is not False:
|
||||||
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
||||||
{"startName": "a", "endName": "a"})
|
{"startName": "a", "endName": "a"})
|
||||||
self.endTagFormatting(impliedTagToken("a"))
|
self.endTagFormatting(self.impliedTagToken("a"))
|
||||||
if afeAElement in self.tree.openElements:
|
if afeAElement in self.tree.openElements:
|
||||||
self.tree.openElements.remove(afeAElement)
|
self.tree.openElements.remove(afeAElement)
|
||||||
if afeAElement in self.tree.activeFormattingElements:
|
if afeAElement in self.tree.activeFormattingElements:
|
||||||
@ -1104,7 +1125,7 @@ def getPhases(debug):
|
|||||||
if self.tree.elementInScope("nobr"):
|
if self.tree.elementInScope("nobr"):
|
||||||
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
||||||
{"startName": "nobr", "endName": "nobr"})
|
{"startName": "nobr", "endName": "nobr"})
|
||||||
self.processEndTag(impliedTagToken("nobr"))
|
self.processEndTag(self.impliedTagToken("nobr"))
|
||||||
# XXX Need tests that trigger the following
|
# XXX Need tests that trigger the following
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.addFormattingElement(token)
|
self.addFormattingElement(token)
|
||||||
@ -1113,7 +1134,7 @@ def getPhases(debug):
|
|||||||
if self.tree.elementInScope("button"):
|
if self.tree.elementInScope("button"):
|
||||||
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
||||||
{"startName": "button", "endName": "button"})
|
{"startName": "button", "endName": "button"})
|
||||||
self.processEndTag(impliedTagToken("button"))
|
self.processEndTag(self.impliedTagToken("button"))
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
@ -1128,7 +1149,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagXmp(self, token):
|
def startTagXmp(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.parser.framesetOK = False
|
self.parser.framesetOK = False
|
||||||
self.parser.parseRCDataRawtext(token, "RAWTEXT")
|
self.parser.parseRCDataRawtext(token, "RAWTEXT")
|
||||||
@ -1136,7 +1157,7 @@ def getPhases(debug):
|
|||||||
def startTagTable(self, token):
|
def startTagTable(self, token):
|
||||||
if self.parser.compatMode != "quirks":
|
if self.parser.compatMode != "quirks":
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.processEndTag(impliedTagToken("p"))
|
self.processEndTag(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
self.parser.framesetOK = False
|
self.parser.framesetOK = False
|
||||||
self.parser.phase = self.parser.phases["inTable"]
|
self.parser.phase = self.parser.phases["inTable"]
|
||||||
@ -1163,7 +1184,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagHr(self, token):
|
def startTagHr(self, token):
|
||||||
if self.tree.elementInScope("p", variant="button"):
|
if self.tree.elementInScope("p", variant="button"):
|
||||||
self.endTagP(impliedTagToken("p"))
|
self.endTagP(self.impliedTagToken("p"))
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
self.tree.openElements.pop()
|
self.tree.openElements.pop()
|
||||||
token["selfClosingAcknowledged"] = True
|
token["selfClosingAcknowledged"] = True
|
||||||
@ -1173,7 +1194,7 @@ def getPhases(debug):
|
|||||||
# No really...
|
# No really...
|
||||||
self.parser.parseError("unexpected-start-tag-treated-as",
|
self.parser.parseError("unexpected-start-tag-treated-as",
|
||||||
{"originalName": "image", "newName": "img"})
|
{"originalName": "image", "newName": "img"})
|
||||||
self.processStartTag(impliedTagToken("img", "StartTag",
|
self.processStartTag(self.impliedTagToken("img", "StartTag",
|
||||||
attributes=token["data"],
|
attributes=token["data"],
|
||||||
selfClosing=token["selfClosing"]))
|
selfClosing=token["selfClosing"]))
|
||||||
|
|
||||||
@ -1184,10 +1205,10 @@ def getPhases(debug):
|
|||||||
form_attrs = {}
|
form_attrs = {}
|
||||||
if "action" in token["data"]:
|
if "action" in token["data"]:
|
||||||
form_attrs["action"] = token["data"]["action"]
|
form_attrs["action"] = token["data"]["action"]
|
||||||
self.processStartTag(impliedTagToken("form", "StartTag",
|
self.processStartTag(self.impliedTagToken("form", "StartTag",
|
||||||
attributes=form_attrs))
|
attributes=form_attrs))
|
||||||
self.processStartTag(impliedTagToken("hr", "StartTag"))
|
self.processStartTag(self.impliedTagToken("hr", "StartTag"))
|
||||||
self.processStartTag(impliedTagToken("label", "StartTag"))
|
self.processStartTag(self.impliedTagToken("label", "StartTag"))
|
||||||
# XXX Localization ...
|
# XXX Localization ...
|
||||||
if "prompt" in token["data"]:
|
if "prompt" in token["data"]:
|
||||||
prompt = token["data"]["prompt"]
|
prompt = token["data"]["prompt"]
|
||||||
@ -1201,13 +1222,13 @@ def getPhases(debug):
|
|||||||
if "prompt" in attributes:
|
if "prompt" in attributes:
|
||||||
del attributes["prompt"]
|
del attributes["prompt"]
|
||||||
attributes["name"] = "isindex"
|
attributes["name"] = "isindex"
|
||||||
self.processStartTag(impliedTagToken("input", "StartTag",
|
self.processStartTag(self.impliedTagToken("input", "StartTag",
|
||||||
attributes=attributes,
|
attributes=attributes,
|
||||||
selfClosing=
|
selfClosing=
|
||||||
token["selfClosing"]))
|
token["selfClosing"]))
|
||||||
self.processEndTag(impliedTagToken("label"))
|
self.processEndTag(self.impliedTagToken("label"))
|
||||||
self.processStartTag(impliedTagToken("hr", "StartTag"))
|
self.processStartTag(self.impliedTagToken("hr", "StartTag"))
|
||||||
self.processEndTag(impliedTagToken("form"))
|
self.processEndTag(self.impliedTagToken("form"))
|
||||||
|
|
||||||
def startTagTextarea(self, token):
|
def startTagTextarea(self, token):
|
||||||
self.tree.insertElement(token)
|
self.tree.insertElement(token)
|
||||||
@ -1225,7 +1246,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagOpt(self, token):
|
def startTagOpt(self, token):
|
||||||
if self.tree.openElements[-1].name == "option":
|
if self.tree.openElements[-1].name == "option":
|
||||||
self.parser.phase.processEndTag(impliedTagToken("option"))
|
self.parser.phase.processEndTag(self.impliedTagToken("option"))
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.parser.tree.insertElement(token)
|
self.parser.tree.insertElement(token)
|
||||||
|
|
||||||
@ -1289,9 +1310,9 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def endTagP(self, token):
|
def endTagP(self, token):
|
||||||
if not self.tree.elementInScope("p", variant="button"):
|
if not self.tree.elementInScope("p", variant="button"):
|
||||||
self.startTagCloseP(impliedTagToken("p", "StartTag"))
|
self.startTagCloseP(self.impliedTagToken("p", "StartTag"))
|
||||||
self.parser.parseError("unexpected-end-tag", {"name": "p"})
|
self.parser.parseError("unexpected-end-tag", {"name": "p"})
|
||||||
self.endTagP(impliedTagToken("p", "EndTag"))
|
self.endTagP(self.impliedTagToken("p", "EndTag"))
|
||||||
else:
|
else:
|
||||||
self.tree.generateImpliedEndTags("p")
|
self.tree.generateImpliedEndTags("p")
|
||||||
if self.tree.openElements[-1].name != "p":
|
if self.tree.openElements[-1].name != "p":
|
||||||
@ -1321,7 +1342,7 @@ def getPhases(debug):
|
|||||||
def endTagHtml(self, token):
|
def endTagHtml(self, token):
|
||||||
# We repeat the test for the body end tag token being ignored here
|
# We repeat the test for the body end tag token being ignored here
|
||||||
if self.tree.elementInScope("body"):
|
if self.tree.elementInScope("body"):
|
||||||
self.endTagBody(impliedTagToken("body"))
|
self.endTagBody(self.impliedTagToken("body"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def endTagBlock(self, token):
|
def endTagBlock(self, token):
|
||||||
@ -1562,7 +1583,7 @@ def getPhases(debug):
|
|||||||
self.parser.parseError("unexpected-end-tag-treated-as",
|
self.parser.parseError("unexpected-end-tag-treated-as",
|
||||||
{"originalName": "br", "newName": "br element"})
|
{"originalName": "br", "newName": "br element"})
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.tree.insertElement(impliedTagToken("br", "StartTag"))
|
self.tree.insertElement(self.impliedTagToken("br", "StartTag"))
|
||||||
self.tree.openElements.pop()
|
self.tree.openElements.pop()
|
||||||
|
|
||||||
def endTagOther(self, token):
|
def endTagOther(self, token):
|
||||||
@ -1580,6 +1601,7 @@ def getPhases(debug):
|
|||||||
break
|
break
|
||||||
|
|
||||||
class TextPhase(Phase):
|
class TextPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.startTagHandler = utils.MethodDispatcher([])
|
self.startTagHandler = utils.MethodDispatcher([])
|
||||||
@ -1614,6 +1636,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class InTablePhase(Phase):
|
class InTablePhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
@ -1685,7 +1708,7 @@ def getPhases(debug):
|
|||||||
self.parser.phase = self.parser.phases["inColumnGroup"]
|
self.parser.phase = self.parser.phases["inColumnGroup"]
|
||||||
|
|
||||||
def startTagCol(self, token):
|
def startTagCol(self, token):
|
||||||
self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
|
self.startTagColgroup(self.impliedTagToken("colgroup", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def startTagRowGroup(self, token):
|
def startTagRowGroup(self, token):
|
||||||
@ -1694,13 +1717,13 @@ def getPhases(debug):
|
|||||||
self.parser.phase = self.parser.phases["inTableBody"]
|
self.parser.phase = self.parser.phases["inTableBody"]
|
||||||
|
|
||||||
def startTagImplyTbody(self, token):
|
def startTagImplyTbody(self, token):
|
||||||
self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
|
self.startTagRowGroup(self.impliedTagToken("tbody", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def startTagTable(self, token):
|
def startTagTable(self, token):
|
||||||
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
self.parser.parseError("unexpected-start-tag-implies-end-tag",
|
||||||
{"startName": "table", "endName": "table"})
|
{"startName": "table", "endName": "table"})
|
||||||
self.parser.phase.processEndTag(impliedTagToken("table"))
|
self.parser.phase.processEndTag(self.impliedTagToken("table"))
|
||||||
if not self.parser.innerHTML:
|
if not self.parser.innerHTML:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@ -1758,6 +1781,7 @@ def getPhases(debug):
|
|||||||
self.tree.insertFromTable = False
|
self.tree.insertFromTable = False
|
||||||
|
|
||||||
class InTableTextPhase(Phase):
|
class InTableTextPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.originalPhase = None
|
self.originalPhase = None
|
||||||
@ -1804,6 +1828,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class InCaptionPhase(Phase):
|
class InCaptionPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -1835,7 +1860,7 @@ def getPhases(debug):
|
|||||||
self.parser.parseError()
|
self.parser.parseError()
|
||||||
# XXX Have to duplicate logic here to find out if the tag is ignored
|
# XXX Have to duplicate logic here to find out if the tag is ignored
|
||||||
ignoreEndTag = self.ignoreEndTagCaption()
|
ignoreEndTag = self.ignoreEndTagCaption()
|
||||||
self.parser.phase.processEndTag(impliedTagToken("caption"))
|
self.parser.phase.processEndTag(self.impliedTagToken("caption"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@ -1863,7 +1888,7 @@ def getPhases(debug):
|
|||||||
def endTagTable(self, token):
|
def endTagTable(self, token):
|
||||||
self.parser.parseError()
|
self.parser.parseError()
|
||||||
ignoreEndTag = self.ignoreEndTagCaption()
|
ignoreEndTag = self.ignoreEndTagCaption()
|
||||||
self.parser.phase.processEndTag(impliedTagToken("caption"))
|
self.parser.phase.processEndTag(self.impliedTagToken("caption"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@ -1900,13 +1925,13 @@ def getPhases(debug):
|
|||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||||
self.endTagColgroup(impliedTagToken("colgroup"))
|
self.endTagColgroup(self.impliedTagToken("colgroup"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def processCharacters(self, token):
|
def processCharacters(self, token):
|
||||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||||
self.endTagColgroup(impliedTagToken("colgroup"))
|
self.endTagColgroup(self.impliedTagToken("colgroup"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@ -1916,7 +1941,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagOther(self, token):
|
def startTagOther(self, token):
|
||||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||||
self.endTagColgroup(impliedTagToken("colgroup"))
|
self.endTagColgroup(self.impliedTagToken("colgroup"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@ -1934,12 +1959,13 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def endTagOther(self, token):
|
def endTagOther(self, token):
|
||||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||||
self.endTagColgroup(impliedTagToken("colgroup"))
|
self.endTagColgroup(self.impliedTagToken("colgroup"))
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
|
|
||||||
class InTableBodyPhase(Phase):
|
class InTableBodyPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
@ -1987,7 +2013,7 @@ def getPhases(debug):
|
|||||||
def startTagTableCell(self, token):
|
def startTagTableCell(self, token):
|
||||||
self.parser.parseError("unexpected-cell-in-table-body",
|
self.parser.parseError("unexpected-cell-in-table-body",
|
||||||
{"name": token["name"]})
|
{"name": token["name"]})
|
||||||
self.startTagTr(impliedTagToken("tr", "StartTag"))
|
self.startTagTr(self.impliedTagToken("tr", "StartTag"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def startTagTableOther(self, token):
|
def startTagTableOther(self, token):
|
||||||
@ -1997,7 +2023,7 @@ def getPhases(debug):
|
|||||||
self.tree.elementInScope("tfoot", variant="table")):
|
self.tree.elementInScope("tfoot", variant="table")):
|
||||||
self.clearStackToTableBodyContext()
|
self.clearStackToTableBodyContext()
|
||||||
self.endTagTableRowGroup(
|
self.endTagTableRowGroup(
|
||||||
impliedTagToken(self.tree.openElements[-1].name))
|
self.impliedTagToken(self.tree.openElements[-1].name))
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
# innerHTML case
|
# innerHTML case
|
||||||
@ -2022,7 +2048,7 @@ def getPhases(debug):
|
|||||||
self.tree.elementInScope("tfoot", variant="table")):
|
self.tree.elementInScope("tfoot", variant="table")):
|
||||||
self.clearStackToTableBodyContext()
|
self.clearStackToTableBodyContext()
|
||||||
self.endTagTableRowGroup(
|
self.endTagTableRowGroup(
|
||||||
impliedTagToken(self.tree.openElements[-1].name))
|
self.impliedTagToken(self.tree.openElements[-1].name))
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
# innerHTML case
|
# innerHTML case
|
||||||
@ -2038,6 +2064,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class InRowPhase(Phase):
|
class InRowPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
@ -2085,7 +2112,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagTableOther(self, token):
|
def startTagTableOther(self, token):
|
||||||
ignoreEndTag = self.ignoreEndTagTr()
|
ignoreEndTag = self.ignoreEndTagTr()
|
||||||
self.endTagTr(impliedTagToken("tr"))
|
self.endTagTr(self.impliedTagToken("tr"))
|
||||||
# XXX how are we sure it's always ignored in the innerHTML case?
|
# XXX how are we sure it's always ignored in the innerHTML case?
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
return token
|
return token
|
||||||
@ -2105,7 +2132,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def endTagTable(self, token):
|
def endTagTable(self, token):
|
||||||
ignoreEndTag = self.ignoreEndTagTr()
|
ignoreEndTag = self.ignoreEndTagTr()
|
||||||
self.endTagTr(impliedTagToken("tr"))
|
self.endTagTr(self.impliedTagToken("tr"))
|
||||||
# Reprocess the current tag if the tr end tag was not ignored
|
# Reprocess the current tag if the tr end tag was not ignored
|
||||||
# XXX how are we sure it's always ignored in the innerHTML case?
|
# XXX how are we sure it's always ignored in the innerHTML case?
|
||||||
if not ignoreEndTag:
|
if not ignoreEndTag:
|
||||||
@ -2113,7 +2140,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def endTagTableRowGroup(self, token):
|
def endTagTableRowGroup(self, token):
|
||||||
if self.tree.elementInScope(token["name"], variant="table"):
|
if self.tree.elementInScope(token["name"], variant="table"):
|
||||||
self.endTagTr(impliedTagToken("tr"))
|
self.endTagTr(self.impliedTagToken("tr"))
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
self.parser.parseError()
|
self.parser.parseError()
|
||||||
@ -2127,6 +2154,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class InCellPhase(Phase):
|
class InCellPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
@ -2146,9 +2174,9 @@ def getPhases(debug):
|
|||||||
# helper
|
# helper
|
||||||
def closeCell(self):
|
def closeCell(self):
|
||||||
if self.tree.elementInScope("td", variant="table"):
|
if self.tree.elementInScope("td", variant="table"):
|
||||||
self.endTagTableCell(impliedTagToken("td"))
|
self.endTagTableCell(self.impliedTagToken("td"))
|
||||||
elif self.tree.elementInScope("th", variant="table"):
|
elif self.tree.elementInScope("th", variant="table"):
|
||||||
self.endTagTableCell(impliedTagToken("th"))
|
self.endTagTableCell(self.impliedTagToken("th"))
|
||||||
|
|
||||||
# the rest
|
# the rest
|
||||||
def processEOF(self):
|
def processEOF(self):
|
||||||
@ -2202,6 +2230,7 @@ def getPhases(debug):
|
|||||||
return self.parser.phases["inBody"].processEndTag(token)
|
return self.parser.phases["inBody"].processEndTag(token)
|
||||||
|
|
||||||
class InSelectPhase(Phase):
|
class InSelectPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2249,12 +2278,12 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagSelect(self, token):
|
def startTagSelect(self, token):
|
||||||
self.parser.parseError("unexpected-select-in-select")
|
self.parser.parseError("unexpected-select-in-select")
|
||||||
self.endTagSelect(impliedTagToken("select"))
|
self.endTagSelect(self.impliedTagToken("select"))
|
||||||
|
|
||||||
def startTagInput(self, token):
|
def startTagInput(self, token):
|
||||||
self.parser.parseError("unexpected-input-in-select")
|
self.parser.parseError("unexpected-input-in-select")
|
||||||
if self.tree.elementInScope("select", variant="select"):
|
if self.tree.elementInScope("select", variant="select"):
|
||||||
self.endTagSelect(impliedTagToken("select"))
|
self.endTagSelect(self.impliedTagToken("select"))
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
assert self.parser.innerHTML
|
assert self.parser.innerHTML
|
||||||
@ -2302,6 +2331,7 @@ def getPhases(debug):
|
|||||||
{"name": token["name"]})
|
{"name": token["name"]})
|
||||||
|
|
||||||
class InSelectInTablePhase(Phase):
|
class InSelectInTablePhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2325,7 +2355,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
def startTagTable(self, token):
|
def startTagTable(self, token):
|
||||||
self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
|
self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
|
||||||
self.endTagOther(impliedTagToken("select"))
|
self.endTagOther(self.impliedTagToken("select"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def startTagOther(self, token):
|
def startTagOther(self, token):
|
||||||
@ -2334,7 +2364,7 @@ def getPhases(debug):
|
|||||||
def endTagTable(self, token):
|
def endTagTable(self, token):
|
||||||
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
|
self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
|
||||||
if self.tree.elementInScope(token["name"], variant="table"):
|
if self.tree.elementInScope(token["name"], variant="table"):
|
||||||
self.endTagOther(impliedTagToken("select"))
|
self.endTagOther(self.impliedTagToken("select"))
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def endTagOther(self, token):
|
def endTagOther(self, token):
|
||||||
@ -2456,6 +2486,7 @@ def getPhases(debug):
|
|||||||
return new_token
|
return new_token
|
||||||
|
|
||||||
class AfterBodyPhase(Phase):
|
class AfterBodyPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2504,6 +2535,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class InFramesetPhase(Phase):
|
class InFramesetPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2561,6 +2593,7 @@ def getPhases(debug):
|
|||||||
|
|
||||||
class AfterFramesetPhase(Phase):
|
class AfterFramesetPhase(Phase):
|
||||||
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
# http://www.whatwg.org/specs/web-apps/current-work/#after3
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2597,6 +2630,7 @@ def getPhases(debug):
|
|||||||
{"name": token["name"]})
|
{"name": token["name"]})
|
||||||
|
|
||||||
class AfterAfterBodyPhase(Phase):
|
class AfterAfterBodyPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2635,6 +2669,7 @@ def getPhases(debug):
|
|||||||
return token
|
return token
|
||||||
|
|
||||||
class AfterAfterFramesetPhase(Phase):
|
class AfterAfterFramesetPhase(Phase):
|
||||||
|
|
||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
@ -2698,14 +2733,7 @@ def getPhases(debug):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def impliedTagToken(name, type="EndTag", attributes=None,
|
|
||||||
selfClosing=False):
|
|
||||||
if attributes is None:
|
|
||||||
attributes = {}
|
|
||||||
return {"type": tokenTypes[type], "name": name, "data": attributes,
|
|
||||||
"selfClosing": selfClosing}
|
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
|
class ParseError(Exception):
|
||||||
|
|
||||||
"""Error in parsed document"""
|
"""Error in parsed document"""
|
||||||
pass
|
pass
|
||||||
|
@ -35,10 +35,11 @@ class HTMLTokenizer(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||||
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
|
lowercaseElementName=True, lowercaseAttrName=True, parser=None, track_positions=False):
|
||||||
|
|
||||||
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
|
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
|
self.track_positions = track_positions
|
||||||
|
|
||||||
# Perform case conversions?
|
# Perform case conversions?
|
||||||
self.lowercaseElementName = lowercaseElementName
|
self.lowercaseElementName = lowercaseElementName
|
||||||
@ -378,6 +379,8 @@ class HTMLTokenizer(object):
|
|||||||
"name": data, "data": [],
|
"name": data, "data": [],
|
||||||
"selfClosing": False,
|
"selfClosing": False,
|
||||||
"selfClosingAcknowledged": False}
|
"selfClosingAcknowledged": False}
|
||||||
|
if self.track_positions:
|
||||||
|
self.currentToken['position'] = (self.stream.position(), False)
|
||||||
self.state = self.tagNameState
|
self.state = self.tagNameState
|
||||||
elif data == ">":
|
elif data == ">":
|
||||||
# XXX In theory it could be something besides a tag name. But
|
# XXX In theory it could be something besides a tag name. But
|
||||||
|
Loading…
x
Reference in New Issue
Block a user