From 66934ec8fbc8ffee37b21202794362530db831ac Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 15 Dec 2011 13:50:23 +0530 Subject: [PATCH] Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness. Fixes #901466 ( tag bug) --- src/calibre/__init__.py | 8 +- src/calibre/ebooks/oeb/base.py | 291 +- src/calibre/ebooks/oeb/entitydefs.py | 256 -- src/calibre/ebooks/oeb/parse_utils.py | 347 +++ src/calibre/ebooks/oeb/reader.py | 8 +- src/html5lib/__init__.py | 17 + src/html5lib/constants.py | 1169 +++++++++ src/html5lib/filters/__init__.py | 0 src/html5lib/filters/_base.py | 10 + src/html5lib/filters/formfiller.py | 127 + src/html5lib/filters/inject_meta_charset.py | 63 + src/html5lib/filters/lint.py | 88 + src/html5lib/filters/optionaltags.py | 202 ++ src/html5lib/filters/sanitizer.py | 8 + src/html5lib/filters/whitespace.py | 41 + src/html5lib/html5parser.py | 2625 +++++++++++++++++++ src/html5lib/ihatexml.py | 177 ++ src/html5lib/inputstream.py | 794 ++++++ src/html5lib/sanitizer.py | 230 ++ src/html5lib/serializer/__init__.py | 17 + src/html5lib/serializer/htmlserializer.py | 234 ++ src/html5lib/serializer/xhtmlserializer.py | 9 + src/html5lib/tokenizer.py | 1586 +++++++++++ src/html5lib/tokenizer_old.py | 1212 +++++++++ src/html5lib/treebuilders/__init__.py | 92 + src/html5lib/treebuilders/_base.py | 345 +++ src/html5lib/treebuilders/dom.py | 286 ++ src/html5lib/treebuilders/etree.py | 329 +++ src/html5lib/treebuilders/etree_lxml.py | 335 +++ src/html5lib/treebuilders/simpletree.py | 248 ++ src/html5lib/treebuilders/soup.py | 228 ++ src/html5lib/treewalkers/__init__.py | 52 + src/html5lib/treewalkers/_base.py | 165 ++ src/html5lib/treewalkers/dom.py | 37 + src/html5lib/treewalkers/etree.py | 130 + src/html5lib/treewalkers/genshistream.py | 70 + src/html5lib/treewalkers/lxmletree.py | 175 ++ src/html5lib/treewalkers/pulldom.py | 56 + src/html5lib/treewalkers/simpletree.py | 72 + src/html5lib/treewalkers/soup.py | 59 + 
src/html5lib/utils.py | 156 ++ 41 files changed, 11810 insertions(+), 544 deletions(-) delete mode 100644 src/calibre/ebooks/oeb/entitydefs.py create mode 100644 src/calibre/ebooks/oeb/parse_utils.py create mode 100644 src/html5lib/__init__.py create mode 100644 src/html5lib/constants.py create mode 100644 src/html5lib/filters/__init__.py create mode 100644 src/html5lib/filters/_base.py create mode 100644 src/html5lib/filters/formfiller.py create mode 100644 src/html5lib/filters/inject_meta_charset.py create mode 100644 src/html5lib/filters/lint.py create mode 100644 src/html5lib/filters/optionaltags.py create mode 100644 src/html5lib/filters/sanitizer.py create mode 100644 src/html5lib/filters/whitespace.py create mode 100644 src/html5lib/html5parser.py create mode 100644 src/html5lib/ihatexml.py create mode 100644 src/html5lib/inputstream.py create mode 100644 src/html5lib/sanitizer.py create mode 100644 src/html5lib/serializer/__init__.py create mode 100644 src/html5lib/serializer/htmlserializer.py create mode 100644 src/html5lib/serializer/xhtmlserializer.py create mode 100644 src/html5lib/tokenizer.py create mode 100644 src/html5lib/tokenizer_old.py create mode 100644 src/html5lib/treebuilders/__init__.py create mode 100644 src/html5lib/treebuilders/_base.py create mode 100644 src/html5lib/treebuilders/dom.py create mode 100644 src/html5lib/treebuilders/etree.py create mode 100644 src/html5lib/treebuilders/etree_lxml.py create mode 100644 src/html5lib/treebuilders/simpletree.py create mode 100644 src/html5lib/treebuilders/soup.py create mode 100644 src/html5lib/treewalkers/__init__.py create mode 100644 src/html5lib/treewalkers/_base.py create mode 100644 src/html5lib/treewalkers/dom.py create mode 100644 src/html5lib/treewalkers/etree.py create mode 100644 src/html5lib/treewalkers/genshistream.py create mode 100644 src/html5lib/treewalkers/lxmletree.py create mode 100644 src/html5lib/treewalkers/pulldom.py create mode 100644 
src/html5lib/treewalkers/simpletree.py create mode 100644 src/html5lib/treewalkers/soup.py create mode 100644 src/html5lib/utils.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 7790e91bea..989ad3de84 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -558,11 +558,11 @@ xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions = { '>' : '>', '&' : '&'}) -def replace_entities(raw): - return _ent_pat.sub(entity_to_unicode, raw) +def replace_entities(raw, encoding='cp1252'): + return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw) -def xml_replace_entities(raw): - return _ent_pat.sub(xml_entity_to_unicode, raw) +def xml_replace_entities(raw, encoding='cp1252'): + return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw) def prepare_string_for_xml(raw, attribute=False): raw = _ent_pat.sub(entity_to_unicode, raw) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 28e5169bc9..d90e3bdfa4 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -16,15 +16,13 @@ from urllib import unquote as urlunquote from lxml import etree, html from calibre.constants import filesystem_encoding, __version__ from calibre.translations.dynamic import translate -from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations -from calibre.ebooks.oeb.entitydefs import ENTITYDEFS +from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.conversion.preprocess import CSSPreProcessor -from calibre import isbytestring, as_unicode, get_types_map - -RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True) +from calibre import (isbytestring, as_unicode, get_types_map) +from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER, + namespace, XHTML, parse_html, NotHTML) XML_NS = 'http://www.w3.org/XML/1998/namespace' -XHTML_NS = 'http://www.w3.org/1999/xhtml' OEB_DOC_NS = 
'http://openebook.org/namespaces/oeb-document/1.0/' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF2_NS = 'http://www.idpf.org/2007/opf' @@ -55,9 +53,6 @@ OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, def XML(name): return '{%s}%s' % (XML_NS, name) -def XHTML(name): - return '{%s}%s' % (XHTML_NS, name) - def OPF(name): return '{%s}%s' % (OPF2_NS, name) @@ -279,22 +274,11 @@ PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+') XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') CSSURL_RE = re.compile(r'''url[(](?P["']?)(?P[^)]+)(?P=q)[)]''') - def element(parent, *args, **kwargs): if parent is not None: return etree.SubElement(parent, *args, **kwargs) return etree.Element(*args, **kwargs) -def namespace(name): - if '}' in name: - return name.split('}', 1)[0][1:] - return '' - -def barename(name): - if '}' in name: - return name.split('}', 1)[1] - return name - def prefixname(name, nsrmap): if not isqname(name): return name @@ -373,25 +357,6 @@ def urlnormalize(href): parts = (urlquote(part) for part in parts) return urlunparse(parts) -def merge_multiple_html_heads_and_bodies(root, log=None): - heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body') - if not (len(heads) > 1 or len(bodies) > 1): return root - for child in root: root.remove(child) - head = root.makeelement(XHTML('head')) - body = root.makeelement(XHTML('body')) - for h in heads: - for x in h: - head.append(x) - for b in bodies: - for x in b: - body.append(x) - map(root.append, (head, body)) - if log is not None: - log.warn('Merging multiple and sections') - return root - - - class DummyHandler(logging.Handler): @@ -418,10 +383,6 @@ class OEBError(Exception): """Generic OEB-processing error.""" pass -class NotHTML(OEBError): - '''Raised when a file that should be HTML (as per manifest) is not''' - pass - class NullContainer(object): """An empty container. 
@@ -801,7 +762,6 @@ class Manifest(object): """ NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') - META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') def __init__(self, oeb, id, href, media_type, fallback=None, loader=str, data=None): @@ -830,244 +790,17 @@ class Manifest(object): return None return etree.fromstring(data, parser=RECOVER_PARSER) - def clean_word_doc(self, data): - prefixes = [] - for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data): - prefixes.append(match.group(1)) - if prefixes: - self.oeb.log.warn('Found microsoft markup, cleaning...') - # Remove empty tags as they are not rendered by browsers - # but can become renderable HTML tags like
<p/>
if the - # document is parsed by an HTML parser - pat = re.compile( - r'<(%s):([a-zA-Z0-9]+)[^>/]*?>'%('|'.join(prefixes)), - re.DOTALL) - data = pat.sub('', data) - pat = re.compile( - r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes))) - data = pat.sub('', data) - return data - def _parse_xhtml(self, data): orig_data = data - self.oeb.log.debug('Parsing', self.href, '...') - # Convert to Unicode and normalize line endings - data = self.oeb.decode(data) - data = strip_encoding_declarations(data) - data = self.oeb.html_preprocessor(data) - # There could be null bytes in data if it had � entities in it - data = data.replace('\0', '') - - # Remove DOCTYPE declaration as it messes up parsing - # In particular, it causes tostring to insert xmlns - # declarations, which messes up the coercing logic - idx = data.find(' -1: - pre = data[:idx] - data = data[idx:] - if ']+)', pre): - val = match.group(2) - if val.startswith('"') and val.endswith('"'): - val = val[1:-1] - user_entities[match.group(1)] = val - if user_entities: - pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) - data = pat.sub(lambda m:user_entities[m.group(1)], data) - - # Setting huge_tree=True causes crashes in windows with large files - parser = etree.XMLParser(no_network=True) - # Try with more & more drastic measures to parse - def first_pass(data): - try: - data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError as err: - self.oeb.log.debug('Initial parse failed, using more' - ' forgiving parsers') - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) - try: - data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError as err: - self.oeb.logger.warn('Parsing file %r as HTML' % self.href) - if err.args and err.args[0].startswith('Excessive depth'): - from calibre.utils.soupparser import fromstring - data = fromstring(data) - else: - data = html.fromstring(data) - data.attrib.pop('xmlns', None) - for elem in 
data.iter(tag=etree.Comment): - if elem.text: - elem.text = elem.text.strip('-') - data = etree.tostring(data, encoding=unicode) - try: - data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError: - data = etree.fromstring(data, parser=RECOVER_PARSER) - return data + fname = urlunquote(self.href) + self.oeb.log.debug('Parsing', fname, '...') try: - data = self.clean_word_doc(data) - except: - pass - data = first_pass(data) - - if data.tag == 'HTML': - # Lower case all tag and attribute names - data.tag = data.tag.lower() - for x in data.iterdescendants(): - try: - x.tag = x.tag.lower() - for key, val in list(x.attrib.iteritems()): - del x.attrib[key] - key = key.lower() - x.attrib[key] = val - except: - pass - - # Handle weird (non-HTML/fragment) files - if barename(data.tag) != 'html': - if barename(data.tag) == 'ncx': - return self._parse_xml(orig_data) - self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href) - nroot = etree.fromstring('') - has_body = False - for child in list(data): - if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body': - has_body = True - break - parent = nroot - if not has_body: - self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href) - nroot = etree.fromstring('') - parent = nroot[0] - for child in list(data.iter()): - oparent = child.getparent() - if oparent is not None: - oparent.remove(child) - parent.append(child) - data = nroot - - - # Force into the XHTML namespace - if not namespace(data.tag): - self.oeb.log.warn('Forcing', self.href, 'into XHTML namespace') - data.attrib['xmlns'] = XHTML_NS - data = etree.tostring(data, encoding=unicode) - - try: - data = etree.fromstring(data, parser=parser) - except: - data = data.replace(':=', '=').replace(':>', '>') - data = data.replace('', '') - try: - data = etree.fromstring(data, parser=parser) - except etree.XMLSyntaxError: - self.oeb.logger.warn('Stripping comments from %s'% - self.href) - data = re.compile(r'', 
re.DOTALL).sub('', - data) - data = data.replace( - "", - '') - data = data.replace("", '') - try: - data = etree.fromstring(data, - parser=RECOVER_PARSER) - except etree.XMLSyntaxError: - self.oeb.logger.warn('Stripping meta tags from %s'% - self.href) - data = re.sub(r']+?>', '', data) - data = etree.fromstring(data, parser=RECOVER_PARSER) - elif namespace(data.tag) != XHTML_NS: - # OEB_DOC_NS, but possibly others - ns = namespace(data.tag) - attrib = dict(data.attrib) - nroot = etree.Element(XHTML('html'), - nsmap={None: XHTML_NS}, attrib=attrib) - for elem in data.iterdescendants(): - if isinstance(elem.tag, basestring) and \ - namespace(elem.tag) == ns: - elem.tag = XHTML(barename(elem.tag)) - for elem in data: - nroot.append(elem) - data = nroot - - data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger) - # Ensure has a - head = xpath(data, '/h:html/h:head') - head = head[0] if head else None - if head is None: - self.oeb.logger.warn( - 'File %r missing element' % self.href) - head = etree.Element(XHTML('head')) - data.insert(0, head) - title = etree.SubElement(head, XHTML('title')) - title.text = self.oeb.translate(__('Unknown')) - elif not xpath(data, '/h:html/h:head/h:title'): - self.oeb.logger.warn( - 'File %r missing element' % self.href) - title = etree.SubElement(head, XHTML('title')) - title.text = self.oeb.translate(__('Unknown')) - # Remove any encoding-specifying <meta/> elements - for meta in self.META_XP(data): - meta.getparent().remove(meta) - etree.SubElement(head, XHTML('meta'), - attrib={'http-equiv': 'Content-Type', - 'content': '%s; charset=utf-8' % XHTML_NS}) - # Ensure has a <body/> - if not xpath(data, '/h:html/h:body'): - body = xpath(data, '//h:body') - if body: - body = body[0] - body.getparent().remove(body) - data.append(body) - else: - self.oeb.logger.warn( - 'File %r missing <body/> element' % self.href) - etree.SubElement(data, XHTML('body')) - - # Remove microsoft office markup - r = [x for x in 
data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag] - for x in r: - x.tag = XHTML('span') - - # Remove lang redefinition inserted by the amazing Microsoft Word! - body = xpath(data, '/h:html/h:body')[0] - for key in list(body.attrib.keys()): - if key == 'lang' or key.endswith('}lang'): - body.attrib.pop(key) - - def remove_elem(a): - p = a.getparent() - idx = p.index(a) -1 - p.remove(a) - if a.tail: - if idx <= 0: - if p.text is None: - p.text = '' - p.text += a.tail - else: - if p[idx].tail is None: - p[idx].tail = '' - p[idx].tail += a.tail - - # Remove hyperlinks with no content as they cause rendering - # artifacts in browser based renderers - # Also remove empty <b>, <u> and <i> tags - for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'): - if a.get('id', None) is None and a.get('name', None) is None \ - and len(a) == 0 and not a.text: - remove_elem(a) - - # Convert <br>s with content into paragraphs as ADE can't handle - # them - for br in xpath(data, '//h:br'): - if len(br) > 0 or br.text: - br.tag = XHTML('div') - + data = parse_html(data, log=self.oeb.log, + decoder=self.oeb.decode, + preprocessor=self.oeb.html_preprocessor, + filename=fname, non_html_file_tags={'ncx'}) + except NotHTML: + return self._parse_xml(orig_data) return data def _parse_txt(self, data): diff --git a/src/calibre/ebooks/oeb/entitydefs.py b/src/calibre/ebooks/oeb/entitydefs.py deleted file mode 100644 index 69fc16116c..0000000000 --- a/src/calibre/ebooks/oeb/entitydefs.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Replacement for htmlentitydefs which uses purely numeric entities. -""" - -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. 
Vandegrift <llasram@gmail.com>' - -ENTITYDEFS = \ - {'AElig': 'Æ', - 'Aacute': 'Á', - 'Acirc': 'Â', - 'Agrave': 'À', - 'Alpha': 'Α', - 'Aring': 'Å', - 'Atilde': 'Ã', - 'Auml': 'Ä', - 'Beta': 'Β', - 'Ccedil': 'Ç', - 'Chi': 'Χ', - 'Dagger': '‡', - 'Delta': 'Δ', - 'ETH': 'Ð', - 'Eacute': 'É', - 'Ecirc': 'Ê', - 'Egrave': 'È', - 'Epsilon': 'Ε', - 'Eta': 'Η', - 'Euml': 'Ë', - 'Gamma': 'Γ', - 'Iacute': 'Í', - 'Icirc': 'Î', - 'Igrave': 'Ì', - 'Iota': 'Ι', - 'Iuml': 'Ï', - 'Kappa': 'Κ', - 'Lambda': 'Λ', - 'Mu': 'Μ', - 'Ntilde': 'Ñ', - 'Nu': 'Ν', - 'OElig': 'Œ', - 'Oacute': 'Ó', - 'Ocirc': 'Ô', - 'Ograve': 'Ò', - 'Omega': 'Ω', - 'Omicron': 'Ο', - 'Oslash': 'Ø', - 'Otilde': 'Õ', - 'Ouml': 'Ö', - 'Phi': 'Φ', - 'Pi': 'Π', - 'Prime': '″', - 'Psi': 'Ψ', - 'Rho': 'Ρ', - 'Scaron': 'Š', - 'Sigma': 'Σ', - 'THORN': 'Þ', - 'Tau': 'Τ', - 'Theta': 'Θ', - 'Uacute': 'Ú', - 'Ucirc': 'Û', - 'Ugrave': 'Ù', - 'Upsilon': 'Υ', - 'Uuml': 'Ü', - 'Xi': 'Ξ', - 'Yacute': 'Ý', - 'Yuml': 'Ÿ', - 'Zeta': 'Ζ', - 'aacute': 'á', - 'acirc': 'â', - 'acute': '´', - 'aelig': 'æ', - 'agrave': 'à', - 'alefsym': 'ℵ', - 'alpha': 'α', - 'and': '∧', - 'ang': '∠', - 'aring': 'å', - 'asymp': '≈', - 'atilde': 'ã', - 'auml': 'ä', - 'bdquo': '„', - 'beta': 'β', - 'brvbar': '¦', - 'bull': '•', - 'cap': '∩', - 'ccedil': 'ç', - 'cedil': '¸', - 'cent': '¢', - 'chi': 'χ', - 'circ': 'ˆ', - 'clubs': '♣', - 'cong': '≅', - 'copy': '©', - 'crarr': '↵', - 'cup': '∪', - 'curren': '¤', - 'dArr': '⇓', - 'dagger': '†', - 'darr': '↓', - 'deg': '°', - 'delta': 'δ', - 'diams': '♦', - 'divide': '÷', - 'eacute': 'é', - 'ecirc': 'ê', - 'egrave': 'è', - 'empty': '∅', - 'emsp': ' ', - 'ensp': ' ', - 'epsilon': 'ε', - 'equiv': '≡', - 'eta': 'η', - 'eth': 'ð', - 'euml': 'ë', - 'euro': '€', - 'exist': '∃', - 'fnof': 'ƒ', - 'forall': '∀', - 'frac12': '½', - 'frac14': '¼', - 'frac34': '¾', - 'frasl': '⁄', - 'gamma': 'γ', - 'ge': '≥', - 'hArr': '⇔', - 'harr': '↔', - 'hearts': '♥', - 'hellip': '…', - 'iacute': 'í', - 'icirc': 'î', - 'iexcl': '¡', - 
'igrave': 'ì', - 'image': 'ℑ', - 'infin': '∞', - 'int': '∫', - 'iota': 'ι', - 'iquest': '¿', - 'isin': '∈', - 'iuml': 'ï', - 'kappa': 'κ', - 'lArr': '⇐', - 'lambda': 'λ', - 'lang': '〈', - 'laquo': '«', - 'larr': '←', - 'lceil': '⌈', - 'ldquo': '“', - 'le': '≤', - 'lfloor': '⌊', - 'lowast': '∗', - 'loz': '◊', - 'lrm': '‎', - 'lsaquo': '‹', - 'lsquo': '‘', - 'macr': '¯', - 'mdash': '—', - 'micro': 'µ', - 'middot': '·', - 'minus': '−', - 'mu': 'μ', - 'nabla': '∇', - 'nbsp': ' ', - 'ndash': '–', - 'ne': '≠', - 'ni': '∋', - 'not': '¬', - 'notin': '∉', - 'nsub': '⊄', - 'ntilde': 'ñ', - 'nu': 'ν', - 'oacute': 'ó', - 'ocirc': 'ô', - 'oelig': 'œ', - 'ograve': 'ò', - 'oline': '‾', - 'omega': 'ω', - 'omicron': 'ο', - 'oplus': '⊕', - 'or': '∨', - 'ordf': 'ª', - 'ordm': 'º', - 'oslash': 'ø', - 'otilde': 'õ', - 'otimes': '⊗', - 'ouml': 'ö', - 'para': '¶', - 'part': '∂', - 'permil': '‰', - 'perp': '⊥', - 'phi': 'φ', - 'pi': 'π', - 'piv': 'ϖ', - 'plusmn': '±', - 'pound': '£', - 'prime': '′', - 'prod': '∏', - 'prop': '∝', - 'psi': 'ψ', - 'rArr': '⇒', - 'radic': '√', - 'rang': '〉', - 'raquo': '»', - 'rarr': '→', - 'rceil': '⌉', - 'rdquo': '”', - 'real': 'ℜ', - 'reg': '®', - 'rfloor': '⌋', - 'rho': 'ρ', - 'rlm': '‏', - 'rsaquo': '›', - 'rsquo': '’', - 'sbquo': '‚', - 'scaron': 'š', - 'sdot': '⋅', - 'sect': '§', - 'shy': '­', - 'sigma': 'σ', - 'sigmaf': 'ς', - 'sim': '∼', - 'spades': '♠', - 'sub': '⊂', - 'sube': '⊆', - 'sum': '∑', - 'sup': '⊃', - 'sup1': '¹', - 'sup2': '²', - 'sup3': '³', - 'supe': '⊇', - 'szlig': 'ß', - 'tau': 'τ', - 'there4': '∴', - 'theta': 'θ', - 'thetasym': 'ϑ', - 'thinsp': ' ', - 'thorn': 'þ', - 'tilde': '˜', - 'times': '×', - 'trade': '™', - 'uArr': '⇑', - 'uacute': 'ú', - 'uarr': '↑', - 'ucirc': 'û', - 'ugrave': 'ù', - 'uml': '¨', - 'upsih': 'ϒ', - 'upsilon': 'υ', - 'uuml': 'ü', - 'weierp': '℘', - 'xi': 'ξ', - 'yacute': 'ý', - 'yen': '¥', - 'yuml': 'ÿ', - 'zeta': 'ζ', - 'zwj': '‍', - 'zwnj': '‌'} diff --git a/src/calibre/ebooks/oeb/parse_utils.py 
b/src/calibre/ebooks/oeb/parse_utils.py new file mode 100644 index 0000000000..57dc18bc32 --- /dev/null +++ b/src/calibre/ebooks/oeb/parse_utils.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + +import re + +from lxml import etree, html + +from calibre import xml_replace_entities, force_unicode +from calibre.constants import filesystem_encoding +from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations + +RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True) +XHTML_NS = 'http://www.w3.org/1999/xhtml' + +class NotHTML(Exception): + + def __init__(self, root_tag): + Exception.__init__(self, 'Data is not HTML') + self.root_tag = root_tag + +def barename(name): + return name.rpartition('}')[-1] + +def namespace(name): + if '}' in name: + return name.split('}', 1)[0][1:] + return '' + +def XHTML(name): + return '{%s}%s' % (XHTML_NS, name) + +def xpath(elem, expr): + return elem.xpath(expr, namespaces={'h':XHTML_NS}) + +def XPath(expr): + return etree.XPath(expr, namespaces={'h':XHTML_NS}) + +META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') + +def merge_multiple_html_heads_and_bodies(root, log=None): + heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body') + if not (len(heads) > 1 or len(bodies) > 1): return root + for child in root: root.remove(child) + head = root.makeelement(XHTML('head')) + body = root.makeelement(XHTML('body')) + for h in heads: + for x in h: + head.append(x) + for b in bodies: + for x in b: + body.append(x) + map(root.append, (head, body)) + if log is not None: + log.warn('Merging multiple <head> and <body> sections') + return root + +def _html5_parse(data): + import html5lib + data = html5lib.parse(data, 
treebuilder='lxml').getroot() + html_ns = [ns for ns, val in data.nsmap.iteritems() if (val == XHTML_NS and + ns is not None)] + if html_ns: + # html5lib causes the XHTML namespace to not + # be set as the default namespace + nsmap = dict(data.nsmap) + nsmap[None] = XHTML_NS + for x in html_ns: + nsmap.pop(x) + nroot = etree.Element(data.tag, nsmap=nsmap, + attrib=dict(data.attrib)) + nroot.text = data.text + nroot.tail = data.tail + for child in data: + nroot.append(child) + data = nroot + return data + +def _html4_parse(data, prefer_soup=False): + if prefer_soup: + from calibre.utils.soupparser import fromstring + data = fromstring(data) + else: + data = html.fromstring(data) + data.attrib.pop('xmlns', None) + for elem in data.iter(tag=etree.Comment): + if elem.text: + elem.text = elem.text.strip('-') + data = etree.tostring(data, encoding=unicode) + + # Setting huge_tree=True causes crashes in windows with large files + parser = etree.XMLParser(no_network=True) + try: + data = etree.fromstring(data, parser=parser) + except etree.XMLSyntaxError: + data = etree.fromstring(data, parser=RECOVER_PARSER) + return data + +def clean_word_doc(data, log): + prefixes = [] + for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data): + prefixes.append(match.group(1)) + if prefixes: + log.warn('Found microsoft markup, cleaning...') + # Remove empty tags as they are not rendered by browsers + # but can become renderable HTML tags like <p/> if the + # document is parsed by an HTML parser + pat = re.compile( + r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>'%('|'.join(prefixes)), + re.DOTALL) + data = pat.sub('', data) + pat = re.compile( + r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>'%('|'.join(prefixes))) + data = pat.sub('', data) + return data + +def parse_html(data, log=None, decoder=None, preprocessor=None, + filename='<string>', non_html_file_tags=frozenset()): + if log is None: + from calibre.utils.logging import default_log + log = default_log + + filename = force_unicode(filename, 
enc=filesystem_encoding) + + if not isinstance(data, unicode): + if decoder is not None: + data = decoder(data) + else: + data = xml_to_unicode(data)[0] + + data = strip_encoding_declarations(data) + if preprocessor is not None: + data = preprocessor(data) + + # There could be null bytes in data if it had � entities in it + data = data.replace('\0', '') + + # Remove DOCTYPE declaration as it messes up parsing + # In particular, it causes tostring to insert xmlns + # declarations, which messes up the coercing logic + idx = data.find('<html') + if idx == -1: + idx = data.find('<HTML') + if idx > -1: + pre = data[:idx] + data = data[idx:] + if '<!DOCTYPE' in pre: # Handle user defined entities + user_entities = {} + for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre): + val = match.group(2) + if val.startswith('"') and val.endswith('"'): + val = val[1:-1] + user_entities[match.group(1)] = val + if user_entities: + pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys()))) + data = pat.sub(lambda m:user_entities[m.group(1)], data) + + data = clean_word_doc(data, log) + + # Setting huge_tree=True causes crashes in windows with large files + parser = etree.XMLParser(no_network=True) + + # Try with more & more drastic measures to parse + try: + data = etree.fromstring(data, parser=parser) + except etree.XMLSyntaxError: + log.debug('Initial parse failed, using more' + ' forgiving parsers') + data = xml_replace_entities(data) + try: + data = etree.fromstring(data, parser=parser) + except etree.XMLSyntaxError: + log.debug('Parsing %s as HTML' % filename) + try: + data = _html5_parse(data) + except: + log.exception( + 'HTML 5 parsing failed, falling back to older parsers') + data = _html4_parse(data) + + if data.tag == 'HTML': + # Lower case all tag and attribute names + data.tag = data.tag.lower() + for x in data.iterdescendants(): + try: + x.tag = x.tag.lower() + for key, val in list(x.attrib.iteritems()): + del x.attrib[key] + key = key.lower() + x.attrib[key] = 
val + except: + pass + + if barename(data.tag) != 'html': + if barename(data.tag) in non_html_file_tags: + raise NotHTML(data.tag) + log.warn('File %r does not appear to be (X)HTML'%filename) + nroot = etree.fromstring('<html></html>') + has_body = False + for child in list(data): + if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body': + has_body = True + break + parent = nroot + if not has_body: + log.warn('File %r appears to be a HTML fragment'%filename) + nroot = etree.fromstring('<html><body/></html>') + parent = nroot[0] + for child in list(data.iter()): + oparent = child.getparent() + if oparent is not None: + oparent.remove(child) + parent.append(child) + data = nroot + + # Force into the XHTML namespace + if not namespace(data.tag): + log.warn('Forcing', filename, 'into XHTML namespace') + data.attrib['xmlns'] = XHTML_NS + data = etree.tostring(data, encoding=unicode) + + try: + data = etree.fromstring(data, parser=parser) + except: + data = data.replace(':=', '=').replace(':>', '>') + data = data.replace('<http:/>', '') + try: + data = etree.fromstring(data, parser=parser) + except etree.XMLSyntaxError: + log.warn('Stripping comments from %s'% + filename) + data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', + data) + data = data.replace( + "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", + '') + data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') + try: + data = etree.fromstring(data, + parser=RECOVER_PARSER) + except etree.XMLSyntaxError: + log.warn('Stripping meta tags from %s'% filename) + data = re.sub(r'<meta\s+[^>]+?>', '', data) + data = etree.fromstring(data, parser=RECOVER_PARSER) + elif namespace(data.tag) != XHTML_NS: + # OEB_DOC_NS, but possibly others + ns = namespace(data.tag) + attrib = dict(data.attrib) + nroot = etree.Element(XHTML('html'), + nsmap={None: XHTML_NS}, attrib=attrib) + for elem in data.iterdescendants(): + if isinstance(elem.tag, basestring) and \ + namespace(elem.tag) == ns: + 
elem.tag = XHTML(barename(elem.tag)) + for elem in data: + nroot.append(elem) + data = nroot + + data = merge_multiple_html_heads_and_bodies(data, log) + # Ensure has a <head/> + head = xpath(data, '/h:html/h:head') + head = head[0] if head else None + if head is None: + log.warn('File %s missing <head/> element' % filename) + head = etree.Element(XHTML('head')) + data.insert(0, head) + title = etree.SubElement(head, XHTML('title')) + title.text = _('Unknown') + elif not xpath(data, '/h:html/h:head/h:title'): + log.warn('File %s missing <title/> element' % filename) + title = etree.SubElement(head, XHTML('title')) + title.text = _('Unknown') + # Remove any encoding-specifying <meta/> elements + for meta in META_XP(data): + meta.getparent().remove(meta) + etree.SubElement(head, XHTML('meta'), + attrib={'http-equiv': 'Content-Type', + 'content': '%s; charset=utf-8' % XHTML_NS}) + # Ensure has a <body/> + if not xpath(data, '/h:html/h:body'): + body = xpath(data, '//h:body') + if body: + body = body[0] + body.getparent().remove(body) + data.append(body) + else: + log.warn('File %s missing <body/> element' % filename) + etree.SubElement(data, XHTML('body')) + + # Remove microsoft office markup + r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag] + for x in r: + x.tag = XHTML('span') + + # Remove lang redefinition inserted by the amazing Microsoft Word! 
+ body = xpath(data, '/h:html/h:body')[0] + for key in list(body.attrib.keys()): + if key == 'lang' or key.endswith('}lang'): + body.attrib.pop(key) + + def remove_elem(a): + p = a.getparent() + idx = p.index(a) -1 + p.remove(a) + if a.tail: + if idx <= 0: + if p.text is None: + p.text = '' + p.text += a.tail + else: + if p[idx].tail is None: + p[idx].tail = '' + p[idx].tail += a.tail + + # Remove hyperlinks with no content as they cause rendering + # artifacts in browser based renderers + # Also remove empty <b>, <u> and <i> tags + for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'): + if a.get('id', None) is None and a.get('name', None) is None \ + and len(a) == 0 and not a.text: + remove_elem(a) + + # Convert <br>s with content into paragraphs as ADE can't handle + # them + for br in xpath(data, '//h:br'): + if len(br) > 0 or br.text: + br.tag = XHTML('div') + + # Remove any stray text in the <head> section and format it nicely + data.text = '\n ' + head = xpath(data, '//h:head') + if head: + head = head[0] + head.text = '\n ' + head.tail = '\n ' + for child in head: + child.tail = '\n ' + child.tail = '\n ' + + return data + + diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index f94220b95b..0337d47f92 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -19,16 +19,15 @@ from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \ - ENTITY_RE, MS_COVER_TYPE, iterlinks + MS_COVER_TYPE, iterlinks from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \ urlnormalize, BINARY_MIME, \ OEBError, OEBBook, DirContainer from calibre.ebooks.oeb.writer import OEBWriter -from calibre.ebooks.oeb.entitydefs import ENTITYDEFS from calibre.utils.localization import get_lang from calibre.ptempfile 
import TemporaryDirectory from calibre.constants import __appname__, __version__ -from calibre import guess_type +from calibre import guess_type, xml_replace_entities __all__ = ['OEBReader'] @@ -107,8 +106,7 @@ class OEBReader(object): try: opf = etree.fromstring(data) except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) + data = xml_replace_entities(data, encoding=None) try: opf = etree.fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') diff --git a/src/html5lib/__init__.py b/src/html5lib/__init__.py new file mode 100644 index 0000000000..433e0f6735 --- /dev/null +++ b/src/html5lib/__init__.py @@ -0,0 +1,17 @@ +""" +HTML parsing library based on the WHATWG "HTML5" +specification. The parser is designed to be compatible with existing +HTML found in the wild and implements well-defined error recovery that +is largely compatible with modern desktop web browsers. + +Example usage: + +import html5lib +f = open("my_document.html") +tree = html5lib.parse(f) +""" +__version__ = "0.90" +from html5parser import HTMLParser, parse, parseFragment +from treebuilders import getTreeBuilder +from treewalkers import getTreeWalker +from serializer import serialize diff --git a/src/html5lib/constants.py b/src/html5lib/constants.py new file mode 100644 index 0000000000..4157bcfcea --- /dev/null +++ b/src/html5lib/constants.py @@ -0,0 +1,1169 @@ +import string, gettext +_ = gettext.gettext + +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset + +EOF = None + +E = { + "null-character": + _(u"Null character in input stream, replaced with U+FFFD."), + "invalid-character": + _(u"Invalid codepoint in stream."), + "incorrectly-placed-solidus": + _(u"Solidus (/) incorrectly placed in tag."), + "incorrect-cr-newline-entity": + _(u"Incorrect CR newline entity, replaced with LF."), + 
"illegal-windows-1252-entity": + _(u"Entity used with illegal number (windows-1252 reference)."), + "cant-convert-numeric-entity": + _(u"Numeric entity couldn't be converted to character " + u"(codepoint U+%(charAsInt)08x)."), + "illegal-codepoint-for-numeric-entity": + _(u"Numeric entity represents an illegal codepoint: " + u"U+%(charAsInt)08x."), + "numeric-entity-without-semicolon": + _(u"Numeric entity didn't end with ';'."), + "expected-numeric-entity-but-got-eof": + _(u"Numeric entity expected. Got end of file instead."), + "expected-numeric-entity": + _(u"Numeric entity expected but none found."), + "named-entity-without-semicolon": + _(u"Named entity didn't end with ';'."), + "expected-named-entity": + _(u"Named entity expected. Got none."), + "attributes-in-end-tag": + _(u"End tag contains unexpected attributes."), + "expected-tag-name-but-got-right-bracket": + _(u"Expected tag name. Got '>' instead."), + "expected-tag-name-but-got-question-mark": + _(u"Expected tag name. Got '?' instead. (HTML doesn't " + u"support processing instructions.)"), + "expected-tag-name": + _(u"Expected tag name. Got something else instead"), + "expected-closing-tag-but-got-right-bracket": + _(u"Expected closing tag. Got '>' instead. Ignoring '</>'."), + "expected-closing-tag-but-got-eof": + _(u"Expected closing tag. Unexpected end of file."), + "expected-closing-tag-but-got-char": + _(u"Expected closing tag. Unexpected character '%(data)s' found."), + "eof-in-tag-name": + _(u"Unexpected end of file in the tag name."), + "expected-attribute-name-but-got-eof": + _(u"Unexpected end of file. Expected attribute name instead."), + "eof-in-attribute-name": + _(u"Unexpected end of file in attribute name."), + "invalid-character-in-attribute-name": + _(u"Invalid chracter in attribute name"), + "duplicate-attribute": + _(u"Dropped duplicate attribute on tag."), + "expected-end-of-tag-name-but-got-eof": + _(u"Unexpected end of file. 
Expected = or end of tag."), + "expected-attribute-value-but-got-eof": + _(u"Unexpected end of file. Expected attribute value."), + "expected-attribute-value-but-got-right-bracket": + _(u"Expected attribute value. Got '>' instead."), + "eof-in-attribute-value-double-quote": + _(u"Unexpected end of file in attribute value (\")."), + "eof-in-attribute-value-single-quote": + _(u"Unexpected end of file in attribute value (')."), + "eof-in-attribute-value-no-quotes": + _(u"Unexpected end of file in attribute value."), + "unexpected-EOF-after-solidus-in-tag": + _(u"Unexpected end of file in tag. Expected >"), + "unexpected-character-after-soldius-in-tag": + _(u"Unexpected character after / in tag. Expected >"), + "expected-dashes-or-doctype": + _(u"Expected '--' or 'DOCTYPE'. Not found."), + "incorrect-comment": + _(u"Incorrect comment."), + "eof-in-comment": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-dash": + _(u"Unexpected end of file in comment (-)"), + "unexpected-dash-after-double-dash-in-comment": + _(u"Unexpected '-' after '--' found in comment."), + "eof-in-comment-double-dash": + _(u"Unexpected end of file in comment (--)."), + "unexpected-char-in-comment": + _(u"Unexpected character in comment found."), + "need-space-after-doctype": + _(u"No space after literal string 'DOCTYPE'."), + "expected-doctype-name-but-got-right-bracket": + _(u"Unexpected > character. Expected DOCTYPE name."), + "expected-doctype-name-but-got-eof": + _(u"Unexpected end of file. Expected DOCTYPE name."), + "eof-in-doctype-name": + _(u"Unexpected end of file in DOCTYPE name."), + "eof-in-doctype": + _(u"Unexpected end of file in DOCTYPE."), + "expected-space-or-right-bracket-in-doctype": + _(u"Expected space or '>'. 
Got '%(data)s'"), + "unexpected-end-of-doctype": + _(u"Unexpected end of DOCTYPE."), + "unexpected-char-in-doctype": + _(u"Unexpected character in DOCTYPE."), + "eof-in-innerhtml": + _(u"XXX innerHTML EOF"), + "unexpected-doctype": + _(u"Unexpected DOCTYPE. Ignored."), + "non-html-root": + _(u"html needs to be the first start tag."), + "expected-doctype-but-got-eof": + _(u"Unexpected End of file. Expected DOCTYPE."), + "unknown-doctype": + _(u"Erroneous DOCTYPE."), + "expected-doctype-but-got-chars": + _(u"Unexpected non-space characters. Expected DOCTYPE."), + "expected-doctype-but-got-start-tag": + _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."), + "expected-doctype-but-got-end-tag": + _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."), + "end-tag-after-implied-root": + _(u"Unexpected end tag (%(name)s) after the (implied) root element."), + "expected-named-closing-tag-but-got-eof": + _(u"Unexpected end of file. Expected end tag (%(name)s)."), + "two-heads-are-not-better-than-one": + _(u"Unexpected start tag head in existing head. Ignored."), + "unexpected-end-tag": + _(u"Unexpected end tag (%(name)s). Ignored."), + "unexpected-start-tag-out-of-my-head": + _(u"Unexpected start tag (%(name)s) that can be in head. Moved."), + "unexpected-start-tag": + _(u"Unexpected start tag (%(name)s)."), + "missing-end-tag": + _(u"Missing end tag (%(name)s)."), + "missing-end-tags": + _(u"Missing end tags (%(name)s)."), + "unexpected-start-tag-implies-end-tag": + _(u"Unexpected start tag (%(startName)s) " + u"implies end tag (%(endName)s)."), + "unexpected-start-tag-treated-as": + _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "deprecated-tag": + _(u"Unexpected start tag %(name)s. Don't use it!"), + "unexpected-start-tag-ignored": + _(u"Unexpected start tag %(name)s. Ignored."), + "expected-one-end-tag-but-got-another": + _(u"Unexpected end tag (%(gotName)s). 
" + u"Missing end tag (%(expectedName)s)."), + "end-tag-too-early": + _(u"End tag (%(name)s) seen too early. Expected other end tag."), + "end-tag-too-early-named": + _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "end-tag-too-early-ignored": + _(u"End tag (%(name)s) seen too early. Ignored."), + "adoption-agency-1.1": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 1 of the adoption agency algorithm."), + "adoption-agency-1.2": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 2 of the adoption agency algorithm."), + "adoption-agency-1.3": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 3 of the adoption agency algorithm."), + "unexpected-end-tag-treated-as": + _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + "no-end-tag": + _(u"This element (%(name)s) has no end tag."), + "unexpected-implied-end-tag-in-table": + _(u"Unexpected implied end tag (%(name)s) in the table phase."), + "unexpected-implied-end-tag-in-table-body": + _(u"Unexpected implied end tag (%(name)s) in the table body phase."), + "unexpected-char-implies-table-voodoo": + _(u"Unexpected non-space characters in " + u"table context caused voodoo mode."), + "unexpected-hidden-input-in-table": + _(u"Unexpected input with type hidden in table context."), + "unexpected-form-in-table": + _(u"Unexpected form in table context."), + "unexpected-start-tag-implies-table-voodoo": + _(u"Unexpected start tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-end-tag-implies-table-voodoo": + _(u"Unexpected end tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-cell-in-table-body": + _(u"Unexpected table cell start tag (%(name)s) " + u"in the table body phase."), + "unexpected-cell-end-tag": + _(u"Got table cell end tag (%(name)s) " + u"while required end tags are missing."), + "unexpected-end-tag-in-table-body": + _(u"Unexpected end tag (%(name)s) in the table body phase. 
Ignored."), + "unexpected-implied-end-tag-in-table-row": + _(u"Unexpected implied end tag (%(name)s) in the table row phase."), + "unexpected-end-tag-in-table-row": + _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "unexpected-select-in-select": + _(u"Unexpected select start tag in the select phase " + u"treated as select end tag."), + "unexpected-input-in-select": + _(u"Unexpected input start tag in the select phase."), + "unexpected-start-tag-in-select": + _(u"Unexpected start tag token (%(name)s in the select phase. " + u"Ignored."), + "unexpected-end-tag-in-select": + _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."), + "unexpected-table-element-start-tag-in-select-in-table": + _(u"Unexpected table element start tag (%(name)s) in the select in table phase."), + "unexpected-table-element-end-tag-in-select-in-table": + _(u"Unexpected table element end tag (%(name)s) in the select in table phase."), + "unexpected-char-after-body": + _(u"Unexpected non-space characters in the after body phase."), + "unexpected-start-tag-after-body": + _(u"Unexpected start tag token (%(name)s)" + u" in the after body phase."), + "unexpected-end-tag-after-body": + _(u"Unexpected end tag token (%(name)s)" + u" in the after body phase."), + "unexpected-char-in-frameset": + _(u"Unepxected characters in the frameset phase. Characters ignored."), + "unexpected-start-tag-in-frameset": + _(u"Unexpected start tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-frameset-in-frameset-innerhtml": + _(u"Unexpected end tag token (frameset) " + u"in the frameset phase (innerHTML)."), + "unexpected-end-tag-in-frameset": + _(u"Unexpected end tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-char-after-frameset": + _(u"Unexpected non-space characters in the " + u"after frameset phase. Ignored."), + "unexpected-start-tag-after-frameset": + _(u"Unexpected start tag (%(name)s)" + u" in the after frameset phase. 
Ignored."), + "unexpected-end-tag-after-frameset": + _(u"Unexpected end tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-body-innerhtml": + _(u"Unexpected end tag after body(innerHtml)"), + "expected-eof-but-got-char": + _(u"Unexpected non-space characters. Expected end of file."), + "expected-eof-but-got-start-tag": + _(u"Unexpected start tag (%(name)s)" + u". Expected end of file."), + "expected-eof-but-got-end-tag": + _(u"Unexpected end tag (%(name)s)" + u". Expected end of file."), + "eof-in-table": + _(u"Unexpected end of file. Expected table content."), + "eof-in-select": + _(u"Unexpected end of file. Expected select content."), + "eof-in-frameset": + _(u"Unexpected end of file. Expected frameset content."), + "eof-in-script-in-script": + _(u"Unexpected end of file. Expected script content."), + "non-void-element-with-trailing-solidus": + _(u"Trailing solidus not allowed on element %(name)s"), + "unexpected-html-element-in-foreign-content": + _(u"Element %(name)s not allowed in a non-html context"), + "unexpected-end-tag-before-html": + _(u"Unexpected end tag (%(name)s) before html."), + "XXX-undefined-error": + (u"Undefined error (this sucks and should be fixed)"), +} + +namespaces = { + "html":"http://www.w3.org/1999/xhtml", + "mathml":"http://www.w3.org/1998/Math/MathML", + "svg":"http://www.w3.org/2000/svg", + "xlink":"http://www.w3.org/1999/xlink", + "xml":"http://www.w3.org/XML/1998/namespace", + "xmlns":"http://www.w3.org/2000/xmlns/" +} + +scopingElements = frozenset(( + (namespaces["html"], "applet"), + (namespaces["html"], "button"), + (namespaces["html"], "caption"), + (namespaces["html"], "html"), + (namespaces["html"], "marquee"), + (namespaces["html"], "object"), + (namespaces["html"], "table"), + (namespaces["html"], "td"), + (namespaces["html"], "th"), + (namespaces["svg"], "foreignObject") +)) + +formattingElements = frozenset(( + (namespaces["html"], "a"), + (namespaces["html"], "b"), + 
(namespaces["html"], "big"), + (namespaces["html"], "code"), + (namespaces["html"], "em"), + (namespaces["html"], "font"), + (namespaces["html"], "i"), + (namespaces["html"], "nobr"), + (namespaces["html"], "s"), + (namespaces["html"], "small"), + (namespaces["html"], "strike"), + (namespaces["html"], "strong"), + (namespaces["html"], "tt"), + (namespaces["html"], "u") +)) + +specialElements = frozenset(( + (namespaces["html"], "address"), + (namespaces["html"], "area"), + (namespaces["html"], "article"), + (namespaces["html"], "aside"), + (namespaces["html"], "base"), + (namespaces["html"], "basefont"), + (namespaces["html"], "bgsound"), + (namespaces["html"], "blockquote"), + (namespaces["html"], "body"), + (namespaces["html"], "br"), + (namespaces["html"], "center"), + (namespaces["html"], "col"), + (namespaces["html"], "colgroup"), + (namespaces["html"], "command"), + (namespaces["html"], "datagrid"), + (namespaces["html"], "dd"), + (namespaces["html"], "details"), + (namespaces["html"], "dialog"), + (namespaces["html"], "dir"), + (namespaces["html"], "div"), + (namespaces["html"], "dl"), + (namespaces["html"], "dt"), + (namespaces["html"], "embed"), + (namespaces["html"], "event-source"), + (namespaces["html"], "fieldset"), + (namespaces["html"], "figure"), + (namespaces["html"], "footer"), + (namespaces["html"], "form"), + (namespaces["html"], "frame"), + (namespaces["html"], "frameset"), + (namespaces["html"], "h1"), + (namespaces["html"], "h2"), + (namespaces["html"], "h3"), + (namespaces["html"], "h4"), + (namespaces["html"], "h5"), + (namespaces["html"], "h6"), + (namespaces["html"], "head"), + (namespaces["html"], "header"), + (namespaces["html"], "hr"), + (namespaces["html"], "iframe"), + # Note that image is commented out in the spec as "this isn't an + # element that can end up on the stack, so it doesn't matter," + (namespaces["html"], "image"), + (namespaces["html"], "img"), + (namespaces["html"], "input"), + (namespaces["html"], "isindex"), + 
(namespaces["html"], "li"), + (namespaces["html"], "link"), + (namespaces["html"], "listing"), + (namespaces["html"], "menu"), + (namespaces["html"], "meta"), + (namespaces["html"], "nav"), + (namespaces["html"], "noembed"), + (namespaces["html"], "noframes"), + (namespaces["html"], "noscript"), + (namespaces["html"], "ol"), + (namespaces["html"], "optgroup"), + (namespaces["html"], "option"), + (namespaces["html"], "p"), + (namespaces["html"], "param"), + (namespaces["html"], "plaintext"), + (namespaces["html"], "pre"), + (namespaces["html"], "script"), + (namespaces["html"], "section"), + (namespaces["html"], "select"), + (namespaces["html"], "spacer"), + (namespaces["html"], "style"), + (namespaces["html"], "tbody"), + (namespaces["html"], "textarea"), + (namespaces["html"], "tfoot"), + (namespaces["html"], "thead"), + (namespaces["html"], "title"), + (namespaces["html"], "tr"), + (namespaces["html"], "ul"), + (namespaces["html"], "wbr") +)) + +spaceCharacters = frozenset(( + u"\t", + u"\n", + u"\u000C", + u" ", + u"\r" +)) + +tableInsertModeElements = frozenset(( + "table", + "tbody", + "tfoot", + "thead", + "tr" +)) + +asciiLowercase = frozenset(string.ascii_lowercase) +asciiUppercase = frozenset(string.ascii_uppercase) +asciiLetters = frozenset(string.ascii_letters) +digits = frozenset(string.digits) +hexDigits = frozenset(string.hexdigits) + +asciiUpper2Lower = dict([(ord(c),ord(c.lower())) + for c in string.ascii_uppercase]) + +# Heading elements need to be ordered +headingElements = ( + "h1", + "h2", + "h3", + "h4", + "h5", + "h6" +) + +voidElements = frozenset(( + "base", + "command", + "event-source", + "link", + "meta", + "hr", + "br", + "img", + "embed", + "param", + "area", + "col", + "input", + "source" +)) + +cdataElements = frozenset(('title', 'textarea')) + +rcdataElements = frozenset(( + 'style', + 'script', + 'xmp', + 'iframe', + 'noembed', + 'noframes', + 'noscript' +)) + +booleanAttributes = { + "": frozenset(("irrelevant",)), + "style": 
frozenset(("scoped",)), + "img": frozenset(("ismap",)), + "audio": frozenset(("autoplay","controls")), + "video": frozenset(("autoplay","controls")), + "script": frozenset(("defer", "async")), + "details": frozenset(("open",)), + "datagrid": frozenset(("multiple", "disabled")), + "command": frozenset(("hidden", "disabled", "checked", "default")), + "menu": frozenset(("autosubmit",)), + "fieldset": frozenset(("disabled", "readonly")), + "option": frozenset(("disabled", "readonly", "selected")), + "optgroup": frozenset(("disabled", "readonly")), + "button": frozenset(("disabled", "autofocus")), + "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), + "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), + "output": frozenset(("disabled", "readonly")), +} + +# entitiesWindows1252 has to be _ordered_ and needs to have an index. It +# therefore can't be a frozenset. +entitiesWindows1252 = ( + 8364, # 0x80 0x20AC EURO SIGN + 65533, # 0x81 UNDEFINED + 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK + 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK + 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK + 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS + 8224, # 0x86 0x2020 DAGGER + 8225, # 0x87 0x2021 DOUBLE DAGGER + 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT + 8240, # 0x89 0x2030 PER MILLE SIGN + 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON + 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK + 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE + 65533, # 0x8D UNDEFINED + 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON + 65533, # 0x8F UNDEFINED + 65533, # 0x90 UNDEFINED + 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK + 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK + 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK + 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK + 8226, # 0x95 0x2022 BULLET + 8211, # 0x96 0x2013 EN DASH + 8212, # 0x97 0x2014 EM DASH + 732, # 0x98 0x02DC SMALL TILDE + 8482, # 0x99 
0x2122 TRADE MARK SIGN + 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON + 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE + 65533, # 0x9D UNDEFINED + 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON + 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS +) + +entities = { + "AElig;": u"\u00C6", + "AElig": u"\u00C6", + "AMP;": u"\u0026", + "AMP": u"\u0026", + "Aacute;": u"\u00C1", + "Aacute": u"\u00C1", + "Acirc;": u"\u00C2", + "Acirc": u"\u00C2", + "Agrave;": u"\u00C0", + "Agrave": u"\u00C0", + "Alpha;": u"\u0391", + "Aring;": u"\u00C5", + "Aring": u"\u00C5", + "Atilde;": u"\u00C3", + "Atilde": u"\u00C3", + "Auml;": u"\u00C4", + "Auml": u"\u00C4", + "Beta;": u"\u0392", + "COPY;": u"\u00A9", + "COPY": u"\u00A9", + "Ccedil;": u"\u00C7", + "Ccedil": u"\u00C7", + "Chi;": u"\u03A7", + "Dagger;": u"\u2021", + "Delta;": u"\u0394", + "ETH;": u"\u00D0", + "ETH": u"\u00D0", + "Eacute;": u"\u00C9", + "Eacute": u"\u00C9", + "Ecirc;": u"\u00CA", + "Ecirc": u"\u00CA", + "Egrave;": u"\u00C8", + "Egrave": u"\u00C8", + "Epsilon;": u"\u0395", + "Eta;": u"\u0397", + "Euml;": u"\u00CB", + "Euml": u"\u00CB", + "GT;": u"\u003E", + "GT": u"\u003E", + "Gamma;": u"\u0393", + "Iacute;": u"\u00CD", + "Iacute": u"\u00CD", + "Icirc;": u"\u00CE", + "Icirc": u"\u00CE", + "Igrave;": u"\u00CC", + "Igrave": u"\u00CC", + "Iota;": u"\u0399", + "Iuml;": u"\u00CF", + "Iuml": u"\u00CF", + "Kappa;": u"\u039A", + "LT;": u"\u003C", + "LT": u"\u003C", + "Lambda;": u"\u039B", + "Mu;": u"\u039C", + "Ntilde;": u"\u00D1", + "Ntilde": u"\u00D1", + "Nu;": u"\u039D", + "OElig;": u"\u0152", + "Oacute;": u"\u00D3", + "Oacute": u"\u00D3", + "Ocirc;": u"\u00D4", + "Ocirc": u"\u00D4", + "Ograve;": u"\u00D2", + "Ograve": u"\u00D2", + "Omega;": u"\u03A9", + "Omicron;": u"\u039F", + "Oslash;": u"\u00D8", + "Oslash": u"\u00D8", + "Otilde;": u"\u00D5", + "Otilde": u"\u00D5", + "Ouml;": u"\u00D6", + "Ouml": u"\u00D6", + "Phi;": u"\u03A6", + "Pi;": 
u"\u03A0", + "Prime;": u"\u2033", + "Psi;": u"\u03A8", + "QUOT;": u"\u0022", + "QUOT": u"\u0022", + "REG;": u"\u00AE", + "REG": u"\u00AE", + "Rho;": u"\u03A1", + "Scaron;": u"\u0160", + "Sigma;": u"\u03A3", + "THORN;": u"\u00DE", + "THORN": u"\u00DE", + "TRADE;": u"\u2122", + "Tau;": u"\u03A4", + "Theta;": u"\u0398", + "Uacute;": u"\u00DA", + "Uacute": u"\u00DA", + "Ucirc;": u"\u00DB", + "Ucirc": u"\u00DB", + "Ugrave;": u"\u00D9", + "Ugrave": u"\u00D9", + "Upsilon;": u"\u03A5", + "Uuml;": u"\u00DC", + "Uuml": u"\u00DC", + "Xi;": u"\u039E", + "Yacute;": u"\u00DD", + "Yacute": u"\u00DD", + "Yuml;": u"\u0178", + "Zeta;": u"\u0396", + "aacute;": u"\u00E1", + "aacute": u"\u00E1", + "acirc;": u"\u00E2", + "acirc": u"\u00E2", + "acute;": u"\u00B4", + "acute": u"\u00B4", + "aelig;": u"\u00E6", + "aelig": u"\u00E6", + "agrave;": u"\u00E0", + "agrave": u"\u00E0", + "alefsym;": u"\u2135", + "alpha;": u"\u03B1", + "amp;": u"\u0026", + "amp": u"\u0026", + "and;": u"\u2227", + "ang;": u"\u2220", + "apos;": u"\u0027", + "aring;": u"\u00E5", + "aring": u"\u00E5", + "asymp;": u"\u2248", + "atilde;": u"\u00E3", + "atilde": u"\u00E3", + "auml;": u"\u00E4", + "auml": u"\u00E4", + "bdquo;": u"\u201E", + "beta;": u"\u03B2", + "brvbar;": u"\u00A6", + "brvbar": u"\u00A6", + "bull;": u"\u2022", + "cap;": u"\u2229", + "ccedil;": u"\u00E7", + "ccedil": u"\u00E7", + "cedil;": u"\u00B8", + "cedil": u"\u00B8", + "cent;": u"\u00A2", + "cent": u"\u00A2", + "chi;": u"\u03C7", + "circ;": u"\u02C6", + "clubs;": u"\u2663", + "cong;": u"\u2245", + "copy;": u"\u00A9", + "copy": u"\u00A9", + "crarr;": u"\u21B5", + "cup;": u"\u222A", + "curren;": u"\u00A4", + "curren": u"\u00A4", + "dArr;": u"\u21D3", + "dagger;": u"\u2020", + "darr;": u"\u2193", + "deg;": u"\u00B0", + "deg": u"\u00B0", + "delta;": u"\u03B4", + "diams;": u"\u2666", + "divide;": u"\u00F7", + "divide": u"\u00F7", + "eacute;": u"\u00E9", + "eacute": u"\u00E9", + "ecirc;": u"\u00EA", + "ecirc": u"\u00EA", + "egrave;": u"\u00E8", + "egrave": 
u"\u00E8", + "empty;": u"\u2205", + "emsp;": u"\u2003", + "ensp;": u"\u2002", + "epsilon;": u"\u03B5", + "equiv;": u"\u2261", + "eta;": u"\u03B7", + "eth;": u"\u00F0", + "eth": u"\u00F0", + "euml;": u"\u00EB", + "euml": u"\u00EB", + "euro;": u"\u20AC", + "exist;": u"\u2203", + "fnof;": u"\u0192", + "forall;": u"\u2200", + "frac12;": u"\u00BD", + "frac12": u"\u00BD", + "frac14;": u"\u00BC", + "frac14": u"\u00BC", + "frac34;": u"\u00BE", + "frac34": u"\u00BE", + "frasl;": u"\u2044", + "gamma;": u"\u03B3", + "ge;": u"\u2265", + "gt;": u"\u003E", + "gt": u"\u003E", + "hArr;": u"\u21D4", + "harr;": u"\u2194", + "hearts;": u"\u2665", + "hellip;": u"\u2026", + "iacute;": u"\u00ED", + "iacute": u"\u00ED", + "icirc;": u"\u00EE", + "icirc": u"\u00EE", + "iexcl;": u"\u00A1", + "iexcl": u"\u00A1", + "igrave;": u"\u00EC", + "igrave": u"\u00EC", + "image;": u"\u2111", + "infin;": u"\u221E", + "int;": u"\u222B", + "iota;": u"\u03B9", + "iquest;": u"\u00BF", + "iquest": u"\u00BF", + "isin;": u"\u2208", + "iuml;": u"\u00EF", + "iuml": u"\u00EF", + "kappa;": u"\u03BA", + "lArr;": u"\u21D0", + "lambda;": u"\u03BB", + "lang;": u"\u27E8", + "laquo;": u"\u00AB", + "laquo": u"\u00AB", + "larr;": u"\u2190", + "lceil;": u"\u2308", + "ldquo;": u"\u201C", + "le;": u"\u2264", + "lfloor;": u"\u230A", + "lowast;": u"\u2217", + "loz;": u"\u25CA", + "lrm;": u"\u200E", + "lsaquo;": u"\u2039", + "lsquo;": u"\u2018", + "lt;": u"\u003C", + "lt": u"\u003C", + "macr;": u"\u00AF", + "macr": u"\u00AF", + "mdash;": u"\u2014", + "micro;": u"\u00B5", + "micro": u"\u00B5", + "middot;": u"\u00B7", + "middot": u"\u00B7", + "minus;": u"\u2212", + "mu;": u"\u03BC", + "nabla;": u"\u2207", + "nbsp;": u"\u00A0", + "nbsp": u"\u00A0", + "ndash;": u"\u2013", + "ne;": u"\u2260", + "ni;": u"\u220B", + "not;": u"\u00AC", + "not": u"\u00AC", + "notin;": u"\u2209", + "nsub;": u"\u2284", + "ntilde;": u"\u00F1", + "ntilde": u"\u00F1", + "nu;": u"\u03BD", + "oacute;": u"\u00F3", + "oacute": u"\u00F3", + "ocirc;": u"\u00F4", + 
"ocirc": u"\u00F4", + "oelig;": u"\u0153", + "ograve;": u"\u00F2", + "ograve": u"\u00F2", + "oline;": u"\u203E", + "omega;": u"\u03C9", + "omicron;": u"\u03BF", + "oplus;": u"\u2295", + "or;": u"\u2228", + "ordf;": u"\u00AA", + "ordf": u"\u00AA", + "ordm;": u"\u00BA", + "ordm": u"\u00BA", + "oslash;": u"\u00F8", + "oslash": u"\u00F8", + "otilde;": u"\u00F5", + "otilde": u"\u00F5", + "otimes;": u"\u2297", + "ouml;": u"\u00F6", + "ouml": u"\u00F6", + "para;": u"\u00B6", + "para": u"\u00B6", + "part;": u"\u2202", + "permil;": u"\u2030", + "perp;": u"\u22A5", + "phi;": u"\u03C6", + "pi;": u"\u03C0", + "piv;": u"\u03D6", + "plusmn;": u"\u00B1", + "plusmn": u"\u00B1", + "pound;": u"\u00A3", + "pound": u"\u00A3", + "prime;": u"\u2032", + "prod;": u"\u220F", + "prop;": u"\u221D", + "psi;": u"\u03C8", + "quot;": u"\u0022", + "quot": u"\u0022", + "rArr;": u"\u21D2", + "radic;": u"\u221A", + "rang;": u"\u27E9", + "raquo;": u"\u00BB", + "raquo": u"\u00BB", + "rarr;": u"\u2192", + "rceil;": u"\u2309", + "rdquo;": u"\u201D", + "real;": u"\u211C", + "reg;": u"\u00AE", + "reg": u"\u00AE", + "rfloor;": u"\u230B", + "rho;": u"\u03C1", + "rlm;": u"\u200F", + "rsaquo;": u"\u203A", + "rsquo;": u"\u2019", + "sbquo;": u"\u201A", + "scaron;": u"\u0161", + "sdot;": u"\u22C5", + "sect;": u"\u00A7", + "sect": u"\u00A7", + "shy;": u"\u00AD", + "shy": u"\u00AD", + "sigma;": u"\u03C3", + "sigmaf;": u"\u03C2", + "sim;": u"\u223C", + "spades;": u"\u2660", + "sub;": u"\u2282", + "sube;": u"\u2286", + "sum;": u"\u2211", + "sup1;": u"\u00B9", + "sup1": u"\u00B9", + "sup2;": u"\u00B2", + "sup2": u"\u00B2", + "sup3;": u"\u00B3", + "sup3": u"\u00B3", + "sup;": u"\u2283", + "supe;": u"\u2287", + "szlig;": u"\u00DF", + "szlig": u"\u00DF", + "tau;": u"\u03C4", + "there4;": u"\u2234", + "theta;": u"\u03B8", + "thetasym;": u"\u03D1", + "thinsp;": u"\u2009", + "thorn;": u"\u00FE", + "thorn": u"\u00FE", + "tilde;": u"\u02DC", + "times;": u"\u00D7", + "times": u"\u00D7", + "trade;": u"\u2122", + "uArr;": 
u"\u21D1", + "uacute;": u"\u00FA", + "uacute": u"\u00FA", + "uarr;": u"\u2191", + "ucirc;": u"\u00FB", + "ucirc": u"\u00FB", + "ugrave;": u"\u00F9", + "ugrave": u"\u00F9", + "uml;": u"\u00A8", + "uml": u"\u00A8", + "upsih;": u"\u03D2", + "upsilon;": u"\u03C5", + "uuml;": u"\u00FC", + "uuml": u"\u00FC", + "weierp;": u"\u2118", + "xi;": u"\u03BE", + "yacute;": u"\u00FD", + "yacute": u"\u00FD", + "yen;": u"\u00A5", + "yen": u"\u00A5", + "yuml;": u"\u00FF", + "yuml": u"\u00FF", + "zeta;": u"\u03B6", + "zwj;": u"\u200D", + "zwnj;": u"\u200C" +} + +replacementCharacters = { + 0x0:u"\uFFFD", + 0x0d:u"\u000A", + 0x80:u"\u20AC", + 0x81:u"\u0081", + 0x81:u"\u0081", + 0x82:u"\u201A", + 0x83:u"\u0192", + 0x84:u"\u201E", + 0x85:u"\u2026", + 0x86:u"\u2020", + 0x87:u"\u2021", + 0x88:u"\u02C6", + 0x89:u"\u2030", + 0x8A:u"\u0160", + 0x8B:u"\u2039", + 0x8C:u"\u0152", + 0x8D:u"\u008D", + 0x8E:u"\u017D", + 0x8F:u"\u008F", + 0x90:u"\u0090", + 0x91:u"\u2018", + 0x92:u"\u2019", + 0x93:u"\u201C", + 0x94:u"\u201D", + 0x95:u"\u2022", + 0x96:u"\u2013", + 0x97:u"\u2014", + 0x98:u"\u02DC", + 0x99:u"\u2122", + 0x9A:u"\u0161", + 0x9B:u"\u203A", + 0x9C:u"\u0153", + 0x9D:u"\u009D", + 0x9E:u"\u017E", + 0x9F:u"\u0178", +} + +encodings = { + '437': 'cp437', + '850': 'cp850', + '852': 'cp852', + '855': 'cp855', + '857': 'cp857', + '860': 'cp860', + '861': 'cp861', + '862': 'cp862', + '863': 'cp863', + '865': 'cp865', + '866': 'cp866', + '869': 'cp869', + 'ansix341968': 'ascii', + 'ansix341986': 'ascii', + 'arabic': 'iso8859-6', + 'ascii': 'ascii', + 'asmo708': 'iso8859-6', + 'big5': 'big5', + 'big5hkscs': 'big5hkscs', + 'chinese': 'gbk', + 'cp037': 'cp037', + 'cp1026': 'cp1026', + 'cp154': 'ptcp154', + 'cp367': 'ascii', + 'cp424': 'cp424', + 'cp437': 'cp437', + 'cp500': 'cp500', + 'cp775': 'cp775', + 'cp819': 'windows-1252', + 'cp850': 'cp850', + 'cp852': 'cp852', + 'cp855': 'cp855', + 'cp857': 'cp857', + 'cp860': 'cp860', + 'cp861': 'cp861', + 'cp862': 'cp862', + 'cp863': 'cp863', + 'cp864': 'cp864', 
+ 'cp865': 'cp865', + 'cp866': 'cp866', + 'cp869': 'cp869', + 'cp936': 'gbk', + 'cpgr': 'cp869', + 'cpis': 'cp861', + 'csascii': 'ascii', + 'csbig5': 'big5', + 'cseuckr': 'cp949', + 'cseucpkdfmtjapanese': 'euc_jp', + 'csgb2312': 'gbk', + 'cshproman8': 'hp-roman8', + 'csibm037': 'cp037', + 'csibm1026': 'cp1026', + 'csibm424': 'cp424', + 'csibm500': 'cp500', + 'csibm855': 'cp855', + 'csibm857': 'cp857', + 'csibm860': 'cp860', + 'csibm861': 'cp861', + 'csibm863': 'cp863', + 'csibm864': 'cp864', + 'csibm865': 'cp865', + 'csibm866': 'cp866', + 'csibm869': 'cp869', + 'csiso2022jp': 'iso2022_jp', + 'csiso2022jp2': 'iso2022_jp_2', + 'csiso2022kr': 'iso2022_kr', + 'csiso58gb231280': 'gbk', + 'csisolatin1': 'windows-1252', + 'csisolatin2': 'iso8859-2', + 'csisolatin3': 'iso8859-3', + 'csisolatin4': 'iso8859-4', + 'csisolatin5': 'windows-1254', + 'csisolatin6': 'iso8859-10', + 'csisolatinarabic': 'iso8859-6', + 'csisolatincyrillic': 'iso8859-5', + 'csisolatingreek': 'iso8859-7', + 'csisolatinhebrew': 'iso8859-8', + 'cskoi8r': 'koi8-r', + 'csksc56011987': 'cp949', + 'cspc775baltic': 'cp775', + 'cspc850multilingual': 'cp850', + 'cspc862latinhebrew': 'cp862', + 'cspc8codepage437': 'cp437', + 'cspcp852': 'cp852', + 'csptcp154': 'ptcp154', + 'csshiftjis': 'shift_jis', + 'csunicode11utf7': 'utf-7', + 'cyrillic': 'iso8859-5', + 'cyrillicasian': 'ptcp154', + 'ebcdiccpbe': 'cp500', + 'ebcdiccpca': 'cp037', + 'ebcdiccpch': 'cp500', + 'ebcdiccphe': 'cp424', + 'ebcdiccpnl': 'cp037', + 'ebcdiccpus': 'cp037', + 'ebcdiccpwt': 'cp037', + 'ecma114': 'iso8859-6', + 'ecma118': 'iso8859-7', + 'elot928': 'iso8859-7', + 'eucjp': 'euc_jp', + 'euckr': 'cp949', + 'extendedunixcodepackedformatforjapanese': 'euc_jp', + 'gb18030': 'gb18030', + 'gb2312': 'gbk', + 'gb231280': 'gbk', + 'gbk': 'gbk', + 'greek': 'iso8859-7', + 'greek8': 'iso8859-7', + 'hebrew': 'iso8859-8', + 'hproman8': 'hp-roman8', + 'hzgb2312': 'hz', + 'ibm037': 'cp037', + 'ibm1026': 'cp1026', + 'ibm367': 'ascii', + 'ibm424': 'cp424', + 
'ibm437': 'cp437', + 'ibm500': 'cp500', + 'ibm775': 'cp775', + 'ibm819': 'windows-1252', + 'ibm850': 'cp850', + 'ibm852': 'cp852', + 'ibm855': 'cp855', + 'ibm857': 'cp857', + 'ibm860': 'cp860', + 'ibm861': 'cp861', + 'ibm862': 'cp862', + 'ibm863': 'cp863', + 'ibm864': 'cp864', + 'ibm865': 'cp865', + 'ibm866': 'cp866', + 'ibm869': 'cp869', + 'iso2022jp': 'iso2022_jp', + 'iso2022jp2': 'iso2022_jp_2', + 'iso2022kr': 'iso2022_kr', + 'iso646irv1991': 'ascii', + 'iso646us': 'ascii', + 'iso88591': 'windows-1252', + 'iso885910': 'iso8859-10', + 'iso8859101992': 'iso8859-10', + 'iso885911987': 'windows-1252', + 'iso885913': 'iso8859-13', + 'iso885914': 'iso8859-14', + 'iso8859141998': 'iso8859-14', + 'iso885915': 'iso8859-15', + 'iso885916': 'iso8859-16', + 'iso8859162001': 'iso8859-16', + 'iso88592': 'iso8859-2', + 'iso885921987': 'iso8859-2', + 'iso88593': 'iso8859-3', + 'iso885931988': 'iso8859-3', + 'iso88594': 'iso8859-4', + 'iso885941988': 'iso8859-4', + 'iso88595': 'iso8859-5', + 'iso885951988': 'iso8859-5', + 'iso88596': 'iso8859-6', + 'iso885961987': 'iso8859-6', + 'iso88597': 'iso8859-7', + 'iso885971987': 'iso8859-7', + 'iso88598': 'iso8859-8', + 'iso885981988': 'iso8859-8', + 'iso88599': 'windows-1254', + 'iso885991989': 'windows-1254', + 'isoceltic': 'iso8859-14', + 'isoir100': 'windows-1252', + 'isoir101': 'iso8859-2', + 'isoir109': 'iso8859-3', + 'isoir110': 'iso8859-4', + 'isoir126': 'iso8859-7', + 'isoir127': 'iso8859-6', + 'isoir138': 'iso8859-8', + 'isoir144': 'iso8859-5', + 'isoir148': 'windows-1254', + 'isoir149': 'cp949', + 'isoir157': 'iso8859-10', + 'isoir199': 'iso8859-14', + 'isoir226': 'iso8859-16', + 'isoir58': 'gbk', + 'isoir6': 'ascii', + 'koi8r': 'koi8-r', + 'koi8u': 'koi8-u', + 'korean': 'cp949', + 'ksc5601': 'cp949', + 'ksc56011987': 'cp949', + 'ksc56011989': 'cp949', + 'l1': 'windows-1252', + 'l10': 'iso8859-16', + 'l2': 'iso8859-2', + 'l3': 'iso8859-3', + 'l4': 'iso8859-4', + 'l5': 'windows-1254', + 'l6': 'iso8859-10', + 'l8': 
'iso8859-14', + 'latin1': 'windows-1252', + 'latin10': 'iso8859-16', + 'latin2': 'iso8859-2', + 'latin3': 'iso8859-3', + 'latin4': 'iso8859-4', + 'latin5': 'windows-1254', + 'latin6': 'iso8859-10', + 'latin8': 'iso8859-14', + 'latin9': 'iso8859-15', + 'ms936': 'gbk', + 'mskanji': 'shift_jis', + 'pt154': 'ptcp154', + 'ptcp154': 'ptcp154', + 'r8': 'hp-roman8', + 'roman8': 'hp-roman8', + 'shiftjis': 'shift_jis', + 'tis620': 'cp874', + 'unicode11utf7': 'utf-7', + 'us': 'ascii', + 'usascii': 'ascii', + 'utf16': 'utf-16', + 'utf16be': 'utf-16-be', + 'utf16le': 'utf-16-le', + 'utf8': 'utf-8', + 'windows1250': 'cp1250', + 'windows1251': 'cp1251', + 'windows1252': 'cp1252', + 'windows1253': 'cp1253', + 'windows1254': 'cp1254', + 'windows1255': 'cp1255', + 'windows1256': 'cp1256', + 'windows1257': 'cp1257', + 'windows1258': 'cp1258', + 'windows936': 'gbk', + 'x-x-big5': 'big5'} + +tokenTypes = { + "Doctype":0, + "Characters":1, + "SpaceCharacters":2, + "StartTag":3, + "EndTag":4, + "EmptyTag":5, + "Comment":6, + "ParseError":7 +} + +tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"])) + + +prefixes = dict([(v,k) for k,v in namespaces.iteritems()]) +prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + +class DataLossWarning(UserWarning): + pass + +class ReparseException(Exception): + pass diff --git a/src/html5lib/filters/__init__.py b/src/html5lib/filters/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/html5lib/filters/_base.py b/src/html5lib/filters/_base.py new file mode 100644 index 0000000000..bca94ada40 --- /dev/null +++ b/src/html5lib/filters/_base.py @@ -0,0 +1,10 @@ + +class Filter(object): + def __init__(self, source): + self.source = source + + def __iter__(self): + return iter(self.source) + + def __getattr__(self, name): + return getattr(self.source, name) diff --git a/src/html5lib/filters/formfiller.py b/src/html5lib/filters/formfiller.py new file mode 100644 index 
# ---- src/html5lib/filters/formfiller.py (new file in this patch) ----
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#

import _base

from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

class SimpleFilter(_base.Filter):
    """Token-stream filter that seeds <input>, <select>/<option> and
    <textarea> form controls with submitted values from ``fieldStorage``
    (only ``.getlist(name)`` is called, so any cgi.FieldStorage-like
    object works).
    """

    def __init__(self, source, fieldStorage):
        _base.Filter.__init__(self, source)
        self.fieldStorage = fieldStorage

    def __iter__(self):
        # How many values have been consumed per field name so far
        # (supports fields submitted multiple times).
        field_indices = {}
        state = None  # NOTE(review): assigned but never read in this version
        field_name = None
        # NOTE(review): `field_type` is only bound inside the branches
        # below; a stream starting with an EndTag would hit the
        # `field_type is not None` test before any assignment -- verify.
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    input_value_index = -1
                    input_checked_index = -1
                    # Scan the attribute list once, remembering where
                    # value= and checked= live so they can be patched.
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i

                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""

                    if field_type in (u"checkbox", u"radio"):
                        # Check/uncheck; a submitted value is only
                        # consumed when it matches this control's value=.
                        if value_list:
                            if token["data"][input_value_index][1] == value:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                del token["data"][input_checked_index]

                    elif field_type not in (u"button", u"submit", u"reset"):
                        # Text-like inputs: overwrite or add the value= attribute.
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1

                    field_type = None
                    field_name = None

                elif name == "textarea":
                    field_type = "textarea"
                    # [::-1] so the FIRST occurrence of a duplicated
                    # attribute wins when building the dict.
                    field_name = dict((token["data"])[::-1])["name"]

                elif name == "select":
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False

                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # For single selects only the first matching
                            # option is selected; multiples may match many.
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]

            elif field_type is not None and field_name and type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        # Emit the submitted value as the textarea's text,
                        # just before its end tag.
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1

                    field_name = None

                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing

            elif field_type == "textarea":
                continue # ignore token (drop the textarea's original content)

            yield token
# ---- src/html5lib/filters/inject_meta_charset.py (new file in this patch) ----
import _base

class Filter(_base.Filter):
    """Ensure the serialized document declares ``encoding`` inside <head>.

    An existing <meta charset> or http-equiv content-type <meta> is
    rewritten in place; otherwise a <meta charset=...> is injected into
    <head>.  Tokens seen while inside <head> are buffered in ``pending``
    so the meta tag can be inserted before </head> when needed.
    """

    def __init__(self, source, encoding):
        _base.Filter.__init__(self, source)
        # May be None, in which case nothing is injected (see meta_found).
        self.encoding = encoding

    def __iter__(self):
        state = "pre_head"
        # With no target encoding there is nothing to do, so pretend a
        # meta was already found.
        meta_found = (self.encoding is None)
        pending = []

        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    content_index = -1
                    for i,(name,value) in enumerate(token["data"]):
                        if name.lower() == 'charset':
                            token["data"][i] = (u'charset', self.encoding)
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                        elif name == 'content':
                            content_index = i
                    else:
                        # for...else: only runs when no charset attribute
                        # broke out of the loop; patch the content= of an
                        # http-equiv content-type meta instead.
                        if has_http_equiv_content_type and content_index >= 0:
                            token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
                            meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into an empty (self-closed) head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": [["charset", self.encoding]]}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush the
                    # pending queue; pending[0] is the buffered <head>
                    # start tag, which must come out first.
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": [["charset", self.encoding]]}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
# ---- src/html5lib/filters/lint.py (new file in this patch) ----
from gettext import gettext
_ = gettext

import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements

from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

class LintError(Exception): pass

class Filter(_base.Filter):
    """Sanity-checks a token stream, raising LintError on malformed tokens.

    Tracks the stack of open elements and a CDATA/RCDATA/PCDATA content
    model flag so it can also reject tokens illegal in context.
    """

    def __iter__(self):
        open_elements = []
        contentModelFlag = "PCDATA"
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if type == "StartTag" and name in voidElements:
                    raise LintError(_(u"Void element reported as StartTag token: %s") % name)
                elif type == "EmptyTag" and name not in voidElements:
                    raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
                if type == "StartTag":
                    open_elements.append(name)
                # NOTE(review): this loop rebinds `name`, so the content
                # model checks just below end up testing the LAST
                # ATTRIBUTE name instead of the tag name -- looks wrong.
                for name, value in token["data"]:
                    if not isinstance(name, unicode):
                        raise LintError(_("Attribute name is not a string: %r") % name)
                    if not name:
                        raise LintError(_(u"Empty attribute name"))
                    if not isinstance(value, unicode):
                        raise LintError(_("Attribute value is not a string: %r") % value)
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
                    contentModelFlag = "RCDATA"
                elif name == "plaintext":
                    contentModelFlag = "PLAINTEXT"

            elif type == "EndTag":
                name = token["name"]
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if name in voidElements:
                    raise LintError(_(u"Void element reported as EndTag token: %s") % name)
                start_name = open_elements.pop()
                if start_name != name:
                    raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
                contentModelFlag = "PCDATA"

            elif type == "Comment":
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Comment not in PCDATA content model flag"))

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if not isinstance(data, unicode):
                    # NOTE(review): message says "Attribute name" for
                    # character data -- copy/paste slip in the message.
                    raise LintError(_("Attribute name is not a string: %r") % data)
                if not data:
                    raise LintError(_(u"%s token with empty data") % type)
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
                        # NOTE(review): the format string has no %s
                        # placeholder, so this would raise TypeError if
                        # ever triggered -- verify upstream.
                        raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)

            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                # XXX: what to do with token["data"] ?

            elif type in ("ParseError", "SerializeError"):
                pass

            else:
                raise LintError(_(u"Unknown token type: %s") % type)

            yield token

# ---- src/html5lib/filters/optionaltags.py (new file in this patch;
# ---- the definition continues in the following hunk lines) ----
import _base

class Filter(_base.Filter):
    """Drops start/end tags that the HTML spec allows to be omitted."""

    def slider(self):
        # Yield (previous, current, next) triples over the token stream;
        # previous/next are None at the stream boundaries.
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                # Keep the tag if it carries attributes or is not
                # omissible in this position.
                if (token["data"] or
                    not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # NOTE(review): `in 'html'` is a substring test, so this also
            # matches 'h', 'ht', 'tm', 'ml', ... -- presumably == was meant.
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if type in ("StartTag", "EmptyTag"):
                return True
            elif type == "EndTag":
                return next["name"] == "head"
            # NOTE(review): other token types fall through and return
            # None (falsy) from the implicit end of this branch.
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            # (continuation of Filter.is_optional_start, 'body' branch)
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceeded by another colgroup element whose
            # end tag has been omitted.
            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceeded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                  previous['name'] in ('tbody','thead','tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset', 'footer',
                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                        'header', 'hr', 'menu', 'nav', 'ol',
                                        'p', 'pre', 'section', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return type == "EndTag" or type is None
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False

# ---- src/html5lib/filters/sanitizer.py (new file in this patch) ----
import _base
from html5lib.sanitizer import HTMLSanitizerMixin

class Filter(_base.Filter, HTMLSanitizerMixin):
    """Runs every token through HTMLSanitizerMixin.sanitize_token and
    drops tokens for which the sanitizer returns a falsy value."""
    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token: yield token

# ---- src/html5lib/filters/whitespace.py (new file in this patch;
# ---- the class body continues in the following hunk lines) ----
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset

import re

import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

# One-or-more run of any HTML5 space character.
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)

class Filter(_base.Filter):
    """Collapses whitespace runs outside of space-preserving elements."""

    # Elements whose text content must keep its whitespace verbatim.
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        # Nesting depth inside space-preserving elements.
        preserve = 0
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
              and (preserve or token["name"] in self.spacePreserveElements):
                # (continuation of the whitespace Filter.__iter__ body)
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = u" "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token

def collapse_spaces(text):
    # Replace every run of space characters with a single ASCII space.
    return SPACES_REGEX.sub(' ', text)

# ---- src/html5lib/html5parser.py (new file in this patch; module
# ---- prologue and convenience functions) ----
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

try:
    any
except:
    # Implement 'any' for python 2.4 and previous
    # NOTE(review): bare except -- except NameError would be tighter.
    def any(iterable):
        for element in iterable:
            if element:
                return True
        return False

import sys

import inputstream
import tokenizer

import treebuilders
from treebuilders._base import Marker
from treebuilders import simpletree

import utils
from constants import spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
from constants import tokenTypes, ReparseException, namespaces

def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    # Convenience wrapper: parse a full document with the named tree
    # builder and return the resulting document tree.
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parse(doc, encoding=encoding)

def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
                  namespaceHTMLElements=True):
    # Convenience wrapper: parse `doc` as a fragment, i.e. as the
    # children of a `container` element, and return the fragment.
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, encoding=encoding)

# (class HTMLParser begins here; its definition continues past this view)
HTMLParser(object): + """HTML parser. Generates a tree structure from a stream of (possibly + malformed) HTML""" + + def __init__(self, tree = simpletree.TreeBuilder, + tokenizer = tokenizer.HTMLTokenizer, strict = False, + namespaceHTMLElements = True): + """ + strict - raise an exception when a parse error is encountered + + tree - a treebuilder class controlling the type of tree that will be + returned. Built in treebuilders can be accessed through + html5lib.treebuilders.getTreeBuilder(treeType) + + tokenizer - a class that provides a stream of tokens to the treebuilder. + This may be replaced for e.g. a sanitizer which converts some tags to + text + """ + + # Raise an exception on the first error encountered + self.strict = strict + + self.tree = tree(namespaceHTMLElements) + self.tokenizer_class = tokenizer + self.errors = [] + + self.phases = { + "initial": InitialPhase(self, self.tree), + "beforeHtml": BeforeHtmlPhase(self, self.tree), + "beforeHead": BeforeHeadPhase(self, self.tree), + "inHead": InHeadPhase(self, self.tree), + # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree), + "afterHead": AfterHeadPhase(self, self.tree), + "inBody": InBodyPhase(self, self.tree), + "text": TextPhase(self, self.tree), + "inTable": InTablePhase(self, self.tree), + "inTableText": InTableTextPhase(self, self.tree), + "inCaption": InCaptionPhase(self, self.tree), + "inColumnGroup": InColumnGroupPhase(self, self.tree), + "inTableBody": InTableBodyPhase(self, self.tree), + "inRow": InRowPhase(self, self.tree), + "inCell": InCellPhase(self, self.tree), + "inSelect": InSelectPhase(self, self.tree), + "inSelectInTable": InSelectInTablePhase(self, self.tree), + "inForeignContent": InForeignContentPhase(self, self.tree), + "afterBody": AfterBodyPhase(self, self.tree), + "inFrameset": InFramesetPhase(self, self.tree), + "afterFrameset": AfterFramesetPhase(self, self.tree), + "afterAfterBody": AfterAfterBodyPhase(self, self.tree), + "afterAfterFrameset": 
AfterAfterFramesetPhase(self, self.tree), + # XXX after after frameset + } + + def _parse(self, stream, innerHTML=False, container="div", + encoding=None, parseMeta=True, useChardet=True, **kwargs): + + self.innerHTMLMode = innerHTML + self.container = container + self.tokenizer = self.tokenizer_class(stream, encoding=encoding, + parseMeta=parseMeta, + useChardet=useChardet, **kwargs) + self.reset() + + while True: + try: + self.mainLoop() + break + except ReparseException, e: + self.reset() + + def reset(self): + self.tree.reset() + self.firstStartTag = False + self.errors = [] + # "quirks" / "limited quirks" / "no quirks" + self.compatMode = "no quirks" + + if self.innerHTMLMode: + self.innerHTML = self.container.lower() + + if self.innerHTML in cdataElements: + self.tokenizer.state = self.tokenizer.rcdataState + elif self.innerHTML in rcdataElements: + self.tokenizer.state = self.tokenizer.rawtextState + elif self.innerHTML == 'plaintext': + self.tokenizer.state = self.tokenizer.plaintextState + else: + # state already is data state + # self.tokenizer.state = self.tokenizer.dataState + pass + self.phase = self.phases["beforeHtml"] + self.phase.insertHtmlElement() + self.resetInsertionMode() + else: + self.innerHTML = False + self.phase = self.phases["initial"] + + self.lastPhase = None + self.secondaryPhase = None + + self.beforeRCDataPhase = None + + self.framesetOK = True + + def mainLoop(self): + (CharactersToken, + SpaceCharactersToken, + StartTagToken, + EndTagToken, + CommentToken, + DoctypeToken) = (tokenTypes["Characters"], + tokenTypes["SpaceCharacters"], + tokenTypes["StartTag"], + tokenTypes["EndTag"], + tokenTypes["Comment"], + tokenTypes["Doctype"]) + + CharactersToken = tokenTypes["Characters"] + SpaceCharactersToken = tokenTypes["SpaceCharacters"] + StartTagToken = tokenTypes["StartTag"] + EndTagToken = tokenTypes["EndTag"] + CommentToken = tokenTypes["Comment"] + DoctypeToken = tokenTypes["Doctype"] + + + for token in self.normalizedTokens(): + 
type = token["type"] + if type == CharactersToken: + self.phase.processCharacters(token) + elif type == SpaceCharactersToken: + self.phase.processSpaceCharacters(token) + elif type == StartTagToken: + self.selfClosingAcknowledged = False + self.phase.processStartTag(token) + if (token["selfClosing"] + and not self.selfClosingAcknowledged): + self.parseError("non-void-element-with-trailing-solidus", + {"name":token["name"]}) + elif type == EndTagToken: + self.phase.processEndTag(token) + elif type == CommentToken: + self.phase.processComment(token) + elif type == DoctypeToken: + self.phase.processDoctype(token) + else: + self.parseError(token["data"], token.get("datavars", {})) + + # When the loop finishes it's EOF + self.phase.processEOF() + + def normalizedTokens(self): + for token in self.tokenizer: + yield self.normalizeToken(token) + + def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): + """Parse a HTML document into a well-formed tree + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, innerHTML=False, encoding=encoding, + parseMeta=parseMeta, useChardet=useChardet) + return self.tree.getDocument() + + def parseFragment(self, stream, container="div", encoding=None, + parseMeta=False, useChardet=True): + """Parse a HTML fragment into a well-formed tree fragment + + container - name of the element we're setting the innerHTML property + if set to None, default to 'div' + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. 
If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, True, container=container, encoding=encoding) + return self.tree.getFragment() + + def parseError(self, errorcode="XXX-undefined-error", datavars={}): + # XXX The idea is to make errorcode mandatory. + self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) + if self.strict: + raise ParseError + + def normalizeToken(self, token): + """ HTML5 specific normalizations to the token stream """ + + if token["type"] == tokenTypes["StartTag"]: + token["data"] = dict(token["data"][::-1]) + + return token + + def adjustMathMLAttributes(self, token): + replacements = {"definitionurl":"definitionURL"} + for k,v in replacements.iteritems(): + if k in token["data"]: + token["data"][v] = token["data"][k] + del token["data"][k] + + def adjustSVGAttributes(self, token): + replacements = { + "attributename" : "attributeName", + "attributetype" : "attributeType", + "basefrequency" : "baseFrequency", + "baseprofile" : "baseProfile", + "calcmode" : "calcMode", + "clippathunits" : "clipPathUnits", + "contentscripttype" : "contentScriptType", + "contentstyletype" : "contentStyleType", + "diffuseconstant" : "diffuseConstant", + "edgemode" : "edgeMode", + "externalresourcesrequired" : "externalResourcesRequired", + "filterres" : "filterRes", + "filterunits" : "filterUnits", + "glyphref" : "glyphRef", + "gradienttransform" : "gradientTransform", + "gradientunits" : "gradientUnits", + "kernelmatrix" : "kernelMatrix", + "kernelunitlength" : "kernelUnitLength", + "keypoints" : "keyPoints", + "keysplines" : "keySplines", + "keytimes" : "keyTimes", + "lengthadjust" : "lengthAdjust", + "limitingconeangle" : "limitingConeAngle", + "markerheight" : "markerHeight", + "markerunits" : "markerUnits", + "markerwidth" : "markerWidth", + "maskcontentunits" : "maskContentUnits", + "maskunits" : "maskUnits", + "numoctaves" : "numOctaves", + 
"pathlength" : "pathLength", + "patterncontentunits" : "patternContentUnits", + "patterntransform" : "patternTransform", + "patternunits" : "patternUnits", + "pointsatx" : "pointsAtX", + "pointsaty" : "pointsAtY", + "pointsatz" : "pointsAtZ", + "preservealpha" : "preserveAlpha", + "preserveaspectratio" : "preserveAspectRatio", + "primitiveunits" : "primitiveUnits", + "refx" : "refX", + "refy" : "refY", + "repeatcount" : "repeatCount", + "repeatdur" : "repeatDur", + "requiredextensions" : "requiredExtensions", + "requiredfeatures" : "requiredFeatures", + "specularconstant" : "specularConstant", + "specularexponent" : "specularExponent", + "spreadmethod" : "spreadMethod", + "startoffset" : "startOffset", + "stddeviation" : "stdDeviation", + "stitchtiles" : "stitchTiles", + "surfacescale" : "surfaceScale", + "systemlanguage" : "systemLanguage", + "tablevalues" : "tableValues", + "targetx" : "targetX", + "targety" : "targetY", + "textlength" : "textLength", + "viewbox" : "viewBox", + "viewtarget" : "viewTarget", + "xchannelselector" : "xChannelSelector", + "ychannelselector" : "yChannelSelector", + "zoomandpan" : "zoomAndPan" + } + for originalName in token["data"].keys(): + if originalName in replacements: + svgName = replacements[originalName] + token["data"][svgName] = token["data"][originalName] + del token["data"][originalName] + + def adjustForeignAttributes(self, token): + replacements = { + "xlink:actuate":("xlink", "actuate", namespaces["xlink"]), + "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]), + "xlink:href":("xlink", "href", namespaces["xlink"]), + "xlink:role":("xlink", "role", namespaces["xlink"]), + "xlink:show":("xlink", "show", namespaces["xlink"]), + "xlink:title":("xlink", "title", namespaces["xlink"]), + "xlink:type":("xlink", "type", namespaces["xlink"]), + "xml:base":("xml", "base", namespaces["xml"]), + "xml:lang":("xml", "lang", namespaces["xml"]), + "xml:space":("xml", "space", namespaces["xml"]), + "xmlns":(None, "xmlns", 
namespaces["xmlns"]), + "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"]) + } + + for originalName in token["data"].iterkeys(): + if originalName in replacements: + foreignName = replacements[originalName] + token["data"][foreignName] = token["data"][originalName] + del token["data"][originalName] + + def resetInsertionMode(self): + # The name of this method is mostly historical. (It's also used in the + # specification.) + last = False + newModes = { + "select":"inSelect", + "td":"inCell", + "th":"inCell", + "tr":"inRow", + "tbody":"inTableBody", + "thead":"inTableBody", + "tfoot":"inTableBody", + "caption":"inCaption", + "colgroup":"inColumnGroup", + "table":"inTable", + "head":"inBody", + "body":"inBody", + "frameset":"inFrameset" + } + for node in self.tree.openElements[::-1]: + nodeName = node.name + if node == self.tree.openElements[0]: + last = True + if nodeName not in ['td', 'th']: + # XXX + assert self.innerHTML + nodeName = self.innerHTML + # Check for conditions that should only happen in the innerHTML + # case + if nodeName in ("select", "colgroup", "head", "frameset"): + # XXX + assert self.innerHTML + if nodeName in newModes: + self.phase = self.phases[newModes[nodeName]] + break + elif node.namespace in (namespaces["mathml"], namespaces["svg"]): + self.phase = self.phases["inForeignContent"] + self.secondaryPhase = self.phases["inBody"] + break + elif nodeName == "html": + if self.tree.headPointer is None: + self.phase = self.phases["beforeHead"] + else: + self.phase = self.phases["afterHead"] + break + elif last: + self.phase = self.phases["inBody"] + break + + def parseRCDataRawtext(self, token, contentType): + """Generic RCDATA/RAWTEXT Parsing algorithm + contentType - RCDATA or RAWTEXT + """ + assert contentType in ("RAWTEXT", "RCDATA") + + element = self.tree.insertElement(token) + + if contentType == "RAWTEXT": + self.tokenizer.state = self.tokenizer.rawtextState + else: + self.tokenizer.state = self.tokenizer.rcdataState + + 
self.originalPhase = self.phase + + self.phase = self.phases["text"] + +class Phase(object): + """Base class for helper object that implements each phase of processing + """ + # Order should be (they can be omitted): + # * EOF + # * Comment + # * Doctype + # * SpaceCharacters + # * Characters + # * StartTag + # - startTag* methods + # * EndTag + # - endTag* methods + + def __init__(self, parser, tree): + self.parser = parser + self.tree = tree + + def processEOF(self): + raise NotImplementedError + + def processComment(self, token): + # For most phases the following is correct. Where it's not it will be + # overridden. + self.tree.insertComment(token, self.tree.openElements[-1]) + + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) + + def processStartTag(self, token): + self.startTagHandler[token["name"]](token) + + def startTagHtml(self, token): + if self.parser.firstStartTag == False and token["name"] == "html": + self.parser.parseError("non-html-root") + # XXX Need a check here to see if the first start tag token emitted is + # this token... If it's not, invoke self.parser.parseError(). 
+ for attr, value in token["data"].iteritems(): + if attr not in self.tree.openElements[0].attributes: + self.tree.openElements[0].attributes[attr] = value + self.parser.firstStartTag = False + + def processEndTag(self, token): + self.endTagHandler[token["name"]](token) + +class InitialPhase(Phase): + def processSpaceCharacters(self, token): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId != None or + systemId != None and systemId != "about:legacy-compat"): + self.parser.parseError("unknown-doctype") + + if publicId is None: + publicId = "" + + self.tree.insertDoctype(token) + + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if (not correct or token["name"] != "html" + or publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 
html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) + or publicId in + ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") + or publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId == None + or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + self.parser.compatMode = "quirks" + elif (publicId.startswith( + ("-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//")) + or publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId != None): + self.parser.compatMode = "limited quirks" + + 
self.parser.phase = self.parser.phases["beforeHtml"] + + def anythingElse(self): + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.anythingElse() + self.parser.phase.processCharacters(token) + + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.anythingElse() + self.parser.phase.processStartTag(token) + + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.anythingElse() + self.parser.phase.processEndTag(token) + + def processEOF(self): + self.parser.parseError("expected-doctype-but-got-eof") + self.anythingElse() + self.parser.phase.processEOF() + + +class BeforeHtmlPhase(Phase): + # helper methods + def insertHtmlElement(self): + self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.parser.phase = self.parser.phases["beforeHead"] + + # other + def processEOF(self): + self.insertHtmlElement() + self.parser.phase.processEOF() + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.insertHtmlElement() + self.parser.phase.processCharacters(token) + + def processStartTag(self, token): + if token["name"] == "html": + self.parser.firstStartTag = True + self.insertHtmlElement() + self.parser.phase.processStartTag(token) + + def processEndTag(self, token): + if token["name"] not in ("head", "body", "html", "br"): + self.parser.parseError("unexpected-end-tag-before-html", + {"name": token["name"]}) + else: + self.insertHtmlElement() + self.parser.phase.processEndTag(token) + + +class BeforeHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + 
("html", self.startTagHtml), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + (("head", "body", "html", "br"), self.endTagImplyHead) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processEOF() + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagHead(self, token): + self.tree.insertElement(token) + self.tree.headPointer = self.tree.openElements[-1] + self.parser.phase = self.parser.phases["inHead"] + + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processStartTag(token) + + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) + +class InHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("title", self.startTagTitle), + (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), + ("script", self.startTagScript), + (("base", "link", "command"), + self.startTagBaseLinkCommand), + ("meta", self.startTagMeta), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + + self. 
endTagHandler = utils.MethodDispatcher([ + ("head", self.endTagHead), + (("br", "html", "body"), self.endTagHtmlBodyBr) + ]) + self.endTagHandler.default = self.endTagOther + + # helper + def appendToHead(self, element): + if self.tree.headPointer is not None: + self.tree.headPointer.appendChild(element) + else: + assert self.parser.innerHTML + self.tree.openElementsw[-1].appendChild(element) + + # the real thing + def processEOF (self): + self.anythingElse() + self.parser.phase.processEOF() + + def processCharacters(self, token): + self.anythingElse() + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") + + def startTagBaseLinkCommand(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif "content" in attributes: + # Encoding it as UTF-8 here is a hack, as really we should pass + # the abstract Unicode string, and just use the + # ContentAttrParser on that, but using UTF-8 allows all chars + # to be encoded and as a ASCII-superset works. 
+ data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataRawtext(token, "RCDATA") + + def startTagNoScriptNoFramesStyle(self, token): + #Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagScript(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState + self.parser.originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["text"] + + def startTagOther(self, token): + self.anythingElse() + self.parser.phase.processStartTag(token) + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s"%node.name + self.parser.phase = self.parser.phases["afterHead"] + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.endTagHead(impliedTagToken("head")) + + +# XXX If we implement a parser for which scripting is disabled we need to +# implement this phase. 
+# +# class InHeadNoScriptPhase(Phase): + +class AfterHeadPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("body", self.startTagBody), + ("frameset", self.startTagFrameset), + (("base", "link", "meta", "noframes", "script", "style", "title"), + self.startTagFromHead), + ("head", self.startTagHead) + ]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), + self.endTagHtmlBodyBr)]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.anythingElse() + self.parser.phase.processEOF() + + def processCharacters(self, token): + self.anythingElse() + self.parser.phase.processCharacters(token) + + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inBody"] + + def startTagFrameset(self, token): + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break + + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name":token["name"]}) + + def startTagOther(self, token): + self.anythingElse() + self.parser.phase.processStartTag(token) + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name":token["name"]}) + + def anythingElse(self): + self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.parser.phase = 
self.parser.phases["inBody"] + self.parser.framesetOK = True + + +class InBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody + # the really-really-really-very crazy mode + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + #Keep a ref to this for special handling of whitespace in <pre> + self.processSpaceCharactersNonPre = self.processSpaceCharacters + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("base", "command", "link", "meta", "noframes", "script", "style", + "title"), self.startTagProcessInHead), + ("body", self.startTagBody), + ("frameset", self.startTagFrameset), + (("address", "article", "aside", "blockquote", "center", "datagrid", + "details", "dir", "div", "dl", "fieldset", "figure", + "footer", "header", "hgroup", "menu", "nav", "ol", "p", + "section", "ul"), + self.startTagCloseP), + (("pre", "listing"), self.startTagPreListing), + ("form", self.startTagForm), + (("li", "dd", "dt"), self.startTagListItem), + ("plaintext",self.startTagPlaintext), + (headingElements, self.startTagHeading), + ("a", self.startTagA), + (("b", "big", "code", "em", "font", "i", "s", "small", "strike", + "strong", "tt", "u"),self.startTagFormatting), + ("nobr", self.startTagNobr), + ("button", self.startTagButton), + (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), + ("xmp", self.startTagXmp), + ("table", self.startTagTable), + (("area", "basefont", "bgsound", "br", "embed", "img", "input", + "keygen", "spacer", "wbr"), self.startTagVoidFormatting), + (("param", "source"), self.startTagParamSource), + ("hr", self.startTagHr), + ("image", self.startTagImage), + ("isindex", self.startTagIsIndex), + ("textarea", self.startTagTextarea), + ("iframe", self.startTagIFrame), + (("noembed", "noframes", "noscript"), self.startTagRawtext), + ("select", self.startTagSelect), + (("rp", "rt"), self.startTagRpRt), + (("option", "optgroup"), self.startTagOpt), + 
(("math"), self.startTagMath), + (("svg"), self.startTagSvg), + (("caption", "col", "colgroup", "frame", "head", + "tbody", "td", "tfoot", "th", "thead", + "tr"), self.startTagMisplaced) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("body",self.endTagBody), + ("html",self.endTagHtml), + (("address", "article", "aside", "blockquote", "center", "datagrid", + "details", "dir", "div", "dl", "fieldset", "figure", + "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", + "section", "ul"), self.endTagBlock), + ("form", self.endTagForm), + ("p",self.endTagP), + (("dd", "dt", "li"), self.endTagListItem), + (headingElements, self.endTagHeading), + (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", + "strike", "strong", "tt", "u"), self.endTagFormatting), + (("applet", "button", "marquee", "object"), self.endTagAppletButtonMarqueeObject), + ("br", self.endTagBr), + ]) + self.endTagHandler.default = self.endTagOther + + # helper + def addFormattingElement(self, token): + self.tree.insertElement(token) + self.tree.activeFormattingElements.append( + self.tree.openElements[-1]) + + # the real deal + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + #Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we + # want to drop leading newlines + data = token["data"] + self.processSpaceCharacters = self.processSpaceCharactersNonPre + if (data.startswith("\n") and + self.tree.openElements[-1].name in ("pre", "listing", "textarea") + and not self.tree.openElements[-1].hasContent()): + data = data[1:] + if data: + self.tree.reconstructActiveFormattingElements() + 
self.tree.insertText(data) + + def processCharacters(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) + self.parser.framesetOK = False + + def processSpaceCharacters(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) + + def startTagProcessInHead(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) + if (len(self.tree.openElements) == 1 + or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + else: + for attr, value in token["data"].iteritems(): + if attr not in self.tree.openElements[1].attributes: + self.tree.openElements[1].attributes[attr] = value + + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagCloseP(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + + def startTagPreListing(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.processSpaceCharacters = self.processSpaceCharactersDropNewline + + def startTagForm(self, token): + if self.tree.formPointer: + self.parser.parseError(u"unexpected-start-tag", {"name": "form"}) + else: + if self.tree.elementInScope("p"): + self.endTagP("p") + self.tree.insertElement(token) + 
self.tree.formPointer = self.tree.openElements[-1] + + def startTagListItem(self, token): + self.parser.framesetOK = False + + stopNamesMap = {"li":["li"], + "dt":["dt", "dd"], + "dd":["dt", "dd"]} + stopNames = stopNamesMap[token["name"]] + for node in reversed(self.tree.openElements): + if node.name in stopNames: + self.parser.phase.processEndTag( + impliedTagToken(node.name, "EndTag")) + break + if (node.nameTuple in (scopingElements | specialElements) and + node.name not in ("address", "div", "p")): + break + + if self.tree.elementInScope("p"): + self.parser.phase.processEndTag( + impliedTagToken("p", "EndTag")) + + self.tree.insertElement(token) + + def startTagPlaintext(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.plaintextState + + def startTagHeading(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagA(self, token): + afeAElement = self.tree.elementInActiveFormattingElements("a") + if afeAElement: + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) + if afeAElement in self.tree.openElements: + self.tree.openElements.remove(afeAElement) + if afeAElement in self.tree.activeFormattingElements: + self.tree.activeFormattingElements.remove(afeAElement) + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagNobr(self, token): + self.tree.reconstructActiveFormattingElements() + if self.tree.elementInScope("nobr"): + 
self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) + # XXX Need tests that trigger the following + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagButton(self, token): + if self.tree.elementInScope("button"): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + self.parser.phase.processStartTag(token) + else: + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False + + def startTagAppletMarqueeObject(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False + + def startTagXmp(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + self.tree.reconstructActiveFormattingElements() + self.parser.framesetOK = False + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.parser.phase = self.parser.phases["inTable"] + + def startTagVoidFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagParamSource(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagHr(self, token): + if self.tree.elementInScope("p"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + 
self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagImage(self, token): + # No really... + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) + + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) + if self.tree.formPointer: + return + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) + # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Insert your search keywords here: " + self.processCharacters( + {"type":tokenTypes["Characters"], "data":prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] + attributes["name"] = "isindex" + self.processStartTag(impliedTagToken("input", "StartTag", + attributes = attributes, + selfClosing = + token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) + + def startTagTextarea(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.rcdataState + self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False + + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagRawtext(token) + + def startTagRawtext(self, token): + """iframe, noembed noframes, noscript(if scripting enabled)""" + 
self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagOpt(self, token): + if self.tree.elementInScope("option"): + self.parser.phase.processEndTag(impliedTagToken("option")) + self.tree.reconstructActiveFormattingElements() + self.parser.tree.insertElement(token) + + def startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] + + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + while self.tree.openElements[-1].name != "ruby": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + #Need to get the parse 
error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMisplaced(self, token): + """ Elements that should be children of other elements that have a + different insertion mode; here they are ignored + "caption", "col", "colgroup", "frame", "frameset", "head", + "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", + "tr", "noscript" + """ + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) + + def startTagOther(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + + def endTagP(self, token): + if not self.tree.elementInScope("p"): + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + self.endTagP(impliedTagToken("p", "EndTag")) + else: + self.tree.generateImpliedEndTags("p") + if self.tree.openElements[-1].name != "p": + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + node = self.tree.openElements.pop() + while node.name != "p": + node = self.tree.openElements.pop() + + def endTagBody(self, token): + if not self.tree.elementInScope("body"): + self.parser.parseError() + return + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "optgroup", + "option", "p", "rp", "rt", + "tbody", "td", "tfoot", + "th", "thead", "tr", "body", + "html")): + #Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"expectedName": "body", "gotName": node.name}) + break + self.parser.phase = self.parser.phases["afterBody"] + + def endTagHtml(self, 
token): + #We repeat the test for the body end tag token being ignored here + if self.tree.elementInScope("body"): + self.endTagBody(impliedTagToken("body")) + self.parser.phase.processEndTag(token) + + def endTagBlock(self, token): + #Put us back in the right whitespace handling mode + if token["name"] == "pre": + self.processSpaceCharacters = self.processSpaceCharactersNonPre + inScope = self.tree.elementInScope(token["name"]) + if inScope: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + if inScope: + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() + + def endTagForm(self, token): + node = self.tree.formPointer + self.tree.formPointer = None + if node is None or not self.tree.elementInScope(node): + self.parser.parseError("unexpected-end-tag", + {"name":"form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) + + def endTagListItem(self, token): + if token["name"] == "li": + variant = "list" + else: + variant = None + if not self.tree.elementInScope(token["name"], variant=variant): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + else: + self.tree.generateImpliedEndTags(exclude = token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError( + "end-tag-too-early", + {"name": token["name"]}) + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() + + def endTagHeading(self, token): + for item in headingElements: + if self.tree.elementInScope(item): + self.tree.generateImpliedEndTags() + break + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + + for 
item in headingElements: + if self.tree.elementInScope(item): + item = self.tree.openElements.pop() + while item.name not in headingElements: + item = self.tree.openElements.pop() + break + + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" + # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency + # XXX Better parseError messages appreciated. + name = token["name"] + while True: + # Step 1 paragraph 1 + formattingElement = self.tree.elementInActiveFormattingElements( + token["name"]) + if not formattingElement or (formattingElement in + self.tree.openElements and + not self.tree.elementInScope( + formattingElement.name)): + self.parser.parseError("adoption-agency-1.1", {"name": token["name"]}) + return + + # Step 1 paragraph 2 + elif formattingElement not in self.tree.openElements: + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) + self.tree.activeFormattingElements.remove(formattingElement) + return + + # Step 1 paragraph 3 + if formattingElement != self.tree.openElements[-1]: + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) + + # Step 2 + # Start of the adoption agency algorithm proper + afeIndex = self.tree.openElements.index(formattingElement) + furthestBlock = None + for element in self.tree.openElements[afeIndex:]: + if (element.nameTuple in + specialElements | scopingElements): + furthestBlock = element + break + + # Step 3 + if furthestBlock is None: + element = self.tree.openElements.pop() + while element != formattingElement: + element = self.tree.openElements.pop() + self.tree.activeFormattingElements.remove(element) + return + commonAncestor = self.tree.openElements[afeIndex-1] + + # Step 5 + #if furthestBlock.parent: + # furthestBlock.parent.removeChild(furthestBlock) + + # Step 5 + # The bookmark is supposed to help us identify where to reinsert + # nodes in step 12. 
We have to ensure that we reinsert nodes after + # the node before the active formatting element. Note the bookmark + # can move in step 7.4 + bookmark = self.tree.activeFormattingElements.index(formattingElement) + + # Step 6 + lastNode = node = furthestBlock + while True: + # AT replace this with a function and recursion? + # Node is element before node in open elements + node = self.tree.openElements[ + self.tree.openElements.index(node)-1] + while node not in self.tree.activeFormattingElements: + tmpNode = node + node = self.tree.openElements[ + self.tree.openElements.index(node)-1] + self.tree.openElements.remove(tmpNode) + # Step 6.3 + if node == formattingElement: + break + # Step 6.4 + if lastNode == furthestBlock: + bookmark = (self.tree.activeFormattingElements.index(node) + + 1) + # Step 6.5 + #cite = node.parent + #if node.hasContent(): + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + + # Step 6.6 + # Remove lastNode from its parents, if any + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + node.appendChild(lastNode) + # Step 7.7 + lastNode = node + # End of inner loop + + # Step 7 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster parent the + # lastNode + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + commonAncestor.appendChild(lastNode) + + # Step 8 + clone = formattingElement.cloneNode() + + # Step 9 + furthestBlock.reparentChildren(clone) + + # Step 10 + furthestBlock.appendChild(clone) + + # Step 11 + self.tree.activeFormattingElements.remove(formattingElement) + self.tree.activeFormattingElements.insert(bookmark, clone) + + # Step 12 + self.tree.openElements.remove(formattingElement) + self.tree.openElements.insert( + self.tree.openElements.index(furthestBlock) + 1, clone) + + 
def endTagAppletButtonMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + + if self.tree.elementInScope(token["name"]): + element = self.tree.openElements.pop() + while element.name != token["name"]: + element = self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(impliedTagToken("br", "StartTag")) + self.tree.openElements.pop() + + def endTagOther(self, token): + for node in self.tree.openElements[::-1]: + if node.name == token["name"]: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + break + else: + if (node.nameTuple in + specialElements | scopingElements): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break + +class TextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([ + ("script", self.endTagScript)]) + self.endTagHandler.default = self.endTagOther + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + self.tree.openElements[-1].name) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + self.parser.phase.processEOF() + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in RCDATA/RAWTEXT 
mode"%name + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + #The rest of this method is all stuff that only happens if + #document.write works + + def endTagOther(self, token): + node = self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + +class InTablePhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("caption", self.startTagCaption), + ("colgroup", self.startTagColgroup), + ("col", self.startTagCol), + (("tbody", "tfoot", "thead"), self.startTagRowGroup), + (("td", "th", "tr"), self.startTagImplyTbody), + ("table", self.startTagTable), + (("style", "script"), self.startTagStyleScript), + ("input", self.startTagInput), + ("form", self.startTagForm) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("table", self.endTagTable), + (("body", "caption", "col", "colgroup", "html", "tbody", "td", + "tfoot", "th", "thead", "tr"), self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods + def clearStackToTableContext(self): + # "clear the stack back to a table context" + while self.tree.openElements[-1].name not in ("table", "html"): + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + # When the current node is <html> it's an innerHTML case + + def getCurrentTable(self): + i = -1 + while -i <= len(self.tree.openElements) and self.tree.openElements[i].name != "table": + i -= 1 + if -i > len(self.tree.openElements): + return self.tree.openElements[0] + else: + return self.tree.openElements[i] + + # processing methods + def processEOF(self): + if self.tree.openElements[-1].name != 
"html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + #Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.characterTokens.append(token) + + def processCharacters(self, token): + #If we get here there must be at least one non-whitespace character + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processCharacters(token) + self.tree.insertFromTable = False + + def startTagCaption(self, token): + self.clearStackToTableContext() + self.tree.activeFormattingElements.append(Marker) + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCaption"] + + def startTagColgroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inColumnGroup"] + + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + self.parser.phase.processStartTag(token) + + def startTagRowGroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inTableBody"] + + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + self.parser.phase.processStartTag(token) + + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) + if not self.parser.innerHTML: + self.parser.phase.processStartTag(token) + + def startTagStyleScript(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + 
self.parser.parseError("unexpected-hidden-input-in-table") + self.tree.insertElement(token) + # XXX associate with form + self.tree.openElements.pop() + else: + self.startTagOther(token) + + def startTagForm(self, token): + self.parser.parseError("unexpected-form-in-table") + self.tree.insertElement(token) + self.tree.openElements.pop() + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processStartTag(token) + self.tree.insertFromTable = False + + def endTagTable(self, token): + if self.tree.elementInScope("table", variant="table"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "table": + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "table": + self.tree.openElements.pop() + self.tree.openElements.pop() + self.parser.resetInsertionMode() + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! 
+ self.tree.insertFromTable = True + self.parser.phases["inBody"].processEndTag(token) + self.tree.insertFromTable = False + +class InTableTextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any([item not in spaceCharacters for item in data]): + token = {"type":tokenTypes["Characters"], "data":data} + self.originalPhase.processCharacters(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processComment(token) + + def processEOF(self): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEOF() + + def processCharacters(self, token): + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + #pretty sure we should never reach here + self.characterTokens.append(token) +# assert False + + def processStartTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processStartTag(token) + + def processEndTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEndTag(token) + + +class InCaptionPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-caption + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.startTagTableElement) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("caption", self.endTagCaption), + ("table", self.endTagTable), + (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.endTagIgnore) + ]) + 
self.endTagHandler.default = self.endTagOther + + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", variant="table") + + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) + + def startTagTableElement(self, token): + self.parser.parseError() + #XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + self.parser.phase.processStartTag(token) + + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def endTagCaption(self, token): + if not self.ignoreEndTagCaption(): + # AT this code is quite similar to endTagTable in "InTable" + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "caption": + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "caption": + self.tree.openElements.pop() + self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inTable"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagTable(self, token): + self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + self.parser.phase.processEndTag(token) + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) + + +class InColumnGroupPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-column + + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + 
self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("col", self.startTagCol) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("colgroup", self.endTagColgroup), + ("col", self.endTagCol) + ]) + self.endTagHandler.default = self.endTagOther + + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" + + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processEOF() + + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + self.parser.phase.processCharacters(token) + + def startTagCol(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + + def startTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processStartTag(token) + + def endTagColgroup(self, token): + if self.ignoreEndTagColgroup(): + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + else: + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] + + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) + + def endTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processEndTag(token) + + +class InTableBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("tr", self.startTagTr), + (("td", "th"), self.startTagTableCell), + (("caption", 
"col", "colgroup", "tbody", "tfoot", "thead"), + self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), + ("table", self.endTagTable), + (("body", "caption", "col", "colgroup", "html", "td", "th", + "tr"), self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods + def clearStackToTableBodyContext(self): + while self.tree.openElements[-1].name not in ("tbody", "tfoot", + "thead", "html"): + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.phases["inTable"].processCharacters(token) + + def startTagTr(self, token): + self.clearStackToTableBodyContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inRow"] + + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + self.parser.phase.processStartTag(token) + + def startTagTableOther(self, token): + # XXX AT Any ideas on how to share this with endTagTable? 
+ if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processStartTag(token) + else: + # innerHTML case + self.parser.parseError() + + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) + + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.clearStackToTableBodyContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] + else: + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) + + def endTagTable(self, token): + if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processEndTag(token) + else: + # innerHTML case + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) + + +class InRowPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-row + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("td", "th"), self.startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead", + "tr"), self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("tr", self.endTagTr), + ("table", self.endTagTable), + (("tbody", "tfoot", 
"thead"), self.endTagTableRowGroup), + (("body", "caption", "col", "colgroup", "html", "td", "th"), + self.endTagIgnore) + ]) + self.endTagHandler.default = self.endTagOther + + # helper methods (XXX unify this with other table helper methods) + def clearStackToTableRowContext(self): + while self.tree.openElements[-1].name not in ("tr", "html"): + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", variant="table") + + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.phases["inTable"].processCharacters(token) + + def startTagTableCell(self, token): + self.clearStackToTableRowContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCell"] + self.tree.activeFormattingElements.append(Marker) + + def startTagTableOther(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr("tr") + # XXX how are we sure it's always ignored in the innerHTML case? + if not ignoreEndTag: + self.parser.phase.processStartTag(token) + + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) + + def endTagTr(self, token): + if not self.ignoreEndTagTr(): + self.clearStackToTableRowContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTableBody"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + + def endTagTable(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr("tr") + # Reprocess the current tag if the tr end tag was not ignored + # XXX how are we sure it's always ignored in the innerHTML case? 
+ if not ignoreEndTag: + self.parser.phase.processEndTag(token) + + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagTr("tr") + self.parser.phase.processEndTag(token) + else: + # innerHTML case + self.parser.parseError() + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) + +class InCellPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-cell + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), self.startTagTableOther) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + (("td", "th"), self.endTagTableCell), + (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), + (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) + ]) + self.endTagHandler.default = self.endTagOther + + # helper + def closeCell(self): + if self.tree.elementInScope("td", variant="table"): + self.endTagTableCell(impliedTagToken("td")) + elif self.tree.elementInScope("th", variant="table"): + self.endTagTableCell(impliedTagToken("th")) + + # the rest + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) + + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", variant="table") or + self.tree.elementInScope("th", variant="table")): + self.closeCell() + self.parser.phase.processStartTag(token) + else: + # innerHTML case + self.parser.parseError() + + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) + # Optimize this for subsequent 
invocations. Can't do this initially + # because self.phases doesn't really exist at that point. + self.startTagHandler.default =\ + self.parser.phases["inBody"].processStartTag + + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", + {"name": token["name"]}) + while True: + node = self.tree.openElements.pop() + if node.name == token["name"]: + break + else: + self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inRow"] + else: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.closeCell() + self.parser.phase.processEndTag(token) + else: + # sometimes innerHTML case + self.parser.parseError() + + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) + # Optimize this for subsequent invocations. Can't do this initially + # because self.phases doesn't really exist at that point. 
+ self.endTagHandler.default = self.parser.phases["inBody"].processEndTag + + +class InSelectPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("option", self.startTagOption), + ("optgroup", self.startTagOptgroup), + ("select", self.startTagSelect), + (("input", "keygen", "textarea"), self.startTagInput) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("option", self.endTagOption), + ("optgroup", self.endTagOptgroup), + ("select", self.endTagSelect), + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", + "th"), self.endTagTableElements) + ]) + self.endTagHandler.default = self.endTagOther + + # http://www.whatwg.org/specs/web-apps/current-work/#in-select + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def startTagOption(self, token): + # We need to imply </option> if <option> is the current node. 
+ if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagOptgroup(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagSelect(self, token): + self.parser.parseError("unexpected-select-in-select") + self.endTagSelect("select") + + def startTagInput(self, token): + self.parser.parseError("unexpected-input-in-select") + if self.tree.elementInScope("select", variant="table"): + self.endTagSelect("select") + self.parser.phase.processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-select", + {"name": token["name"]}) + + def endTagOption(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "option"}) + + def endTagOptgroup(self, token): + # </optgroup> implicitly closes <option> + if (self.tree.openElements[-1].name == "option" and + self.tree.openElements[-2].name == "optgroup"): + self.tree.openElements.pop() + # It also closes </optgroup> + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + # But nothing else + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "optgroup"}) + + def endTagSelect(self, token): + if self.tree.elementInScope("select", variant="table"): + node = self.tree.openElements.pop() + while node.name != "select": + node = self.tree.openElements.pop() + self.parser.resetInsertionMode() + else: + # innerHTML case + self.parser.parseError() + + def endTagTableElements(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagSelect("select") + 
self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) + + +class InSelectInTablePhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.startTagTable) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.endTagTable) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.parser.phases["inSelect"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inSelect"].processCharacters(token) + + def startTagTable(self, token): + self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) + self.endTagOther(impliedTagToken("select")) + self.parser.phase.processStartTag(token) + + def startTagOther(self, token): + self.parser.phases["inSelect"].processStartTag(token) + + def endTagTable(self, token): + self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagOther(impliedTagToken("select")) + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.phases["inSelect"].processEndTag(token) + + +class InForeignContentPhase(Phase): + breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", + "center", "code", "dd", "div", "dl", "dt", + "em", "embed", "font", "h1", "h2", "h3", + "h4", "h5", "h6", "head", "hr", "i", "img", + "li", "listing", "menu", "meta", "nobr", + "ol", "p", "pre", "ruby", "s", "small", + "span", "strong", "strike", "sub", "sup", + "table", "tt", "u", "ul", "var"]) + def __init__(self, parser, tree): + 
Phase.__init__(self, parser, tree) + + def nonHTMLElementInScope(self): + for element in self.tree.openElements[::-1]: + if element.namespace == self.tree.defaultNamespace: + return self.tree.elementInScope(element) + assert False + for item in self.tree.openElements[::-1]: + if item.namespace == self.tree.defaultNamespace: + return True + elif item.nameTuple in scopingElements: + return False + return False + + def adjustSVGTagNames(self, token): + replacements = {"altglyph":"altGlyph", + "altglyphdef":"altGlyphDef", + "altglyphitem":"altGlyphItem", + "animatecolor":"animateColor", + "animatemotion":"animateMotion", + "animatetransform":"animateTransform", + "clippath":"clipPath", + "feblend":"feBlend", + "fecolormatrix":"feColorMatrix", + "fecomponenttransfer":"feComponentTransfer", + "fecomposite":"feComposite", + "feconvolvematrix":"feConvolveMatrix", + "fediffuselighting":"feDiffuseLighting", + "fedisplacementmap":"feDisplacementMap", + "fedistantlight":"feDistantLight", + "feflood":"feFlood", + "fefunca":"feFuncA", + "fefuncb":"feFuncB", + "fefuncg":"feFuncG", + "fefuncr":"feFuncR", + "fegaussianblur":"feGaussianBlur", + "feimage":"feImage", + "femerge":"feMerge", + "femergenode":"feMergeNode", + "femorphology":"feMorphology", + "feoffset":"feOffset", + "fepointlight":"fePointLight", + "fespecularlighting":"feSpecularLighting", + "fespotlight":"feSpotLight", + "fetile":"feTile", + "feturbulence":"feTurbulence", + "foreignobject":"foreignObject", + "glyphref":"glyphRef", + "lineargradient":"linearGradient", + "radialgradient":"radialGradient", + "textpath":"textPath"} + + if token["name"] in replacements: + token["name"] = replacements[token["name"]] + + def processCharacters(self, token): + self.parser.framesetOK = False + Phase.processCharacters(self, token) + + def processEOF(self): + pass + + def processStartTag(self, token): + currentNode = self.tree.openElements[-1] + if (currentNode.namespace == self.tree.defaultNamespace or + (currentNode.namespace == 
namespaces["mathml"] and + token["name"] not in frozenset(["mglyph", "malignmark"]) and + currentNode.name in frozenset(["mi", "mo", "mn", + "ms", "mtext"])) or + (currentNode.namespace == namespaces["mathml"] and + currentNode.name == "annotation-xml" and + token["name"] == "svg") or + (currentNode.namespace == namespaces["svg"] and + currentNode.name in frozenset(["foreignObject", + "desc", "title"]) + )): + assert self.parser.secondaryPhase != self + self.parser.secondaryPhase.processStartTag(token) + if self.parser.phase == self and self.nonHTMLElementInScope(): + self.parser.phase = self.parser.secondaryPhase + elif token["name"] in self.breakoutElements: + self.parser.parseError("unexpected-html-element-in-foreign-content", + token["name"]) + while (self.tree.openElements[-1].namespace != + self.tree.defaultNamespace): + self.tree.openElements.pop() + self.parser.phase = self.parser.secondaryPhase + self.parser.phase.processStartTag(token) + else: + if currentNode.namespace == namespaces["mathml"]: + self.parser.adjustMathMLAttributes(token) + elif currentNode.namespace == namespaces["svg"]: + self.adjustSVGTagNames(token) + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = currentNode.namespace + self.tree.insertElement(token) + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def processEndTag(self, token): + self.adjustSVGTagNames(token) + self.parser.secondaryPhase.processEndTag(token) + if self.parser.phase == self and self.nonHTMLElementInScope(): + self.parser.phase = self.parser.secondaryPhase + +class AfterBodyPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) + self.endTagHandler.default = 
self.endTagOther + + def processEOF(self): + #Stop parsing + pass + + def processComment(self, token): + # This is needed because data is to be appended to the <html> element + # here and not to whatever is currently open. + self.tree.insertComment(token, self.tree.openElements[0]) + + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-body") + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processStartTag(token) + + def endTagHtml(self,name): + if self.parser.innerHTML: + self.parser.parseError("unexpected-end-tag-after-body-innerhtml") + else: + self.parser.phase = self.parser.phases["afterAfterBody"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processEndTag(token) + +class InFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("frameset", self.startTagFrameset), + ("frame", self.startTagFrame), + ("noframes", self.startTagNoframes) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("frameset", self.endTagFrameset), + ("noframes", self.endTagNoframes) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-frameset") + else: + assert self.parser.innerHTML + + def processCharacters(self, token): + 
self.parser.parseError("unexpected-char-in-frameset") + + def startTagFrameset(self, token): + self.tree.insertElement(token) + + def startTagFrame(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + + def startTagNoframes(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-frameset", + {"name": token["name"]}) + + def endTagFrameset(self, token): + if self.tree.openElements[-1].name == "html": + # innerHTML case + self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") + else: + self.tree.openElements.pop() + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): + # If we're not in innerHTML mode and the the current node is not a + # "frameset" element (anymore) then switch. + self.parser.phase = self.parser.phases["afterFrameset"] + + def endTagNoframes(self, token): + self.parser.phases["inBody"].processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-frameset", + {"name": token["name"]}) + + +class AfterFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#after3 + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("noframes", self.startTagNoframes) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("html", self.endTagHtml) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + #Stop parsing + pass + + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-frameset") + + def startTagNoframes(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-frameset", + {"name": token["name"]}) + + def endTagHtml(self, 
token): + self.parser.phase = self.parser.phases["afterAfterFrameset"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-frameset", + {"name": token["name"]}) + + +class AfterAfterBodyPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processStartTag(token) + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processEndTag(token) + +class AfterAfterFramesetPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("noframes", self.startTagNoFrames) + ]) + self.startTagHandler.default = self.startTagOther + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + 
self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagNoFrames(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processStartTag(token) + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processEndTag(token) + +def impliedTagToken(name, type="EndTag", attributes = None, + selfClosing = False): + if attributes is None: + attributes = {} + return {"type":tokenTypes[type], "name":name, "data":attributes, + "selfClosing":selfClosing} + +class ParseError(Exception): + """Error in parsed document""" + pass diff --git a/src/html5lib/ihatexml.py b/src/html5lib/ihatexml.py new file mode 100644 index 0000000000..dd78563908 --- /dev/null +++ b/src/html5lib/ihatexml.py @@ -0,0 +1,177 @@ +import re + +baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | 
[#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | 
[#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]""" + +ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]""" + +combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A""" + +digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | 
[#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]""" + +extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]""" + +letter = " | ".join([baseChar, ideographic]) + +#Without the +name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter, + extender]) +nameFirst = " | ".join([letter, "_"]) + +reChar = re.compile(r"#x([\d|A-F]{4,4})") +reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]") + +def charStringToList(chars): + charRanges = [item.strip() for item in chars.split(" | ")] + rv = [] + for item in charRanges: + foundMatch = False + for regexp in (reChar, reCharRange): + match = regexp.match(item) + if match is not None: + rv.append([hexToInt(item) for item in match.groups()]) + if len(rv[-1]) == 1: + rv[-1] = rv[-1]*2 + foundMatch = True + break + if not foundMatch: + assert len(item) == 1 + + rv.append([ord(item)] * 2) + rv = normaliseCharList(rv) + return rv + +def normaliseCharList(charList): + charList = sorted(charList) + for item in charList: + assert item[1] >= item[0] + rv = [] + i = 0 + while i < len(charList): + j = 1 + rv.append(charList[i]) + while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1: + rv[-1][1] = charList[i+j][1] + j += 1 + i += j + return rv + +#We don't really support characters above the BMP :( +max_unicode = int("FFFF", 16) + +def missingRanges(charList): + rv = [] + if charList[0] != 0: + rv.append([0, charList[0][0] - 1]) + for i, item in enumerate(charList[:-1]): + rv.append([item[1]+1, charList[i+1][0] - 1]) + if charList[-1][1] != max_unicode: + rv.append([charList[-1][1] + 1, max_unicode]) + return rv + +def listToRegexpStr(charList): + rv = [] + for item in charList: + if item[0] == item[1]: + rv.append(escapeRegexp(unichr(item[0]))) + else: + 
rv.append(escapeRegexp(unichr(item[0])) + "-" + + escapeRegexp(unichr(item[1]))) + return "[%s]"%"".join(rv) + +def hexToInt(hex_str): + return int(hex_str, 16) + +def escapeRegexp(string): + specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}", + "[", "]", "|", "(", ")", "-") + for char in specialCharacters: + string = string.replace(char, "\\" + char) + if char in string: + print string + + return string + +#output from the above +nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84
\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') + +nonXmlNameFirstBMPRegexp = 
re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3
104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') + +class InfosetFilter(object): + replacementRegexp = re.compile(r"U[\dA-F]{5,5}") + def __init__(self, replaceChars = None, + dropXmlnsLocalName = False, + dropXmlnsAttrNs = False, + preventDoubleDashComments = False, + preventDashAtCommentEnd = False, + replaceFormFeedCharacters = True): + + self.dropXmlnsLocalName = dropXmlnsLocalName + self.dropXmlnsAttrNs = dropXmlnsAttrNs + + self.preventDoubleDashComments = preventDoubleDashComments + self.preventDashAtCommentEnd = preventDashAtCommentEnd + + self.replaceFormFeedCharacters = replaceFormFeedCharacters + + self.replaceCache = {} + + def coerceAttribute(self, name, namespace=None): + if self.dropXmlnsLocalName and name.startswith("xmlns:"): + #Need a datalosswarning here + return None + elif (self.dropXmlnsAttrNs and + namespace == "http://www.w3.org/2000/xmlns/"): + return None + else: + return self.toXmlName(name) + + def coerceElement(self, name, namespace=None): + return self.toXmlName(name) + + def coerceComment(self, data): + if self.preventDoubleDashComments: + while "--" in data: + data = data.replace("--", "- -") + return data + + def coerceCharacters(self, data): + if self.replaceFormFeedCharacters: + data = data.replace("\x0C", " ") + #Other non-xml characters + return data + + def toXmlName(self, name): + nameFirst = name[0] + nameRest = name[1:] + m = nonXmlNameFirstBMPRegexp.match(nameFirst) + if m: + nameFirstOutput = self.getReplacementCharacter(nameFirst) + else: + nameFirstOutput = nameFirst + + nameRestOutput = nameRest + replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest)) + for char in replaceChars: + replacement = self.getReplacementCharacter(char) + nameRestOutput = nameRestOutput.replace(char, replacement) + return nameFirstOutput + nameRestOutput + + def getReplacementCharacter(self, char): + if char in self.replaceCache: + replacement = self.replaceCache[char] + else: + replacement = self.escapeChar(char) + return replacement + + def 
fromXmlName(self, name): + for item in set(self.replacementRegexp.findall(name)): + name = name.replace(item, self.unescapeChar(item)) + return name + + def escapeChar(self, char): + replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0") + self.replaceCache[char] = replacement + return replacement + + def unescapeChar(self, charcode): + return unichr(int(charcode[1:], 16)) diff --git a/src/html5lib/inputstream.py b/src/html5lib/inputstream.py new file mode 100644 index 0000000000..8e2a3896ce --- /dev/null +++ b/src/html5lib/inputstream.py @@ -0,0 +1,794 @@ +import codecs +import re +import types +import sys + +from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase +from constants import encodings, ReparseException + +#Non-unicode versions of constants for use in the pre-parser +spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters]) +asciiLettersBytes = frozenset([str(item) for item in asciiLetters]) +asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase]) +spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"]) + +invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, + 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, + 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, + 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, + 0x10FFFE, 0x10FFFF]) + +ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]") + +# 
class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset within that chunk

    def tell(self):
        """Return the current position as an absolute offset from the start
        of the stream."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Seek to absolute offset *pos*; only data that has already been
        buffered may be sought to (at most the buffered length)."""
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        # BUG FIX: the original decremented ``offset -= pos`` (the seek
        # target) instead of the length of the chunk being skipped, so any
        # seek past the first chunk landed at the wrong offset.  Fixed
        # upstream in html5lib.  The ``i < len(self.buffer)`` guard avoids an
        # IndexError when seeking to 0 on an empty buffer.
        while i < len(self.buffer) and len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Read up to *bytes* characters, serving from the buffer first and
        falling back to the underlying stream."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffered data.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Total number of characters currently held in the buffer."""
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read a new chunk from the wrapped stream, append it to the buffer
        and advance the position to its end."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream if
        the buffer is exhausted before *bytes* characters are produced."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # Request is satisfied inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Consume the rest of this chunk and move to the next one.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            # (The original assigned this append() to an unused variable.)
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
class HTMLInputStream:
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    """

    # Characters are decoded and normalised in chunks of this many code units.
    _defaultChunkSize = 10240

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        """

        #Craziness
        # Detect wide (UCS4) vs narrow (UCS2) unicode build: on a narrow
        # build a non-BMP character literal has length 2 (a surrogate pair).
        if len(u"\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = (codecName(encoding), "certain")

        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        # Encoding Information
        #Number of bytes to use when looking for a meta element with
        #encoding information
        self.numBytesMeta = 512
        #Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        #Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        #Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)


        self.reset()

    def reset(self):
        """(Re)create the decoding reader and clear all chunk/position state.
        Called on construction and whenever the encoding changes."""
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                'replace')

        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        #Flag to indicate we may have a CR LF broken across a data chunk
        self._lastChunkEndsWithCR = False

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            # Otherwise treat source as a string and convert to a file object
            if isinstance(source, unicode):
                source = source.encode('utf-8')
                self.charEncoding = ("utf-8", "certain")
            import cStringIO
            stream = cStringIO.StringIO(str(source))

        # Encoding detection needs seek/tell; wrap unseekable streams.
        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
            stream is sys.stdin):
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Determine (encoding, confidence) by trying, in order: BOM,
        <meta> prescan, chardet guess, then the default encoding."""
        #First look for a BOM
        #This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        #If there is no BOM need to look for meta elements with encoding
        #information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        #Guess with chardet, if avaliable
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence="tentative"
            encoding = self.defaultEncoding

        #Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1":"windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to *newEncoding* (e.g. after a late <meta>), rewinding the
        raw stream and signalling the parser to start over."""
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.reset()
            self.charEncoding = (newEncoding, "certain")
            # NOTE(review): charEncoding has already been overwritten above,
            # so both %s placeholders render the *new* encoding -- the "from"
            # part of this message looks wrong; confirm against upstream.
            raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)

    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2]) # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        # Prescan only the first numBytesMeta bytes, then rewind.
        buffer = self.rawStream.read(self.numBytesMeta)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        # A UTF-16 meta declaration cannot be right (we already decoded the
        # prescan bytes as ASCII-compatible), so fall back to UTF-8.
        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding

    def _position(self, offset):
        """Return (line, column) of *offset* within the current chunk,
        counting from the totals accumulated for previous chunks."""
        chunk = self.chunk
        nLines = chunk.count(u'\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind(u'\n', 0, offset)
        if lastLinePos == -1:
            # Still on the line that started in a previous chunk.
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line+1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        """Decode the next chunk, record character errors, normalise NUL and
        line endings.  Returns False at EOF, True otherwise."""
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        # Roll the line/column totals forward over the chunk being replaced.
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        if not data:
            return False

        self.reportCharacterErrors(data)

        data = data.replace(u"\u0000", u"\ufffd")
        #Check for CR LF broken across chunks
        if (self._lastChunkEndsWithCR and data[0] == u"\n"):
            data = data[1:]
            # Stop if the chunk is now empty
            if not data:
                return False
        self._lastChunkEndsWithCR = data[-1] == u"\r"
        data = data.replace(u"\r\n", u"\n")
        data = data.replace(u"\r", u"\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        """Record parse errors for NULs and invalid codepoints (wide build:
        the regexp can match non-BMP characters directly)."""
        for i in xrange(data.count(u"\u0000")):
            self.errors.append("null-character")
        for i in xrange(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        """As characterErrorsUCS4, but on a narrow build where non-BMP
        characters appear as surrogate pairs that must be recombined."""
        #Someone picked the wrong compile option
        #You lose
        for i in xrange(data.count(u"\u0000")):
            self.errors.append("null-character")
        skip = False
        import sys
        for match in invalid_unicode_re.finditer(data):
            if skip:
                # NOTE(review): skip is never reset to False on this path, so
                # after one surrogate pair every later match in this chunk is
                # skipped too -- looks like a bug; confirm against upstream.
                continue
            codepoint = ord(match.group())
            pos = match.start()
            #Pretty sure there should be endianness issues here
            if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
                pos < len(data) - 1 and
                ord(data[pos + 1]) >= 0xDC00 and
                ord(data[pos + 1]) <= 0xDFFF):
                #We have a surrogate pair!
                #From a perl manpage
                char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
                            (ord(data[pos + 1]) - 0xDC00))
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                # Lone surrogate at the end of the chunk.
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")
        #This is still wrong if it is possible for a surrogate pair to break a
        #chunk boundary

    def charsUntil(self, characters, opposite = False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
            if not opposite:
                # "stop at" semantics: negate the character class.
                regex = u"^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = u"".join(rv)
        return r

    def charsUntilEOF(self):
        """ Returns a string of characters from the stream up to EOF."""

        rv = []

        while True:
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = u"".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget

        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
class EncodingBytes(str):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    # The input is lower-cased on construction so the meta prescan can do
    # case-insensitive matching with plain comparisons.
    def __new__(self, value):
        return str.__new__(self, value.lower())

    def __init__(self, value):
        # -1 means "before the first byte"; next() must be called first.
        self._position=-1

    def __iter__(self):
        return self

    def next(self):
        """Advance one byte; raises StopIteration at end of data."""
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p]

    def previous(self):
        """Step back one byte and return the byte at the *old* position."""
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            # Position -1 (before the start) is reported as None.
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position               # use property for the error-checking
        while p < len(self):
            c = self[p]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        """Advance to the first byte in *chars*; None if none is found."""
        p = self.position
        while p < len(self):
            c = self[p]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string.  If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p+len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes)-1)
            return True
        else:
            # Signal "not found / out of data" the same way iteration does.
            raise StopIteration
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        """Scan the prescan bytes; returns the detected encoding name or
        None.  Handlers return False (or raise StopIteration) to stop."""
        methodDispatch = (
            ("<!--",self.handleComment),
            ("<meta",self.handleMeta),
            ("</",self.handlePossibleEndTag),
            ("<!",self.handleOther),
            ("<?",self.handleOther),
            ("<",self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        # Ran off the end of the prescan buffer.
                        keepParsing=False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo("-->")

    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            #if we have <meta not followed by a space so just keep going
            return True
        #We have a valid meta element we want to search for attributes
        while True:
            #Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == "charset":
                    # <meta charset="..."> form
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == "content":
                    # <meta http-equiv content="...; charset=..."> form
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False

    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        self.data.next()
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            #If the next byte is not an ascii letter either ignore this
            #fragment (possible start tag case) or treat it according to
            #handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == "<":
            #return to the first step in the overall "two step" algorithm
            #reprocessing the < byte
            data.previous()
        else:
            #Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        # Step numbers below follow the HTML5 "get an attribute" algorithm.
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset("/"))
        # Step 2
        if c in (">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        #Step 4 attribute name
        while True:
            if c == "=" and attrName:
                break
            elif c in spaceCharactersBytes:
                #Step 6!
                c = data.skip()
                c = data.next()
                break
            elif c in ("/", ">"):
                return "".join(attrName), ""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c == None:
                return None
            else:
                attrName.append(c)
            #Step 5
            c = data.next()
        #Step 7
        if c != "=":
            data.previous()
            return "".join(attrName), ""
        #Step 8
        data.next()
        #Step 9
        c = data.skip()
        #Step 10
        if c in ("'", '"'):
            #10.1
            quoteChar = c
            while True:
                #10.2
                c = data.next()
                #10.3
                if c == quoteChar:
                    data.next()
                    return "".join(attrName), "".join(attrValue)
                #10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                #10.5
                else:
                    attrValue.append(c)
        elif c == ">":
            return "".join(attrName), ""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = data.next()
            if c in spacesAngleBrackets:
                return "".join(attrName), "".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
class ContentAttrParser(object):
    """Extract the charset from a <meta> ``content`` attribute value,
    e.g. ``text/html; charset=utf-8``."""
    def __init__(self, data):
        # data is an EncodingBytes instance.
        self.data = data
    def parse(self):
        """Return the declared charset string, or None if absent/malformed."""
        try:
            #Check if the attr name is charset
            #otherwise return
            self.data.jumpTo("charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == "=":
                #If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            #Look for an encoding between matching quote marks
            if self.data.currentByte in ('"', "'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                #Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    #Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            # Ran out of data anywhere above: no usable charset declaration.
            return None


def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    # Strip ASCII punctuation/whitespace and lower-case before looking the
    # name up in the canonical encodings table from constants.py.
    if (encoding is not None and type(encoding) in types.StringTypes):
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
        return None
class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    # --- Whitelists.  Subclasses may override the allowed_* aliases below. ---

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
        'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
        'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
        'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
        'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
        'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
        'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
        'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
        'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
        'origin', 'overline-position', 'overline-thickness', 'panose-1',
        'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
        'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
        'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
        'stemh', 'stemv', 'stop-color', 'stop-opacity',
        'strikethrough-position', 'strikethrough-thickness', 'stroke',
        'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
        'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
        'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']

    # Attributes whose values are URIs and must be protocol-checked.
    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
        'xlink:href', 'xml:base']

    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
        'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']

    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
        'radialGradient', 'textpath', 'tref', 'set', 'use']

    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
        'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
        'ssh', 'sftp', 'rtsp', 'afs' ]

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff()&lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Filter one tokenizer token.  Allowed tags have their attributes
        whitelisted and URI/style values scrubbed; disallowed tags are
        re-emitted as escaped Characters text.  Comment tokens fall through
        the final branch and return None, i.e. comments are dropped."""
        if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                             tokenTypes["EmptyTag"]):
            if token["name"] in self.allowed_elements:
                if token.has_key("data"):
                    # Reversed iteration so that for duplicate attribute
                    # names the first occurrence wins in the dict.
                    attrs = dict([(name,val) for name,val in
                                  token["data"][::-1]
                                  if name in self.allowed_attributes])
                    for attr in self.attr_val_is_uri:
                        if not attrs.has_key(attr):
                            continue
                        # Normalise away control chars and entities before
                        # checking the URI scheme against the whitelist.
                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                               unescape(attrs[attr])).lower()
                        #remove replacement characters from unescaped characters
                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
                            (val_unescaped.split(':')[0] not in
                             self.allowed_protocols)):
                            del attrs[attr]
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            # Strip url(...) references to external resources.
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    if (token["name"] in self.svg_allow_local_href and
                        'xlink:href' in attrs and re.search('^\s*[^#\s].*',
                                                            attrs['xlink:href'])):
                        # Only same-document (#fragment) hrefs are allowed.
                        del attrs['xlink:href']
                    if attrs.has_key('style'):
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token["data"] = [[name,val] for name,val in attrs.items()]
                return token
            else:
                # Disallowed element: serialise the tag itself as plain text.
                if token["type"] == tokenTypes["EndTag"]:
                    token["data"] = "</%s>" % token["name"]
                elif token["data"]:
                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                    token["data"] = "<%s%s>" % (token["name"],attrs)
                else:
                    token["data"] = "<%s>" % token["name"]
                if token["selfClosing"]:
                    token["data"]=token["data"][:-1] + "/>"
                token["type"] = tokenTypes["Characters"]
                del token["name"]
                return token
        elif token["type"] == tokenTypes["Comment"]:
            # Implicitly returns None: comments are stripped from the output.
            pass
        else:
            return token

    def sanitize_css(self, style):
        """Return *style* reduced to whitelisted CSS properties/keywords;
        returns '' outright if the declaration fails the safety gauntlet."""
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value: continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin',
                                                'padding']:
                # Shorthand properties: every keyword must be acceptable.
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    """Serialize a parsed tree to markup text.

    input -- the tree to serialize; tree -- the treewalker name matching the
    tree's builder ("simpletree" by default); format -- "html" or "xhtml";
    encoding -- if given, the serializer encodes its output.  Any extra
    keyword options are passed to the serializer's constructor.

    Raises ValueError for an unknown format.
    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
        raise ValueError, "type must be either html or xhtml"
    return s.render(walker(input), encoding)
+ encode_entity_map[v] = k + + def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + for c in exc.object[exc.start:exc.end]: + e = encode_entity_map.get(c) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append(c.encode(exc.encoding, "xmlcharrefreplace")) + return (u"".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) + + register_error(unicode_encode_errors, htmlentityreplace_errors) + + del register_error + +def encode(text, encoding): + return text.encode(encoding, unicode_encode_errors) + +class HTMLSerializer(object): + + quote_attr_values = False + quote_char = '"' + use_best_quote_char = True + minimize_boolean_attributes = True + + use_trailing_solidus = False + space_before_trailing_solidus = True + escape_lt_in_attrs = False + escape_rcdata = False + + inject_meta_charset = True + strip_whitespace = False + sanitize = False + omit_optional_tags = True + + options = ("quote_attr_values", "quote_char", "use_best_quote_char", + "minimize_boolean_attributes", "use_trailing_solidus", + "space_before_trailing_solidus", "omit_optional_tags", + "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", + "escape_rcdata", 'use_trailing_solidus', "sanitize") + + def __init__(self, **kwargs): + if kwargs.has_key('quote_char'): + self.use_best_quote_char = False + for attr in self.options: + setattr(self, attr, kwargs.get(attr, getattr(self, attr))) + self.errors = [] + self.strict = False + + def serialize(self, treewalker, encoding=None): + in_cdata = False + self.errors = [] + if encoding and self.inject_meta_charset: + from html5lib.filters.inject_meta_charset import Filter + treewalker = Filter(treewalker, encoding) + # XXX: WhitespaceFilter should be used before OptionalTagFilter + # for maximum efficiently of this latter filter + if self.strip_whitespace: + from html5lib.filters.whitespace import Filter + treewalker = 
Filter(treewalker) + if self.sanitize: + from html5lib.filters.sanitizer import Filter + treewalker = Filter(treewalker) + if self.omit_optional_tags: + from html5lib.filters.optionaltags import Filter + treewalker = Filter(treewalker) + for token in treewalker: + type = token["type"] + if type == "Doctype": + doctype = u"<!DOCTYPE %s" % token["name"] + + if token["publicId"]: + doctype += u' PUBLIC "%s"' % token["publicId"] + elif token["systemId"]: + doctype += u" SYSTEM" + if token["systemId"]: + if token["systemId"].find(u'"') >= 0: + if token["systemId"].find(u"'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = u"'" + else: + quote_char = u'"' + doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char) + + doctype += u">" + + if encoding: + yield doctype.encode(encoding) + else: + yield doctype + + elif type in ("Characters", "SpaceCharacters"): + if type == "SpaceCharacters" or in_cdata: + if in_cdata and token["data"].find("</") >= 0: + self.serializeError(_("Unexpected </ in CDATA")) + if encoding: + yield token["data"].encode(encoding, "strict") + else: + yield token["data"] + elif encoding: + yield encode(escape(token["data"]), encoding) + else: + yield escape(token["data"]) + + elif type in ("StartTag", "EmptyTag"): + name = token["name"] + if name in rcdataElements and not self.escape_rcdata: + in_cdata = True + elif in_cdata: + self.serializeError(_("Unexpected child element of a CDATA element")) + attrs = token["data"] + if hasattr(attrs, "items"): + attrs = attrs.items() + attrs.sort() + attributes = [] + for k,v in attrs: + if encoding: + k = k.encode(encoding, "strict") + attributes.append(' ') + + attributes.append(k) + if not self.minimize_boolean_attributes or \ + (k not in booleanAttributes.get(name, tuple()) \ + and k not in booleanAttributes.get("", tuple())): + attributes.append("=") + if self.quote_attr_values or not v: + quote_attr = True + else: + quote_attr = 
reduce(lambda x,y: x or (y in v), + spaceCharacters + ">\"'=", False) + v = v.replace("&", "&") + if self.escape_lt_in_attrs: v = v.replace("<", "<") + if encoding: + v = encode(v, encoding) + if quote_attr: + quote_char = self.quote_char + if self.use_best_quote_char: + if "'" in v and '"' not in v: + quote_char = '"' + elif '"' in v and "'" not in v: + quote_char = "'" + if quote_char == "'": + v = v.replace("'", "'") + else: + v = v.replace('"', """) + attributes.append(quote_char) + attributes.append(v) + attributes.append(quote_char) + else: + attributes.append(v) + if name in voidElements and self.use_trailing_solidus: + if self.space_before_trailing_solidus: + attributes.append(" /") + else: + attributes.append("/") + if encoding: + yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes)) + else: + yield u"<%s%s>" % (name, u"".join(attributes)) + + elif type == "EndTag": + name = token["name"] + if name in rcdataElements: + in_cdata = False + elif in_cdata: + self.serializeError(_("Unexpected child element of a CDATA element")) + end_tag = u"</%s>" % name + if encoding: + end_tag = end_tag.encode(encoding, "strict") + yield end_tag + + elif type == "Comment": + data = token["data"] + if data.find("--") >= 0: + self.serializeError(_("Comment contains --")) + comment = u"<!--%s-->" % token["data"] + if encoding: + comment = comment.encode(encoding, unicode_encode_errors) + yield comment + + else: + self.serializeError(token["data"]) + + def render(self, treewalker, encoding=None): + if encoding: + return "".join(list(self.serialize(treewalker, encoding))) + else: + return u"".join(list(self.serialize(treewalker))) + + def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): + # XXX The idea is to make data mandatory. 
+ self.errors.append(data) + if self.strict: + raise SerializeError + +def SerializeError(Exception): + """Error in serialized tree""" + pass diff --git a/src/html5lib/serializer/xhtmlserializer.py b/src/html5lib/serializer/xhtmlserializer.py new file mode 100644 index 0000000000..7fdce47ba9 --- /dev/null +++ b/src/html5lib/serializer/xhtmlserializer.py @@ -0,0 +1,9 @@ +from htmlserializer import HTMLSerializer + +class XHTMLSerializer(HTMLSerializer): + quote_attr_values = True + minimize_boolean_attributes = False + use_trailing_solidus = True + escape_lt_in_attrs = True + omit_optional_tags = False + escape_rcdata = True diff --git a/src/html5lib/tokenizer.py b/src/html5lib/tokenizer.py new file mode 100644 index 0000000000..d7c4b5f6c0 --- /dev/null +++ b/src/html5lib/tokenizer.py @@ -0,0 +1,1586 @@ +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset +try: + from collections import deque +except ImportError: + from utils import deque + +from constants import spaceCharacters +from constants import entitiesWindows1252, entities +from constants import asciiLowercase, asciiLetters, asciiUpper2Lower +from constants import digits, hexDigits, EOF +from constants import tokenTypes, tagTokenTypes +from constants import replacementCharacters + +from inputstream import HTMLInputStream + +# Group entities by their first character, for faster lookups +entitiesByFirstChar = {} +for e in entities: + entitiesByFirstChar.setdefault(e[0], []).append(e) + +class HTMLTokenizer: + """ This class takes care of tokenizing HTML. + + * self.currentToken + Holds the token that is currently being processed. + + * self.state + Holds a reference to the method to be invoked... XXX + + * self.stream + Points to HTMLInputStream object. 
+ """ + + # XXX need to fix documentation + + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=True, lowercaseAttrName=True): + + self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) + + #Perform case conversions? + self.lowercaseElementName = lowercaseElementName + self.lowercaseAttrName = lowercaseAttrName + + # Setup the initial tokenizer state + self.escapeFlag = False + self.lastFourChars = [] + self.state = self.dataState + self.escape = False + + # The current token being created + self.currentToken = None + + def __iter__(self): + """ This is where the magic happens. + + We do our usually processing through the states and when we have a token + to return we yield the token which pauses processing until the next token + is requested. + """ + self.tokenQueue = deque([]) + # Start processing. When EOF is reached self.state will return False + # instead of True and the loop will terminate. + while self.state(): + while self.stream.errors: + yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} + while self.tokenQueue: + yield self.tokenQueue.popleft() + + def consumeNumberEntity(self, isHex): + """This function returns either U+FFFD or the character based on the + decimal or hexadecimal representation. It also discards ";" if present. + If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. + """ + + allowed = digits + radix = 10 + if isHex: + allowed = hexDigits + radix = 16 + + charStack = [] + + # Consume all the characters that are in range while making sure we + # don't hit an EOF. + c = self.stream.char() + while c in allowed and c is not EOF: + charStack.append(c) + c = self.stream.char() + + # Convert the set of characters consumed to an int. 
+ charAsInt = int("".join(charStack), radix) + + # Certain characters get replaced with others + if charAsInt in replacementCharacters: + char = replacementCharacters[charAsInt] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + elif ((0xD800 <= charAsInt <= 0xDFFF) or + (charAsInt > 0x10FFFF)): + char = u"\uFFFD" + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + else: + #Should speed up this check somehow (e.g. move the set to a constant) + if ((0x0001 <= charAsInt <= 0x0008) or + (0x000E <= charAsInt <= 0x001F) or + (0x007F <= charAsInt <= 0x009F) or + (0xFDD0 <= charAsInt <= 0xFDEF) or + charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, + 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, + 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, + 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, + 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, + 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, + 0xFFFFF, 0x10FFFE, 0x10FFFF])): + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + try: + # Try/except needed as UCS-2 Python builds' unichar only works + # within the BMP. + char = unichr(charAsInt) + except ValueError: + char = eval("u'\\U%08x'" % charAsInt) + + # Discard the ; if present. Otherwise, put it back on the queue and + # invoke parseError on parser. 
+ if c != u";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "numeric-entity-without-semicolon"}) + self.stream.unget(c) + + return char + + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # Initialise to the default output for when no entity is matched + output = u"&" + + charStack = [self.stream.char()] + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") + or (allowedChar is not None and allowedChar == charStack[0])): + self.stream.unget(charStack[0]) + + elif charStack[0] == u"#": + # Read the next character to see if it's hex or decimal + hex = False + charStack.append(self.stream.char()) + if charStack[-1] in (u"x", u"X"): + hex = True + charStack.append(self.stream.char()) + + # charStack[-1] should be the first digit + if (hex and charStack[-1] in hexDigits) \ + or (not hex and charStack[-1] in digits): + # At least one digit found, so consume the whole number + self.stream.unget(charStack[-1]) + output = self.consumeNumberEntity(hex) + else: + # No digits found + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "expected-numeric-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + else: + # At this point in the process might have named entity. Entities + # are stored in the global variable "entities". + # + # Consume characters and compare to these to a substring of the + # entity names in the list until the substring no longer matches. + filteredEntityList = entitiesByFirstChar.get(charStack[0], []) + + def entitiesStartingWith(name): + return [e for e in filteredEntityList if e.startswith(name)] + + while charStack[-1] is not EOF and\ + entitiesStartingWith("".join(charStack)): + charStack.append(self.stream.char()) + + # At this point we have a string that starts with some characters + # that may match an entity + entityName = None + + # Try to find the longest entity the string will match to take care + # of ¬i for instance. 
+ for entityLength in xrange(len(charStack)-1, 1, -1): + possibleEntityName = "".join(charStack[:entityLength]) + if possibleEntityName in entities: + entityName = possibleEntityName + break + + if entityName is not None: + if entityName[-1] != ";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "named-entity-without-semicolon"}) + if entityName[-1] != ";" and fromAttribute and \ + (charStack[entityLength] in asciiLetters + or charStack[entityLength] in digits): + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + else: + output = entities[entityName] + self.stream.unget(charStack.pop()) + output += u"".join(charStack[entityLength:]) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-named-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + if fromAttribute: + self.currentToken["data"][-1][1] += output + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output}) + + def processEntityInAttribute(self, allowedChar): + """This method replaces the need for "entityInAttributeValueState". + """ + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) + + def emitCurrentToken(self): + """This method is a generic handler for emitting the tags. It also sets + the state to "data" because that's what's needed after a token has been + emitted. 
+ """ + token = self.currentToken + # Add token to the queue to be yielded + if (token["type"] in tagTokenTypes): + if self.lowercaseElementName: + token["name"] = token["name"].translate(asciiUpper2Lower) + if token["type"] == tokenTypes["EndTag"]: + if token["data"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"attributes-in-end-tag"}) + if token["selfClosing"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"self-closing-flag-on-end-tag"}) + self.tokenQueue.append(token) + self.state = self.dataState + + + # Below are the various tokenizer states worked out. + + def dataState(self): + data = self.stream.char() + if data == "&": + self.state = self.entityDataState + elif data == "<": + self.state = self.tagOpenState + elif data is EOF: + # Tokenization ends. + return False + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any <!-- or --> sequences + else: + chars = self.stream.charsUntil((u"&", u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def entityDataState(self): + self.consumeEntity() + self.state = self.dataState + return True + + def rcdataState(self): + data = self.stream.char() + if data == "&": + self.state = self.characterReferenceInRcdata + elif data == "<": + self.state = self.rcdataLessThanSignState + elif data == EOF: + # Tokenization ends. + return False + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". 
At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any <!-- or --> sequences + else: + chars = self.stream.charsUntil((u"&", u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def characterReferenceInRcdata(self): + self.consumeEntity() + self.state = self.rcdataState + return True + + def rawtextState(self): + data = self.stream.char() + if data == "<": + self.state = self.rawtextLessThanSignState + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil((u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def scriptDataState(self): + data = self.stream.char() + if data == "<": + self.state = self.scriptDataLessThanSignState + elif data == EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil((u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def plaintextState(self): + data = self.stream.char() + if data == EOF: + # Tokenization ends. 
+ return False + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + self.stream.charsUntilEOF()}) + return True + + def tagOpenState(self): + data = self.stream.char() + if data == u"!": + self.state = self.markupDeclarationOpenState + elif data == u"/": + self.state = self.closeTagOpenState + elif data in asciiLetters: + self.currentToken = {"type": tokenTypes["StartTag"], + "name": data, "data": [], + "selfClosing": False, + "selfClosingAcknowledged": False} + self.state = self.tagNameState + elif data == u">": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-right-bracket"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) + self.state = self.dataState + elif data == u"?": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-question-mark"}) + self.stream.unget(data) + self.state = self.bogusCommentState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.dataState + return True + + def closeTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + "data": [], "selfClosing":False} + self.state = self.tagNameState + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-right-bracket"}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-eof"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + 
self.state = self.dataState + else: + # XXX data can be _'_... + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.bogusCommentState + return True + + def tagNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u">": + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-tag-name"}) + self.state = self.dataState + elif data == u"/": + self.state = self.selfClosingStartTagState + else: + self.currentToken["name"] += data + # (Don't use charsUntil here, because tag names are + # very short and it's faster to not do anything fancy) + return True + + def rcdataLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.rcdataEndTagOpenState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.rcdataState + return True + + def rcdataEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + self.state = self.rcdataEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + self.stream.unget(data) + self.state = self.rcdataState + return True + + def rcdataEndTagNameState(self): + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + data = self.stream.char() + if data in spaceCharacters and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.beforeAttributeNameState + elif data == "/" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} 
+ self.state = self.selfClosingStartTagState + elif data == ">" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"</" + self.temporaryBuffer}) + self.stream.unget(data) + self.state = self.rcdataState + return True + + def rawtextLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.rawtextEndTagOpenState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.rawtextState + return True + + def rawtextEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + self.state = self.rawtextEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + self.stream.unget(data) + self.state = self.rawtextState + return True + + def rawtextEndTagNameState(self): + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + data = self.stream.char() + if data in spaceCharacters and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.beforeAttributeNameState + elif data == "/" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.selfClosingStartTagState + elif data == ">" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + 
self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"</" + self.temporaryBuffer}) + self.stream.unget(data) + self.state = self.rawtextState + return True + + def scriptDataLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.scriptDataEndTagOpenState + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<!"}) + self.state = self.scriptDataEscapeStartState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.scriptDataState + return True + + def scriptDataEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer += data + self.state = self.scriptDataEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + self.stream.unget(data) + self.state = self.scriptDataState + return True + + def scriptDataEndTagNameState(self): + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + data = self.stream.char() + if data in spaceCharacters and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.beforeAttributeNameState + elif data == "/" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.selfClosingStartTagState + elif data == ">" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"</" + self.temporaryBuffer}) + self.stream.unget(data) + self.state = self.scriptDataState + 
return True + + def scriptDataEscapeStartState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataEscapeStartDashState + else: + self.stream.unget(data) + self.state = self.scriptDataState + return True + + def scriptDataEscapeStartDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataEscapedDashDashState + else: + self.stream.unget(data) + self.state = self.scriptDataState + return True + + def scriptDataEscapedState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataEscapedDashState + elif data == "<": + self.state = self.scriptDataEscapedLessThanSignState + elif data == EOF: + self.state = self.dataState + else: + chars = self.stream.charsUntil((u"<-")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def scriptDataEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataEscapedDashDashState + elif data == "<": + self.state = self.scriptDataEscapedLessThanSignState + elif data == EOF: + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedDashDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + elif data == "<": + self.state = self.scriptDataEscapedLessThanSignState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"}) + self.state = self.scriptDataState + elif data == EOF: + self.state = self.dataState + 
else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.temporaryBuffer = "" + self.state = self.scriptDataEscapedEndTagOpenState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data}) + self.temporaryBuffer = data + self.state = self.scriptDataDoubleEscapeStartState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedEndTagOpenState(self): + data = self.stream.char() + if data in asciiLetters: + self.temporaryBuffer = data + self.state = self.scriptDataEscapedEndTagNameState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataEscapedEndTagNameState(self): + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() + data = self.stream.char() + if data in spaceCharacters and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.beforeAttributeNameState + elif data == "/" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.state = self.selfClosingStartTagState + elif data == ">" and appropriate: + self.currentToken = {"type": tokenTypes["EndTag"], + "name": self.temporaryBuffer, + "data": [], "selfClosing":False} + self.emitCurrentToken() + self.state = self.dataState + elif data in asciiLetters: + self.temporaryBuffer += data + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data": u"</" + 
self.temporaryBuffer}) + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataDoubleEscapeStartState(self): + data = self.stream.char() + if data in (spaceCharacters | frozenset(("/", ">"))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataDoubleEscapedState + else: + self.state = self.scriptDataEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataEscapedState + return True + + def scriptDataDoubleEscapedState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataDoubleEscapedDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + return True + + def scriptDataDoubleEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + self.state = self.scriptDataDoubleEscapedDashDashState + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def 
scriptDataDoubleEscapedDashState(self): + data = self.stream.char() + if data == "-": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"}) + elif data == "<": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.state = self.scriptDataDoubleEscapedLessThanSignState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"}) + self.state = self.scriptDataState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-script-in-script"}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapedLessThanSignState(self): + data = self.stream.char() + if data == "/": + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"}) + self.temporaryBuffer = "" + self.state = self.scriptDataDoubleEscapeEndState + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def scriptDataDoubleEscapeEndState(self): + data = self.stream.char() + if data in (spaceCharacters | frozenset(("/", ">"))): + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + if self.temporaryBuffer.lower() == "script": + self.state = self.scriptDataEscapedState + else: + self.state = self.scriptDataDoubleEscapedState + elif data in asciiLetters: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) + self.temporaryBuffer += data + else: + self.stream.unget(data) + self.state = self.scriptDataDoubleEscapedState + return True + + def beforeAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == u">": + self.emitCurrentToken() + elif data 
== u"/": + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-name-but-got-eof"}) + self.state = self.dataState + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def attributeNameState(self): + data = self.stream.char() + leavingThisState = True + emitToken = False + if data == u"=": + self.state = self.beforeAttributeValueState + elif data in asciiLetters: + self.currentToken["data"][-1][0] += data +\ + self.stream.charsUntil(asciiLetters, True) + leavingThisState = False + elif data == u">": + # XXX If we emit here the attributes are converted to a dict + # without being checked and when the code below runs we error + # because data is a dict not a list + emitToken = True + elif data in spaceCharacters: + self.state = self.afterAttributeNameState + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"][-1][0] += data + leavingThisState = False + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-name"}) + self.state = self.dataState + emitToken = True + else: + self.currentToken["data"][-1][0] += data + leavingThisState = False + + if leavingThisState: + # Attributes are not dropped at this stage. That happens when the + # start tag token is emitted so values can still be safely appended + # to attributes, but we do want to report the parse error in time. 
+ if self.lowercaseAttrName: + self.currentToken["data"][-1][0] = ( + self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + for name, value in self.currentToken["data"][:-1]: + if self.currentToken["data"][-1][0] == name: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "duplicate-attribute"}) + break + # XXX Fix for above XXX + if emitToken: + self.emitCurrentToken() + return True + + def afterAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"=": + self.state = self.beforeAttributeValueState + elif data == u">": + self.emitCurrentToken() + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-after-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-end-of-tag-but-got-eof"}) + self.emitCurrentToken() + else: + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + return True + + def beforeAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"\"": + self.state = self.attributeValueDoubleQuotedState + elif data == u"&": + self.state = self.attributeValueUnQuotedState + self.stream.unget(data); + elif data == u"'": + self.state = self.attributeValueSingleQuotedState + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-right-bracket"}) + self.emitCurrentToken() + elif data in (u"=", u"<", u"`"): + self.tokenQueue.append({"type": 
tokenTypes["ParseError"], "data": + "equals-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-eof"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + return True + + def attributeValueDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterAttributeValueState + elif data == u"&": + self.processEntityInAttribute(u'"') + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-double-quote"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("\"", u"&")) + return True + + def attributeValueSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterAttributeValueState + elif data == u"&": + self.processEntityInAttribute(u"'") + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-single-quote"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("'", u"&")) + return True + + def attributeValueUnQuotedState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u"&": + self.processEntityInAttribute(">") + elif data == u">": + self.emitCurrentToken() + elif data in (u'"', u"'", u"=", u"<", u"`"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-no-quotes"}) + self.emitCurrentToken() + else: + 
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( + frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters) + return True + + def afterAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-EOF-after-attribute-value"}) + self.emitCurrentToken() + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-attribute-value"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def selfClosingStartTagState(self): + data = self.stream.char() + if data == ">": + self.currentToken["selfClosing"] = True + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "unexpected-EOF-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-soldius-in-tag"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def bogusCommentState(self): + # Make a new comment token and give it as value all the characters + # until the first > or EOF (charsUntil checks for EOF automatically) + # and emit it. + self.tokenQueue.append( + {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")}) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. 
+ self.stream.char() + self.state = self.dataState + return True + + def bogusCommentContinuationState(self): + # Like bogusCommentState, but the caller must create the comment token + # and this state just adds more characters to it + self.currentToken["data"] += self.stream.charsUntil(u">") + self.tokenQueue.append(self.currentToken) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. + self.stream.char() + self.state = self.dataState + return True + + def markupDeclarationOpenState(self): + charStack = [self.stream.char()] + if charStack[-1] == u"-": + charStack.append(self.stream.char()) + if charStack[-1] == u"-": + self.currentToken = {"type": tokenTypes["Comment"], "data": u""} + self.state = self.commentStartState + return True + elif charStack[-1] in (u'd', u'D'): + matched = True + for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'), + (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')): + charStack.append(self.stream.char()) + if charStack[-1] not in expected: + matched = False + break + if matched: + self.currentToken = {"type": tokenTypes["Doctype"], + "name": u"", + "publicId": None, "systemId": None, + "correct": True} + self.state = self.doctypeState + return True + + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-dashes-or-doctype"}) + # charStack[:-2] consists of 'safe' characters ('-', 'd', 'o', etc) + # so they can be copied directly into the bogus comment data, and only + # the last character might be '>' or EOF and needs to be ungetted + self.stream.unget(charStack.pop()) + self.currentToken = {"type": tokenTypes["Comment"], + "data": u"".join(charStack)} + self.state = self.bogusCommentContinuationState + return True + + def commentStartState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentStartDashState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + 
self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.stream.charsUntil(u"-") + self.state = self.commentState + return True + + def commentStartDashState(self): + data = self.stream.char() + if data == "-": + self.state = self.commentEndState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-") + self.state = self.commentState + return True + + + def commentState(self): + data = self.stream.char() + if data == u"-": + self.state = self.commentEndDashState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.stream.charsUntil(u"-") + return True + + def commentEndDashState(self): + data = self.stream.char() + if data == u"-": + self.state = self.commentEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += u"-" + data +\ + self.stream.charsUntil(u"-") + # Consume the next character which is either a "-" or an EOF as + # well so if there's a "-" directly after the "-" we go nicely to + # the "comment end state" without emitting a ParseError() there. 
+ self.stream.char() + return True + + def commentEndState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-dash-after-double-dash-in-comment"}) + self.currentToken["data"] += data + elif data in spaceCharacters: + self.currentToken["data"] += "--" + data + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-space-after-double-dash-in-comment"}) + self.state = self.commentEndSpaceState + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-bang-after-double-dash-in-comment"}) + self.state = self.commentEndBangState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-double-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-comment"}) + self.currentToken["data"] += u"--" + data + self.state = self.commentState + return True + + def commentEndBangState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.currentToken["data"] += "--!" + self.state = self.commentEndDashState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += u"--!" 
+ data + self.state = self.commentState + return True + + def commentEndSpaceState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.state = self.commentEndDashState + elif data in spaceCharacters: + self.currentToken["data"] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-space-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.state = self.commentState + return True + + def doctypeState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "need-space-after-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeNameState + return True + + def beforeDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-right-bracket"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] = data + self.state = self.doctypeNameState + return True + + def doctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.currentToken["name"] = 
self.currentToken["name"].translate(asciiUpper2Lower) + self.state = self.afterDoctypeNameState + elif data == u">": + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype-name"}) + self.currentToken["correct"] = False + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["name"] += data + return True + + def afterDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.currentToken["correct"] = False + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + if data in (u"p", u"P"): + matched = True + for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"), + (u"i", u"I"), (u"c", u"C")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypePublicKeywordState + return True + elif data in (u"s", u"S"): + matched = True + for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"), + (u"e", u"E"), (u"m", u"M")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.afterDoctypeSystemKeywordState + return True + + # All the characters read before the current 'data' will be + # [a-zA-Z], so they're garbage in the bogus doctype and can be + # discarded; only the latest character might be '>' or EOF + # and needs to be ungetted + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], 
"data": + "expected-space-or-right-bracket-in-doctype", "datavars": + {"data": data}}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + + return True + + def afterDoctypePublicKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypePublicIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypePublicIdentifierState + return True + + def beforeDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["publicId"] = u"" + self.state = self.doctypePublicIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["publicId"] = u"" + self.state = self.doctypePublicIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypePublicIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = 
self.afterDoctypePublicIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def doctypePublicIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypePublicIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["publicId"] += data + return True + + def afterDoctypePublicIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.betweenDoctypePublicAndSystemIdentifiersState + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data is EOF: + self.tokenQueue.append({"type": 
tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def betweenDoctypePublicAndSystemIdentifiersState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == '"': + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def afterDoctypeSystemKeywordState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeDoctypeSystemIdentifierState + elif data in ("'", '"'): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.stream.unget(data) + self.state = self.beforeDoctypeSystemIdentifierState + return True + + def beforeDoctypeSystemIdentifierState(self): + data = 
self.stream.char() + if data in spaceCharacters: + pass + elif data == "\"": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierDoubleQuotedState + elif data == "'": + self.currentToken["systemId"] = u"" + self.state = self.doctypeSystemIdentifierSingleQuotedState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True + + def doctypeSystemIdentifierDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.afterDoctypeSystemIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def doctypeSystemIdentifierSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.afterDoctypeSystemIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = 
self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["systemId"] += data + return True + + def afterDoctypeSystemIdentifierState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == ">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.state = self.bogusDoctypeState + return True + + def bogusDoctypeState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data is EOF: + # XXX EMIT + self.stream.unget(data) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + pass + return True diff --git a/src/html5lib/tokenizer_old.py b/src/html5lib/tokenizer_old.py new file mode 100644 index 0000000000..2d5262ca2a --- /dev/null +++ b/src/html5lib/tokenizer_old.py @@ -0,0 +1,1212 @@ +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import Set as set + from sets import ImmutableSet as frozenset +try: + from collections import deque +except ImportError: + from utils import deque + +from constants import contentModelFlags, spaceCharacters +from constants import entitiesWindows1252, entities +from constants import asciiLowercase, asciiLetters, asciiUpper2Lower +from constants import digits, hexDigits, EOF +from constants import tokenTypes, tagTokenTypes + +from inputstream import HTMLInputStream + +# Group entities by their first 
character, for faster lookups +entitiesByFirstChar = {} +for e in entities: + entitiesByFirstChar.setdefault(e[0], []).append(e) + +class HTMLTokenizer: + """ This class takes care of tokenizing HTML. + + * self.currentToken + Holds the token that is currently being processed. + + * self.state + Holds a reference to the method to be invoked... XXX + + * self.states + Holds a mapping between states and methods that implement the state. + + * self.stream + Points to HTMLInputStream object. + """ + + # XXX need to fix documentation + + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=True, lowercaseAttrName=True): + self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) + + #Perform case conversions? + self.lowercaseElementName = lowercaseElementName + self.lowercaseAttrName = lowercaseAttrName + + self.states = { + "data":self.dataState, + "entityData":self.entityDataState, + "tagOpen":self.tagOpenState, + "closeTagOpen":self.closeTagOpenState, + "tagName":self.tagNameState, + "beforeAttributeName":self.beforeAttributeNameState, + "attributeName":self.attributeNameState, + "afterAttributeName":self.afterAttributeNameState, + "beforeAttributeValue":self.beforeAttributeValueState, + "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState, + "attributeValueSingleQuoted":self.attributeValueSingleQuotedState, + "attributeValueUnQuoted":self.attributeValueUnQuotedState, + "afterAttributeValue":self.afterAttributeValueState, + "selfClosingStartTag":self.selfClosingStartTagState, + "bogusComment":self.bogusCommentState, + "bogusCommentContinuation":self.bogusCommentContinuationState, + "markupDeclarationOpen":self.markupDeclarationOpenState, + "commentStart":self.commentStartState, + "commentStartDash":self.commentStartDashState, + "comment":self.commentState, + "commentEndDash":self.commentEndDashState, + "commentEnd":self.commentEndState, + "commentEndBang":self.commentEndBangState, + 
"commentEndSpace":self.commentEndSpaceState, + "doctype":self.doctypeState, + "beforeDoctypeName":self.beforeDoctypeNameState, + "doctypeName":self.doctypeNameState, + "afterDoctypeName":self.afterDoctypeNameState, + "beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState, + "doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState, + "doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState, + "afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState, + "beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState, + "doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState, + "doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState, + "afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState, + "bogusDoctype":self.bogusDoctypeState + } + + # Setup the initial tokenizer state + self.contentModelFlag = contentModelFlags["PCDATA"] + self.escapeFlag = False + self.lastFourChars = [] + self.state = self.dataState + self.escape = False + + # The current token being created + self.currentToken = None + + def __iter__(self): + """ This is where the magic happens. + + We do our usual processing through the states and when we have a token + to return we yield the token which pauses processing until the next token + is requested. + """ + self.tokenQueue = deque([]) + # Start processing. When EOF is reached self.state will return False + # instead of True and the loop will terminate. + while self.state(): + while self.stream.errors: + yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} + while self.tokenQueue: + yield self.tokenQueue.popleft() + + def consumeNumberEntity(self, isHex): + """This function returns either U+FFFD or the character based on the + decimal or hexadecimal representation. It also discards ";" if present.
+ If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. + """ + + allowed = digits + radix = 10 + if isHex: + allowed = hexDigits + radix = 16 + + charStack = [] + + # Consume all the characters that are in range while making sure we + # don't hit an EOF. + c = self.stream.char() + while c in allowed and c is not EOF: + charStack.append(c) + c = self.stream.char() + + # Convert the set of characters consumed to an int. + charAsInt = int("".join(charStack), radix) + + if charAsInt == 13: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-cr-newline-entity"}) + charAsInt = 10 + elif 127 < charAsInt < 160: + # If the integer is between 127 and 160 (so 128 and bigger and 159 + # and smaller) we need to do the "windows trick". + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-windows-1252-entity"}) + + charAsInt = entitiesWindows1252[charAsInt - 128] + + # Certain characters get replaced with U+FFFD + if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F) + or (0x007F <= charAsInt <= 0x009F) + or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF) + or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10 + or (0x10FFFF < charAsInt)): + char = u"\uFFFD" + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + else: + try: + # XXX We should have a separate function that does "int" to + # "unicodestring" conversion since this doesn't always work + # according to hsivonen. Also, unichr has a limitation of 65535 + char = unichr(charAsInt) + except: + try: + char = eval("u'\\U%08x'" % charAsInt) + except: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "cant-convert-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + + # Discard the ; if present. 
Otherwise, put it back on the queue and + # invoke parseError on parser. + if c != u";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "numeric-entity-without-semicolon"}) + self.stream.unget(c) + + return char + + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # Initialise to the default output for when no entity is matched + output = u"&" + + charStack = [self.stream.char()] + if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \ + or (allowedChar is not None and allowedChar == charStack[0]): + self.stream.unget(charStack[0]) + + elif charStack[0] == u"#": + # Read the next character to see if it's hex or decimal + hex = False + charStack.append(self.stream.char()) + if charStack[-1] in (u"x", u"X"): + hex = True + charStack.append(self.stream.char()) + + # charStack[-1] should be the first digit + if (hex and charStack[-1] in hexDigits) \ + or (not hex and charStack[-1] in digits): + # At least one digit found, so consume the whole number + self.stream.unget(charStack[-1]) + output = self.consumeNumberEntity(hex) + else: + # No digits found + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "expected-numeric-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + else: + # At this point in the process might have named entity. Entities + # are stored in the global variable "entities". + # + # Consume characters and compare to these to a substring of the + # entity names in the list until the substring no longer matches. 
+ filteredEntityList = entitiesByFirstChar.get(charStack[0], []) + + def entitiesStartingWith(name): + return [e for e in filteredEntityList if e.startswith(name)] + + while charStack[-1] is not EOF and\ + entitiesStartingWith("".join(charStack)): + charStack.append(self.stream.char()) + + # At this point we have a string that starts with some characters + # that may match an entity + entityName = None + + # Try to find the longest entity the string will match to take care + # of ¬i for instance. + for entityLength in xrange(len(charStack)-1, 1, -1): + possibleEntityName = "".join(charStack[:entityLength]) + if possibleEntityName in entities: + entityName = possibleEntityName + break + + if entityName is not None: + if entityName[-1] != ";": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "named-entity-without-semicolon"}) + if entityName[-1] != ";" and fromAttribute and \ + (charStack[entityLength] in asciiLetters + or charStack[entityLength] in digits): + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + else: + output = entities[entityName] + self.stream.unget(charStack.pop()) + output += u"".join(charStack[entityLength:]) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-named-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + + if fromAttribute: + self.currentToken["data"][-1][1] += output + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output}) + + def processEntityInAttribute(self, allowedChar): + """This method replaces the need for "entityInAttributeValueState". + """ + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) + + def emitCurrentToken(self): + """This method is a generic handler for emitting the tags. It also sets + the state to "data" because that's what's needed after a token has been + emitted. 
+ """ + token = self.currentToken + # Add token to the queue to be yielded + if (token["type"] in tagTokenTypes): + if self.lowercaseElementName: + token["name"] = token["name"].translate(asciiUpper2Lower) + if token["type"] == tokenTypes["EndTag"]: + if token["data"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"attributes-in-end-tag"}) + if token["selfClosing"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"self-closing-flag-on-end-tag"}) + self.tokenQueue.append(token) + self.state = self.dataState + + + # Below are the various tokenizer states worked out. + + def dataState(self): + #XXX - consider splitting this state based on the content model flag + data = self.stream.char() + + # Keep a charbuffer to handle the escapeFlag + if (self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): + if len(self.lastFourChars) == 4: + self.lastFourChars.pop(0) + self.lastFourChars.append(data) + + # The rest of the logic + if (data == "&" and self.contentModelFlag in + (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and + not self.escapeFlag): + self.state = self.states["entityData"] + elif (data == "-" and self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and + not self.escapeFlag and "".join(self.lastFourChars) == "<!--"): + self.escapeFlag = True + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data":data}) + elif (data == "<" and (self.contentModelFlag == + contentModelFlags["PCDATA"] + or (self.contentModelFlag in + (contentModelFlags["CDATA"], + contentModelFlags["RCDATA"]) and + self.escapeFlag == False))): + self.state = self.states["tagOpen"] + elif (data == ">" and self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and + self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"): + self.escapeFlag = False + self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) + elif data is 
EOF: + # Tokenization ends. + return False + elif data in spaceCharacters: + # Directly after emitting a token you switch back to the "data + # state". At that point spaceCharacters are important so they are + # emitted separately. + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": + data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any <!-- or --> sequences + else: + if (self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): + chars = self.stream.charsUntil((u"&", u"<", u">", u"-")) + self.lastFourChars += chars[-4:] + self.lastFourChars = self.lastFourChars[-4:] + else: + chars = self.stream.charsUntil((u"&", u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) + return True + + def entityDataState(self): + self.consumeEntity() + self.state = self.states["data"] + return True + + def tagOpenState(self): + data = self.stream.char() + if self.contentModelFlag == contentModelFlags["PCDATA"]: + if data == u"!": + self.state = self.states["markupDeclarationOpen"] + elif data == u"/": + self.state = self.states["closeTagOpen"] + elif data in asciiLetters: + self.currentToken = {"type": tokenTypes["StartTag"], + "name": data, "data": [], + "selfClosing": False, + "selfClosingAcknowledged": False} + self.state = self.states["tagName"] + elif data == u">": + # XXX In theory it could be something besides a tag name. But + # do we really care? + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-right-bracket"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) + self.state = self.states["data"] + elif data == u"?": + # XXX In theory it could be something besides a tag name. But + # do we really care? 
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-question-mark"}) + self.stream.unget(data) + self.state = self.states["bogusComment"] + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.states["data"] + else: + # We know the content model flag is set to either RCDATA or CDATA + # now because this state can never be entered with the PLAINTEXT + # flag. + if data == u"/": + self.state = self.states["closeTagOpen"] + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) + self.stream.unget(data) + self.state = self.states["data"] + return True + + def closeTagOpenState(self): + if (self.contentModelFlag in (contentModelFlags["RCDATA"], + contentModelFlags["CDATA"])): + + charStack = [] + if self.currentToken: + # So far we know that "</" has been consumed. We now need to know + # whether the next few characters match the name of last emitted + # start tag which also happens to be the currentToken. + matched = True + for expected in self.currentToken["name"].lower(): + charStack.append(self.stream.char()) + if charStack[-1] not in (expected, expected.upper()): + matched = False + break + + # If the tag name prefix matched, we also need to check the + # subsequent character + if matched: + charStack.append(self.stream.char()) + if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))): + self.contentModelFlag = contentModelFlags["PCDATA"] + # Unget the last character, so it can be re-processed + # in the next state + self.stream.unget(charStack.pop()) + # The remaining characters in charStack are the tag name + self.currentToken = {"type": tokenTypes["EndTag"], + "name": u"".join(charStack), + "data": [], + "selfClosing":False} + self.state = self.states["tagName"] + return True + + # Didn't find the end tag. 
The last character in charStack could be + # anything, so it has to be re-processed in the data state + self.stream.unget(charStack.pop()) + + # The remaining characters are a prefix of the tag name, so they're + # just letters and digits, so they can be output as character + # tokens immediately + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)}) + self.state = self.states["data"] + return True + + data = self.stream.char() + if data in asciiLetters: + self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + "data": [], "selfClosing":False} + self.state = self.states["tagName"] + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-right-bracket"}) + self.state = self.states["data"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-eof"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"}) + self.state = self.states["data"] + else: + # XXX data can be _'_... 
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.states["bogusComment"] + return True + + def tagNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.states["beforeAttributeName"] + elif data == u">": + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-tag-name"}) + self.state = self.states["data"] + elif data == u"/": + self.state = self.states["selfClosingStartTag"] + else: + self.currentToken["name"] += data + # (Don't use charsUntil here, because tag names are + # very short and it's faster to not do anything fancy) + return True + + def beforeAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.states["selfClosingStartTag"] + elif data == u"'" or data == u'"' or data == u"=": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-name-but-got-eof"}) + self.state = self.states["data"] + else: + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + return True + + def attributeNameState(self): + data = self.stream.char() + leavingThisState = True + emitToken = False + if data == u"=": + self.state = self.states["beforeAttributeValue"] + elif data in asciiLetters: + self.currentToken["data"][-1][0] += data +\ + self.stream.charsUntil(asciiLetters, 
True) + leavingThisState = False + elif data == u">": + # XXX If we emit here the attributes are converted to a dict + # without being checked and when the code below runs we error + # because data is a dict not a list + emitToken = True + elif data in spaceCharacters: + self.state = self.states["afterAttributeName"] + elif data == u"/": + self.state = self.states["selfClosingStartTag"] + elif data == u"'" or data == u'"': + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"][-1][0] += data + leavingThisState = False + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-name"}) + self.state = self.states["data"] + emitToken = True + else: + self.currentToken["data"][-1][0] += data + leavingThisState = False + + if leavingThisState: + # Attributes are not dropped at this stage. That happens when the + # start tag token is emitted so values can still be safely appended + # to attributes, but we do want to report the parse error in time. 
+ if self.lowercaseAttrName: + self.currentToken["data"][-1][0] = ( + self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + for name, value in self.currentToken["data"][:-1]: + if self.currentToken["data"][-1][0] == name: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "duplicate-attribute"}) + break + # XXX Fix for above XXX + if emitToken: + self.emitCurrentToken() + return True + + def afterAttributeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"=": + self.state = self.states["beforeAttributeValue"] + elif data == u">": + self.emitCurrentToken() + elif data in asciiLetters: + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + elif data == u"/": + self.state = self.states["selfClosingStartTag"] + elif data == u"'" or data == u'"': + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-after-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-end-of-tag-but-got-eof"}) + self.emitCurrentToken() + else: + self.currentToken["data"].append([data, ""]) + self.state = self.states["attributeName"] + return True + + def beforeAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.stream.charsUntil(spaceCharacters, True) + elif data == u"\"": + self.state = self.states["attributeValueDoubleQuoted"] + elif data == u"&": + self.state = self.states["attributeValueUnQuoted"] + self.stream.unget(data); + elif data == u"'": + self.state = self.states["attributeValueSingleQuoted"] + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-right-bracket"}) + self.emitCurrentToken() + elif data == u"=": + 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "equals-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + self.state = self.states["attributeValueUnQuoted"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-eof"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data + self.state = self.states["attributeValueUnQuoted"] + return True + + def attributeValueDoubleQuotedState(self): + data = self.stream.char() + if data == "\"": + self.state = self.states["afterAttributeValue"] + elif data == u"&": + self.processEntityInAttribute(u'"') + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-double-quote"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("\"", u"&")) + return True + + def attributeValueSingleQuotedState(self): + data = self.stream.char() + if data == "'": + self.state = self.states["afterAttributeValue"] + elif data == u"&": + self.processEntityInAttribute(u"'") + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-single-quote"}) + self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data +\ + self.stream.charsUntil(("'", u"&")) + return True + + def attributeValueUnQuotedState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.states["beforeAttributeName"] + elif data == u"&": + self.processEntityInAttribute(None) + elif data == u">": + self.emitCurrentToken() + elif data in (u'"', u"'", u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-no-quotes"}) 
+ self.emitCurrentToken() + else: + self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ + frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters) + return True + + def afterAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.states["beforeAttributeName"] + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.states["selfClosingStartTag"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-EOF-after-attribute-value"}) + self.emitCurrentToken() + self.stream.unget(data) + self.state = self.states["data"] + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-attribute-value"}) + self.stream.unget(data) + self.state = self.states["beforeAttributeName"] + return True + + def selfClosingStartTagState(self): + data = self.stream.char() + if data == ">": + self.currentToken["selfClosing"] = True + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "unexpected-EOF-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.states["data"] + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-soldius-in-tag"}) + self.stream.unget(data) + self.state = self.states["beforeAttributeName"] + return True + + def bogusCommentState(self): + # Make a new comment token and give it as value all the characters + # until the first > or EOF (charsUntil checks for EOF automatically) + # and emit it. + self.tokenQueue.append( + {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")}) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. 
+ self.stream.char() + self.state = self.states["data"] + return True + + def bogusCommentContinuationState(self): + # Like bogusCommentState, but the caller must create the comment token + # and this state just adds more characters to it + self.currentToken["data"] += self.stream.charsUntil(u">") + self.tokenQueue.append(self.currentToken) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. + self.stream.char() + self.state = self.states["data"] + return True + + def markupDeclarationOpenState(self): + charStack = [self.stream.char()] + if charStack[-1] == u"-": + charStack.append(self.stream.char()) + if charStack[-1] == u"-": + self.currentToken = {"type": tokenTypes["Comment"], "data": u""} + self.state = self.states["commentStart"] + return True + elif charStack[-1] in (u'd', u'D'): + matched = True + for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'), + (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')): + charStack.append(self.stream.char()) + if charStack[-1] not in expected: + matched = False + break + if matched: + self.currentToken = {"type": tokenTypes["Doctype"], + "name": u"", + "publicId": None, "systemId": None, + "correct": True} + self.state = self.states["doctype"] + return True + + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-dashes-or-doctype"}) + # charStack[:-2] consists of 'safe' characters ('-', 'd', 'o', etc) + # so they can be copied directly into the bogus comment data, and only + # the last character might be '>' or EOF and needs to be ungetted + self.stream.unget(charStack.pop()) + self.currentToken = {"type": tokenTypes["Comment"], + "data": u"".join(charStack)} + self.state = self.states["bogusCommentContinuation"] + return True + + def commentStartState(self): + data = self.stream.char() + if data == "-": + self.state = self.states["commentStartDash"] + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) 
+ self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += data + self.stream.charsUntil(u"-") + self.state = self.states["comment"] + return True + + def commentStartDashState(self): + data = self.stream.char() + if data == "-": + self.state = self.states["commentEnd"] + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-") + self.state = self.states["comment"] + return True + + + def commentState(self): + data = self.stream.char() + if data == u"-": + self.state = self.states["commentEndDash"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += data + self.stream.charsUntil(u"-") + return True + + def commentEndDashState(self): + data = self.stream.char() + if data == u"-": + self.state = self.states["commentEnd"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += u"-" + data +\ + self.stream.charsUntil(u"-") + # Consume the next character which is either a "-" or an EOF as + # well so if there's a "-" directly after the "-" we go nicely to + # the "comment end state" 
without emitting a ParseError() there. + self.stream.char() + return True + + def commentEndState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data == u"-": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-dash-after-double-dash-in-comment"}) + self.currentToken["data"] += data + elif data in spaceCharacters: + self.currentToken["data"] += "--" + data + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-space-after-double-dash-in-comment"}) + self.state = self.states["commentEndSpace"] + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-bang-after-double-dash-in-comment"}) + self.state = self.states["commentEndBang"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-double-dash"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + # XXX + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-comment"}) + self.currentToken["data"] += u"--" + data + self.state = self.states["comment"] + return True + + def commentEndBangState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data == u"-": + self.currentToken["data"] += "--!" + self.state = self.states["commentEndDash"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += u"--!" 
+ data + self.state = self.states["comment"] + return True + + def commentEndSpaceState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data == u"-": + self.state = self.states["commentEndDash"] + elif data in spaceCharacters: + self.currentToken["data"] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-space-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["data"] += data + self.state = self.states["comment"] + return True + + def doctypeState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.states["beforeDoctypeName"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "need-space-after-doctype"}) + self.stream.unget(data) + self.state = self.states["beforeDoctypeName"] + return True + + def beforeDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-right-bracket"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["name"] = data + self.state = self.states["doctypeName"] + return True + + def doctypeNameState(self): + data = self.stream.char() + if 
data in spaceCharacters: + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.state = self.states["afterDoctypeName"] + elif data == u">": + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype-name"}) + self.currentToken["correct"] = False + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + self.currentToken["name"] += data + return True + + def afterDoctypeNameState(self): + data = self.stream.char() + if data in spaceCharacters: + pass + elif data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + elif data is EOF: + self.currentToken["correct"] = False + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.tokenQueue.append(self.currentToken) + self.state = self.states["data"] + else: + if data in (u"p", u"P"): + matched = True + for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"), + (u"i", u"I"), (u"c", u"C")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.states["beforeDoctypePublicIdentifier"] + return True + elif data in (u"s", u"S"): + matched = True + for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"), + (u"e", u"E"), (u"m", u"M")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.states["beforeDoctypeSystemIdentifier"] + return True + + # All the characters read before the current 'data' will be + # [a-zA-Z], so they're garbage in the bogus doctype and can be + # discarded; only the latest character might be '>' or EOF + # and needs to be 
# html5lib tokenizer: DOCTYPE public/system identifier states.
# Each state handler consumes one character from the input stream, queues
# ParseError tokens on self.tokenQueue as needed, and switches self.state.
# Every handler returns True so the tokenizer main loop keeps pumping.

def _appendDoctypeError(self, code, datavars=None):
    """Queue a ParseError token carrying the given error code."""
    token = {"type": tokenTypes["ParseError"], "data": code}
    if datavars is not None:
        token["datavars"] = datavars
    self.tokenQueue.append(token)

def _emitIncorrectDoctype(self):
    """Flag the current DOCTYPE as incorrect, emit it, go to data state."""
    self.currentToken["correct"] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.states["data"]

def _doctypeIdentifierQuotedState(self, quoteChar, field, afterState):
    """Shared body of the four quoted public/system identifier states."""
    char = self.stream.char()
    if char == quoteChar:
        self.state = self.states[afterState]
    elif char == ">":
        self._appendDoctypeError("unexpected-end-of-doctype")
        self._emitIncorrectDoctype()
    elif char is EOF:
        self._appendDoctypeError("eof-in-doctype")
        self._emitIncorrectDoctype()
    else:
        self.currentToken[field] += char
    return True

def beforeDoctypePublicIdentifierState(self):
    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char in ("\"", "'"):
        self.currentToken["publicId"] = u""
        self.state = self.states[
            "doctypePublicIdentifierDoubleQuoted" if char == "\""
            else "doctypePublicIdentifierSingleQuoted"]
    elif char == ">":
        self._appendDoctypeError("unexpected-end-of-doctype")
        self._emitIncorrectDoctype()
    elif char is EOF:
        self._appendDoctypeError("eof-in-doctype")
        self._emitIncorrectDoctype()
    else:
        self._appendDoctypeError("unexpected-char-in-doctype")
        self.currentToken["correct"] = False
        self.state = self.states["bogusDoctype"]
    return True

def doctypePublicIdentifierDoubleQuotedState(self):
    return self._doctypeIdentifierQuotedState(
        "\"", "publicId", "afterDoctypePublicIdentifier")

def doctypePublicIdentifierSingleQuotedState(self):
    return self._doctypeIdentifierQuotedState(
        "'", "publicId", "afterDoctypePublicIdentifier")

def afterDoctypePublicIdentifierState(self):
    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char in ("\"", "'"):
        self.currentToken["systemId"] = u""
        self.state = self.states[
            "doctypeSystemIdentifierDoubleQuoted" if char == "\""
            else "doctypeSystemIdentifierSingleQuoted"]
    elif char == ">":
        # Public id immediately followed by '>' is a complete, correct
        # DOCTYPE, so no error is queued here.
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif char is EOF:
        self._appendDoctypeError("eof-in-doctype")
        self._emitIncorrectDoctype()
    else:
        self._appendDoctypeError("unexpected-char-in-doctype")
        self.currentToken["correct"] = False
        self.state = self.states["bogusDoctype"]
    return True

def beforeDoctypeSystemIdentifierState(self):
    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char in ("\"", "'"):
        self.currentToken["systemId"] = u""
        self.state = self.states[
            "doctypeSystemIdentifierDoubleQuoted" if char == "\""
            else "doctypeSystemIdentifierSingleQuoted"]
    elif char == ">":
        self._appendDoctypeError("unexpected-char-in-doctype")
        self._emitIncorrectDoctype()
    elif char is EOF:
        self._appendDoctypeError("eof-in-doctype")
        self._emitIncorrectDoctype()
    else:
        self._appendDoctypeError("unexpected-char-in-doctype")
        self.currentToken["correct"] = False
        self.state = self.states["bogusDoctype"]
    return True

def doctypeSystemIdentifierDoubleQuotedState(self):
    return self._doctypeIdentifierQuotedState(
        "\"", "systemId", "afterDoctypeSystemIdentifier")

def doctypeSystemIdentifierSingleQuotedState(self):
    return self._doctypeIdentifierQuotedState(
        "'", "systemId", "afterDoctypeSystemIdentifier")

def afterDoctypeSystemIdentifierState(self):
    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif char is EOF:
        self._appendDoctypeError("eof-in-doctype")
        self._emitIncorrectDoctype()
    else:
        # Trailing junk after the system id does NOT mark the token
        # incorrect; it is only reported, then we drain it in bogusDoctype.
        self._appendDoctypeError("unexpected-char-in-doctype")
        self.state = self.states["bogusDoctype"]
    return True

def bogusDoctypeState(self):
    char = self.stream.char()
    if char == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif char is EOF:
        # Push EOF back so the data state sees it, then emit the doctype.
        self.stream.unget(char)
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        # Everything else inside a bogus doctype is silently skipped.
        pass
    return True
# Cache of already-constructed TreeBuilder classes, keyed by tree type.
treeBuilderCache = {}

def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Return a TreeBuilder class for the requested kind of tree.

    treeType -- name of the tree type (case-insensitive).  Built-in values:
        "simpletree"    -- pure-python DOM-ish tree with pythonic idioms
        "dom"           -- generic DOM builder; defaults to xml.dom.minidom
                           for backwards compatibility with releases up to
                           0.10, whose "dom" builder was minidom-based
        "etree"         -- generic builder for elementtree-like modules
                           (ElementTree, cElementTree, lxml.etree)
        "lxml"          -- the dedicated lxml.etree builder
        "beautifulsoup" -- Beautiful Soup, if installed
    implementation -- for "etree" and "dom": the module implementing the
        tree, e.g. xml.etree.ElementTree or lxml.etree.

    An unrecognised treeType yields None.
    """
    treeType = treeType.lower()
    if treeType in treeBuilderCache:
        return treeBuilderCache[treeType]

    if treeType == "dom":
        import dom
        if implementation is None:
            # Backwards compatibility: default to minidom.
            from xml.dom import minidom
            implementation = minidom
        # Never cache here -- caching is done inside the dom submodule,
        # keyed by the implementation module.
        return dom.getDomModule(implementation, **kwargs).TreeBuilder

    if treeType == "etree":
        if implementation is None:
            implementation = _defaultETreeImplementation()
        import etree
        # Never cache here -- caching is done inside the etree submodule.
        return etree.getETreeModule(implementation, **kwargs).TreeBuilder

    if treeType == "simpletree":
        import simpletree
        treeBuilderCache[treeType] = simpletree.TreeBuilder
    elif treeType == "beautifulsoup":
        import soup
        treeBuilderCache[treeType] = soup.TreeBuilder
    elif treeType == "lxml":
        import etree_lxml
        treeBuilderCache[treeType] = etree_lxml.TreeBuilder

    # Unknown types fall through to None via dict.get.
    return treeBuilderCache.get(treeType)

def _defaultETreeImplementation():
    """Pick a sane default ElementTree implementation, fastest first."""
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        try:
            import xml.etree.ElementTree as ET
        except ImportError:
            try:
                import cElementTree as ET
            except ImportError:
                import elementtree.ElementTree as ET
    return ET
# Scope marker sentinel: inserted into the active formatting elements list
# when entering buttons, object elements, marquees, table cells and table
# captions, to stop formatting "leaking" across those boundaries.
Marker = None

class Node(object):
    """Abstract base class for an item in a parse tree.

    Attributes:
        name       -- tag name associated with the node
        parent     -- parent node, or None for the document node
        value      -- node value (text and comment nodes)
        attributes -- dict of attribute name -> value pairs
        childNodes -- child nodes; must include all elements, other node
                      types only as the tree implementation requires
        _flags     -- list of miscellaneous flags set on the node
    """

    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.attributes = {}
        self.childNodes = []
        self._flags = []

    def __unicode__(self):
        rendered = ["%s=\"%s\"" % (attrName, attrValue)
                    for attrName, attrValue in self.attributes.iteritems()]
        if rendered:
            return "<%s %s>" % (self.name, " ".join(rendered))
        return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s>" % (self.name)

    def appendChild(self, node):
        """Insert node as a child of this node (subclass responsibility)."""
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text, before insertBefore or at the end of this
        node's text (subclass responsibility)."""
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node before refNode among this node's children; raises
        ValueError if refNode is not a child (subclass responsibility)."""
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from this node's children (subclass responsibility)."""
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move every child of this node onto newParent.

        Needed so trees that do not store text as nodes still move the
        text correctly (hence delegating to appendChild per child).
        """
        # XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy: same name and attributes, but no parent
        or children (subclass responsibility)."""
        raise NotImplementedError

    def hasContent(self):
        """Return True if the node has children or text (subclass
        responsibility)."""
        raise NotImplementedError
+ listElementsMap = { + None:scopingElements, + "list":scopingElements | set([(namespaces["html"], "ol"), + (namespaces["html"], "ul")]), + "table":set([(namespaces["html"], "html"), + (namespaces["html"], "table")]) + } + listElements = listElementsMap[variant] + + for node in reversed(self.openElements): + if node.name == target: + return True + elif node.nameTuple in listElements: + return False + + assert False # We should never reach this point + + def reconstructActiveFormattingElements(self): + # Within this algorithm the order of steps described in the + # specification is not quite the same as the order of steps in the + # code. It should still do the same though. + + # Step 1: stop the algorithm when there's nothing to do. + if not self.activeFormattingElements: + return + + # Step 2 and step 3: we start with the last element. So i is -1. + i = len(self.activeFormattingElements) - 1 + entry = self.activeFormattingElements[i] + if entry == Marker or entry in self.openElements: + return + + # Step 6 + while entry != Marker and entry not in self.openElements: + if i == 0: + #This will be reset to 0 below + i = -1 + break + i -= 1 + # Step 5: let entry be one earlier in the list. 
+ entry = self.activeFormattingElements[i] + + while True: + # Step 7 + i += 1 + + # Step 8 + entry = self.activeFormattingElements[i] + clone = entry.cloneNode() #Mainly to get a new copy of the attributes + + # Step 9 + element = self.insertElement({"type":"StartTag", + "name":clone.name, + "namespace":clone.namespace, + "data":clone.attributes}) + + # Step 10 + self.activeFormattingElements[i] = element + + # Step 11 + if element == self.activeFormattingElements[-1]: + break + + def clearActiveFormattingElements(self): + entry = self.activeFormattingElements.pop() + while self.activeFormattingElements and entry != Marker: + entry = self.activeFormattingElements.pop() + + def elementInActiveFormattingElements(self, name): + """Check if an element exists between the end of the active + formatting elements and the last marker. If it does, return it, else + return false""" + + for item in self.activeFormattingElements[::-1]: + # Check for Marker first because if it's a Marker it doesn't have a + # name attribute. 
+ if item == Marker: + break + elif item.name == name: + return item + return False + + def insertRoot(self, token): + element = self.createElement(token) + self.openElements.append(element) + self.document.appendChild(element) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = self.doctypeClass(name, publicId, systemId) + self.document.appendChild(doctype) + + def insertComment(self, token, parent=None): + if parent is None: + parent = self.openElements[-1] + parent.appendChild(self.commentClass(token["data"])) + + def createElement(self, token): + """Create an element but don't insert it anywhere""" + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] + return element + + def _getInsertFromTable(self): + return self._insertFromTable + + def _setInsertFromTable(self, value): + """Switch the function used to insert an element from the + normal one to the misnested table one and back again""" + self._insertFromTable = value + if value: + self.insertElement = self.insertElementTable + else: + self.insertElement = self.insertElementNormal + + insertFromTable = property(_getInsertFromTable, _setInsertFromTable) + + def insertElementNormal(self, token): + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] + self.openElements[-1].appendChild(element) + self.openElements.append(element) + return element + + def insertElementTable(self, token): + """Create an element and insert it into the tree""" + element = self.createElement(token) + if self.openElements[-1].name not in tableInsertModeElements: + return self.insertElementNormal(token) + else: + #We should be in the InTable mode. 
This means we want to do + #special magic element rearranging + parent, insertBefore = self.getTableMisnestedNodePosition() + if insertBefore is None: + parent.appendChild(element) + else: + parent.insertBefore(element, insertBefore) + self.openElements.append(element) + return element + + def insertText(self, data, parent=None): + """Insert text data.""" + if parent is None: + parent = self.openElements[-1] + + if (not self.insertFromTable or (self.insertFromTable and + self.openElements[-1].name + not in tableInsertModeElements)): + parent.insertText(data) + else: + # We should be in the InTable mode. This means we want to do + # special magic element rearranging + parent, insertBefore = self.getTableMisnestedNodePosition() + parent.insertText(data, insertBefore) + + def getTableMisnestedNodePosition(self): + """Get the foster parent element, and sibling to insert before + (or None) when inserting a misnested table node""" + # The foster parent element is the one which comes before the most + # recently opened table element + # XXX - this is really inelegant + lastTable=None + fosterParent = None + insertBefore = None + for elm in self.openElements[::-1]: + if elm.name == "table": + lastTable = elm + break + if lastTable: + # XXX - we should really check that this parent is actually a + # node here + if lastTable.parent: + fosterParent = lastTable.parent + insertBefore = lastTable + else: + fosterParent = self.openElements[ + self.openElements.index(lastTable) - 1] + else: + fosterParent = self.openElements[0] + return fosterParent, insertBefore + + def generateImpliedEndTags(self, exclude=None): + name = self.openElements[-1].name + # XXX td, th and tr are not actually needed + if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr")) + and name != exclude): + self.openElements.pop() + # XXX This is not entirely what the specification says. We should + # investigate it more closely. 
+ self.generateImpliedEndTags(exclude) + + def getDocument(self): + "Return the final tree" + return self.document + + def getFragment(self): + "Return the final fragment" + #assert self.innerHTML + fragment = self.fragmentClass() + self.openElements[0].reparentChildren(fragment) + return fragment + + def testSerializer(self, node): + """Serialize the subtree of node in the format required by unit tests + node - the node from which to start serializing""" + raise NotImplementedError diff --git a/src/html5lib/treebuilders/dom.py b/src/html5lib/treebuilders/dom.py new file mode 100644 index 0000000000..c094e1f09b --- /dev/null +++ b/src/html5lib/treebuilders/dom.py @@ -0,0 +1,286 @@ + +from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE +import new +import re +import weakref + +import _base +from html5lib import constants, ihatexml +from html5lib.constants import namespaces + +moduleCache = {} + +def getDomModule(DomImplementation): + name = "_" + DomImplementation.__name__+"builder" + if name in moduleCache: + return moduleCache[name] + else: + mod = new.module(name) + objs = getDomBuilder(DomImplementation) + mod.__dict__.update(objs) + moduleCache[name] = mod + return mod + +def getDomBuilder(DomImplementation): + Dom = DomImplementation + class AttrList: + def __init__(self, element): + self.element = element + def __iter__(self): + return self.element.attributes.items().__iter__() + def __setitem__(self, name, value): + self.element.setAttribute(name, value) + def items(self): + return [(item[0], item[1]) for item in + self.element.attributes.items()] + def keys(self): + return self.element.attributes.keys() + def __getitem__(self, name): + return self.element.getAttribute(name) + + def __contains__(self, name): + if isinstance(name, tuple): + raise NotImplementedError + else: + return self.element.hasAttribute(name) + + class NodeBuilder(_base.Node): + def __init__(self, element): + _base.Node.__init__(self, element.nodeName) + self.element = 
element + + namespace = property(lambda self:hasattr(self.element, "namespaceURI") + and self.element.namespaceURI or None) + + def appendChild(self, node): + node.parent = self + self.element.appendChild(node.element) + + def insertText(self, data, insertBefore=None): + text = self.element.ownerDocument.createTextNode(data) + if insertBefore: + self.element.insertBefore(text, insertBefore.element) + else: + self.element.appendChild(text) + + def insertBefore(self, node, refNode): + self.element.insertBefore(node.element, refNode.element) + node.parent = self + + def removeChild(self, node): + if node.element.parentNode == self.element: + self.element.removeChild(node.element) + node.parent = None + + def reparentChildren(self, newParent): + while self.element.hasChildNodes(): + child = self.element.firstChild + self.element.removeChild(child) + newParent.element.appendChild(child) + self.childNodes = [] + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + if isinstance(name, tuple): + if name[0] is not None: + qualifiedName = (name[0] + ":" + name[1]) + else: + qualifiedName = name[1] + self.element.setAttributeNS(name[2], qualifiedName, + value) + else: + self.element.setAttribute( + name, value) + attributes = property(getAttributes, setAttributes) + + def cloneNode(self): + return NodeBuilder(self.element.cloneNode(False)) + + def hasContent(self): + return self.element.hasChildNodes() + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + + class TreeBuilder(_base.TreeBuilder): + def documentClass(self): + self.dom = Dom.getDOMImplementation().createDocument(None,None,None) + return weakref.proxy(self) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + 
# Methods of the minidom-backed TreeBuilder (class body inside the
# getDomBuilder factory); the class statement and insertDoctype begin on
# an earlier line.

def elementClass(self, name, namespace=None):
    """Create a NodeBuilder-wrapped DOM element, namespaced if either an
    explicit or a default namespace is in effect."""
    if namespace is None and self.defaultNamespace is None:
        node = self.dom.createElement(name)
    else:
        node = self.dom.createElementNS(namespace, name)
    return NodeBuilder(node)

def commentClass(self, data):
    """Create a NodeBuilder-wrapped DOM comment node."""
    return NodeBuilder(self.dom.createComment(data))

def fragmentClass(self):
    """Create a NodeBuilder-wrapped DOM document fragment."""
    return NodeBuilder(self.dom.createDocumentFragment())

def appendChild(self, node):
    """Append the unwrapped DOM node directly to the document."""
    self.dom.appendChild(node.element)

def testSerializer(self, element):
    return testSerializer(element)

def getDocument(self):
    """Return the underlying DOM document object."""
    return self.dom

def getFragment(self):
    """Return the unwrapped DOM fragment node."""
    return _base.TreeBuilder.getFragment(self).element

def insertText(self, data, parent=None):
    """Insert character data, special-casing text at the document node.

    BUGFIX: the original tested ``parent <> self`` with the Python-2-only
    ``<>`` operator (a syntax error on Python 3); replaced with ``!=``,
    matching the rest of the package.  The no-op ``data = data`` line was
    dropped.
    """
    if parent != self:
        _base.TreeBuilder.insertText(self, data, parent)
    else:
        # HACK: allow text nodes as children of the document node
        if hasattr(self.dom, '_child_node_types'):
            if Node.TEXT_NODE not in self.dom._child_node_types:
                self.dom._child_node_types = list(self.dom._child_node_types)
                self.dom._child_node_types.append(Node.TEXT_NODE)
        self.dom.appendChild(self.dom.createTextNode(data))

# dom builders have no single tag-name concept at the document level.
name = None
rv.append("#document-fragment") + elif element.nodeType == Node.COMMENT_NODE: + rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue)) + elif element.nodeType == Node.TEXT_NODE: + rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue)) + else: + if (hasattr(element, "namespaceURI") and + element.namespaceURI != None): + name = "%s %s"%(constants.prefixes[element.namespaceURI], + element.nodeName) + else: + name = element.nodeName + rv.append("|%s<%s>"%(' '*indent, name)) + if element.hasAttributes(): + i = 0 + attr = element.attributes.item(i) + while attr: + name = attr.nodeName + value = attr.value + ns = attr.namespaceURI + if ns: + name = "%s %s"%(constants.prefixes[ns], attr.localName) + else: + name = attr.nodeName + i += 1 + attr = element.attributes.item(i) + + rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) + indent += 2 + for child in element.childNodes: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + + def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): + if node.nodeType == Node.ELEMENT_NODE: + if not nsmap: + handler.startElement(node.nodeName, node.attributes) + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endElement(node.nodeName) + else: + attributes = dict(node.attributes.itemsNS()) + + # gather namespace declarations + prefixes = [] + for attrname in node.attributes.keys(): + attr = node.getAttributeNode(attrname) + if (attr.namespaceURI == XMLNS_NAMESPACE or + (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): + prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None) + handler.startPrefixMapping(prefix, attr.nodeValue) + prefixes.append(prefix) + nsmap = nsmap.copy() + nsmap[prefix] = attr.nodeValue + del attributes[(attr.namespaceURI, attr.nodeName)] + + # apply namespace declarations + for attrname in node.attributes.keys(): + attr = node.getAttributeNode(attrname) + if attr.namespaceURI == None and ':' in attr.nodeName: + prefix = 
attr.nodeName.split(':')[0] + if nsmap.has_key(prefix): + del attributes[(attr.namespaceURI, attr.nodeName)] + attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue + + # SAX events + ns = node.namespaceURI or nsmap.get(None,None) + handler.startElementNS((ns,node.nodeName), node.nodeName, attributes) + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endElementNS((ns, node.nodeName), node.nodeName) + for prefix in prefixes: handler.endPrefixMapping(prefix) + + elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: + handler.characters(node.nodeValue) + + elif node.nodeType == Node.DOCUMENT_NODE: + handler.startDocument() + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endDocument() + + elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: dom2sax(child, handler, nsmap) + + else: + # ATTRIBUTE_NODE + # ENTITY_NODE + # PROCESSING_INSTRUCTION_NODE + # COMMENT_NODE + # DOCUMENT_TYPE_NODE + # NOTATION_NODE + pass + + return locals() + +# Keep backwards compatibility with things that directly load +# classes/functions from this module +for key, value in getDomModule(minidom).__dict__.items(): + globals()[key] = value diff --git a/src/html5lib/treebuilders/etree.py b/src/html5lib/treebuilders/etree.py new file mode 100644 index 0000000000..47efface96 --- /dev/null +++ b/src/html5lib/treebuilders/etree.py @@ -0,0 +1,329 @@ +import new +import re + +import _base +from html5lib import ihatexml +from html5lib import constants +from html5lib.constants import namespaces + +tag_regexp = re.compile("{([^}]*)}(.*)") + +moduleCache = {} + +def getETreeModule(ElementTreeImplementation, fullTree=False): + name = "_" + ElementTreeImplementation.__name__+"builder" + if name in moduleCache: + return moduleCache[name] + else: + mod = new.module("_" + ElementTreeImplementation.__name__+"builder") + objs = getETreeBuilder(ElementTreeImplementation, fullTree) + mod.__dict__.update(objs) + 
moduleCache[name] = mod + return mod + +def getETreeBuilder(ElementTreeImplementation, fullTree=False): + ElementTree = ElementTreeImplementation + class Element(_base.Node): + def __init__(self, name, namespace=None): + self._name = name + self._namespace = namespace + self._element = ElementTree.Element(self._getETreeTag(name, + namespace)) + if namespace is None: + self.nameTuple = namespaces["html"], self._name + else: + self.nameTuple = self._namespace, self._name + self.parent = None + self._childNodes = [] + self._flags = [] + + def _getETreeTag(self, name, namespace): + if namespace is None: + etree_tag = name + else: + etree_tag = "{%s}%s"%(namespace, name) + return etree_tag + + def _setName(self, name): + self._name = name + self._element.tag = self._getETreeTag(self._name, self._namespace) + + def _getName(self): + return self._name + + name = property(_getName, _setName) + + def _setNamespace(self, namespace): + self._namespace = namespace + self._element.tag = self._getETreeTag(self._name, self._namespace) + + def _getNamespace(self): + return self._namespace + + namespace = property(_getNamespace, _setNamespace) + + def _getAttributes(self): + return self._element.attrib + + def _setAttributes(self, attributes): + #Delete existing attributes first + #XXX - there may be a better way to do this... 
+ for key in self._element.attrib.keys(): + del self._element.attrib[key] + for key, value in attributes.iteritems(): + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], key[1]) + else: + name = key + self._element.set(name, value) + + attributes = property(_getAttributes, _setAttributes) + + def _getChildNodes(self): + return self._childNodes + def _setChildNodes(self, value): + del self._element[:] + self._childNodes = [] + for element in value: + self.insertChild(element) + + childNodes = property(_getChildNodes, _setChildNodes) + + def hasContent(self): + """Return true if the node has children or text""" + return bool(self._element.text or self._element.getchildren()) + + def appendChild(self, node): + self._childNodes.append(node) + self._element.append(node._element) + node.parent = self + + def insertBefore(self, node, refNode): + index = self._element.getchildren().index(refNode._element) + self._element.insert(index, node._element) + node.parent = self + + def removeChild(self, node): + self._element.remove(node._element) + node.parent=None + + def insertText(self, data, insertBefore=None): + if not(len(self._element)): + if not self._element.text: + self._element.text = "" + self._element.text += data + elif insertBefore is None: + #Insert the text as the tail of the last child element + if not self._element[-1].tail: + self._element[-1].tail = "" + self._element[-1].tail += data + else: + #Insert the text before the specified node + children = self._element.getchildren() + index = children.index(insertBefore._element) + if index > 0: + if not self._element[index-1].tail: + self._element[index-1].tail = "" + self._element[index-1].tail += data + else: + if not self._element.text: + self._element.text = "" + self._element.text += data + + def cloneNode(self): + element = Element(self.name, self.namespace) + for name, value in self.attributes.iteritems(): + element.attributes[name] = value + return element + + def reparentChildren(self, newParent): + 
# etree treebuilder node classes and serializers (closure body inside the
# getETreeBuilder factory, which ends with ``return locals()``; ElementTree
# and Element are closure names from that factory).

class Comment(Element):
    def __init__(self, data):
        # Wraps an ElementTree Comment directly instead of going through
        # Element.__init__ (comments have no tag name to set).
        self._element = ElementTree.Comment(data)
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    # Comment text, stored as the wrapped element's .text.
    data = property(_getData, _setData)

class DocumentType(Element):
    """Doctype node faked as an element with the magic tag "<!DOCTYPE>"."""
    def __init__(self, name, publicId, systemId):
        Element.__init__(self, "<!DOCTYPE>")
        self._element.text = name
        self.publicId = publicId
        self.systemId = systemId

    def _getPublicId(self):
        return self._element.get(u"publicId", "")

    def _setPublicId(self, value):
        if value is not None:
            self._element.set(u"publicId", value)

    publicId = property(_getPublicId, _setPublicId)

    def _getSystemId(self):
        return self._element.get(u"systemId", "")

    def _setSystemId(self, value):
        if value is not None:
            self._element.set(u"systemId", value)

    systemId = property(_getSystemId, _setSystemId)

class Document(Element):
    """Document root faked as an element with tag "<DOCUMENT_ROOT>"."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_ROOT>")

class DocumentFragment(Element):
    """Fragment root faked as an element with tag "<DOCUMENT_FRAGMENT>"."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_FRAGMENT>")

def testSerializer(element):
    """Serialize element and its subtree in the unit-test format.

    CLEANUP: the original declared a ``finalText`` local that a nested
    function assignment silently rebound (Python 2 has no ``nonlocal``),
    so the trailing-text branch at the end could never run; the dead
    machinery has been removed.  Trailing text on the root is already
    emitted by the generic tail handling below, so output is unchanged.
    """
    rv = []
    def serializeElement(element, indent=0):
        if not(hasattr(element, "tag")):
            element = element.getroot()
        if element.tag == "<!DOCTYPE>":
            if element.get("publicId") or element.get("systemId"):
                publicId = element.get("publicId") or ""
                systemId = element.get("systemId") or ""
                rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
                    element.text, publicId, systemId))
            else:
                rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag == "<DOCUMENT_ROOT>":
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            nsmatch = tag_regexp.match(element.tag)
            if nsmatch is None:
                name = element.tag
            else:
                ns, name = nsmatch.groups()
                prefix = constants.prefixes[ns]
                name = "%s %s"%(prefix, name)
            rv.append("|%s<%s>"%(' '*indent, name))

            if hasattr(element, "attrib"):
                for name, value in element.attrib.iteritems():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        prefix = constants.prefixes[ns]
                        name = "%s %s"%(prefix, name)
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    return "\n".join(rv)

def tostring(element):
    """Serialize an element and its child nodes to a markup string.

    BUGFIXES versus the original:
    - ``finalText`` was a plain local rebound inside the nested function,
      so text trailing the root element was silently lost; a mutable cell
      (one-element list) now carries it out.
    - The final append used ``"%s\\""`` (one placeholder) with two
      arguments, which would have raised TypeError had it ever run; the
      trailing text is now emitted verbatim.
    - The with-attributes open tag and the close tag bypassed
      ``filter.fromXmlName`` while the attribute-less open tag used it,
      producing mismatched tags whenever the filter renamed; all three
      now filter consistently.
    """
    rv = []
    finalText = []  # mutable cell for text trailing the root element
    filter = ihatexml.InfosetFilter()
    def serializeElement(element):
        if type(element) == type(ElementTree.ElementTree):
            element = element.getroot()

        if element.tag == "<!DOCTYPE>":
            if element.get("publicId") or element.get("systemId"):
                publicId = element.get("publicId") or ""
                systemId = element.get("systemId") or ""
                rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
                    element.text, publicId, systemId))
            else:
                rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag == "<DOCUMENT_ROOT>":
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText.append(element.tail)

            for child in element.getchildren():
                serializeElement(child)

        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("<!--%s-->"%(element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(filter.fromXmlName(element.tag),))
            else:
                attr = " ".join(["%s=\"%s\""%(
                    filter.fromXmlName(name), value)
                    for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(filter.fromXmlName(element.tag), attr))
            if element.text:
                rv.append(element.text)

            for child in element.getchildren():
                serializeElement(child)

            rv.append("</%s>"%(filter.fromXmlName(element.tag),))

            if element.tail:
                rv.append(element.tail)

    serializeElement(element)

    # Emit any text that trailed the root element (see docstring).
    rv.extend(finalText)

    return "".join(rv)

class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment
    fragmentClass = DocumentFragment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # fullTree is a closure flag of getETreeBuilder: either hand back
        # the synthetic <DOCUMENT_ROOT> element or just the html element.
        if fullTree:
            return self.document._element
        else:
            return self.document._element.find("html")

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self)._element
The downside of this is that we cannot represent +all possible trees; specifically the following are known to cause problems: + +Text or comments as siblings of the root element +Docypes with no name + +When any of these things occur, we emit a DataLossWarning +""" + +class DocumentType(object): + def __init__(self, name, publicId, systemId): + self.name = name + self.publicId = publicId + self.systemId = systemId + +class Document(object): + def __init__(self): + self._elementTree = None + self._childNodes = [] + + def appendChild(self, element): + self._elementTree.getroot().addnext(element._element) + + def _getChildNodes(self): + return self._childNodes + + childNodes = property(_getChildNodes) + +def testSerializer(element): + rv = [] + finalText = None + filter = ihatexml.InfosetFilter() + def serializeElement(element, indent=0): + if not hasattr(element, "tag"): + if hasattr(element, "getroot"): + #Full tree case + rv.append("#document") + if element.docinfo.internalDTD: + if not (element.docinfo.public_id or + element.docinfo.system_url): + dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name + else: + dtd_str = """<!DOCTYPE %s "%s" "%s">"""%( + element.docinfo.root_name, + element.docinfo.public_id, + element.docinfo.system_url) + rv.append("|%s%s"%(' '*(indent+2), dtd_str)) + next_element = element.getroot() + while next_element.getprevious() is not None: + next_element = next_element.getprevious() + while next_element is not None: + serializeElement(next_element, indent+2) + next_element = next_element.getnext() + elif isinstance(element, basestring): + #Text in a fragment + rv.append("|%s\"%s\""%(' '*indent, element)) + else: + #Fragment case + rv.append("#document-fragment") + for next_element in element: + serializeElement(next_element, indent+2) + elif type(element.tag) == type(etree.Comment): + rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) + else: + nsmatch = etree_builders.tag_regexp.match(element.tag) + if nsmatch is not None: + ns = 
nsmatch.group(1) + tag = nsmatch.group(2) + prefix = constants.prefixes[ns] + rv.append("|%s<%s %s>"%(' '*indent, prefix, + filter.fromXmlName(tag))) + else: + rv.append("|%s<%s>"%(' '*indent, + filter.fromXmlName(element.tag))) + + if hasattr(element, "attrib"): + for name, value in element.attrib.iteritems(): + nsmatch = etree_builders.tag_regexp.match(name) + if nsmatch: + ns = nsmatch.group(1) + name = nsmatch.group(2) + prefix = constants.prefixes[ns] + rv.append('|%s%s %s="%s"' % (' '*(indent+2), + prefix, + filter.fromXmlName(name), + value)) + else: + rv.append('|%s%s="%s"' % (' '*(indent+2), + filter.fromXmlName(name), + value)) + if element.text: + rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) + indent += 2 + for child in element.getchildren(): + serializeElement(child, indent) + if hasattr(element, "tail") and element.tail: + rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) + serializeElement(element, 0) + + if finalText is not None: + rv.append("|%s\"%s\""%(' '*2, finalText)) + + return "\n".join(rv) + +def tostring(element): + """Serialize an element and its child nodes to a string""" + rv = [] + finalText = None + def serializeElement(element): + if not hasattr(element, "tag"): + if element.docinfo.internalDTD: + if element.docinfo.doctype: + dtd_str = element.docinfo.doctype + else: + dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name + rv.append(dtd_str) + serializeElement(element.getroot()) + + elif type(element.tag) == type(etree.Comment): + rv.append("<!--%s-->"%(element.text,)) + + else: + #This is assumed to be an ordinary element + if not element.attrib: + rv.append("<%s>"%(element.tag,)) + else: + attr = " ".join(["%s=\"%s\""%(name, value) + for name, value in element.attrib.iteritems()]) + rv.append("<%s %s>"%(element.tag, attr)) + if element.text: + rv.append(element.text) + + for child in element.getchildren(): + serializeElement(child) + + rv.append("</%s>"%(element.tag,)) + + if hasattr(element, "tail") and element.tail: 
+ rv.append(element.tail) + + serializeElement(element) + + if finalText is not None: + rv.append("%s\""%(' '*2, finalText)) + + return "".join(rv) + + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = None + commentClass = None + fragmentClass = Document + + def __init__(self, namespaceHTMLElements, fullTree = False): + builder = etree_builders.getETreeModule(etree, fullTree=fullTree) + filter = self.filter = ihatexml.InfosetFilter() + self.namespaceHTMLElements = namespaceHTMLElements + + class Attributes(dict): + def __init__(self, element, value={}): + self._element = element + dict.__init__(self, value) + for key, value in self.iteritems(): + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + class Element(builder.Element): + def __init__(self, name, namespace): + name = filter.coerceElement(name) + builder.Element.__init__(self, name, namespace=namespace) + self._attributes = Attributes(self) + + def _setName(self, name): + self._name = filter.coerceElement(name) + self._element.tag = self._getETreeTag( + self._name, self._namespace) + + def _getName(self): + return filter.fromXmlName(self._name) + + name = property(_getName, _setName) + + def _getAttributes(self): + return self._attributes + + def _setAttributes(self, attributes): + self._attributes = Attributes(self, attributes) + + attributes = property(_getAttributes, _setAttributes) + + def insertText(self, data, insertBefore=None): + data = filter.coerceCharacters(data) + builder.Element.insertText(self, data, insertBefore) + + def appendChild(self, child): + 
builder.Element.appendChild(self, child) + + + class Comment(builder.Comment): + def __init__(self, data): + data = filter.coerceComment(data) + builder.Comment.__init__(self, data) + + def _setData(self, data): + data = filter.coerceComment(data) + self._element.text = data + + def _getData(self): + return self._element.text + + data = property(_getData, _setData) + + self.elementClass = Element + self.commentClass = builder.Comment + #self.fragmentClass = builder.DocumentFragment + _base.TreeBuilder.__init__(self, namespaceHTMLElements) + + def reset(self): + _base.TreeBuilder.reset(self) + self.insertComment = self.insertCommentInitial + self.initial_comments = [] + self.doctype = None + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + if fullTree: + return self.document._elementTree + else: + return self.document._elementTree.getroot() + + def getFragment(self): + fragment = [] + element = self.openElements[0]._element + if element.text: + fragment.append(element.text) + fragment.extend(element.getchildren()) + if element.tail: + fragment.append(element.tail) + return fragment + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"': + warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) + + doctype = self.doctypeClass(name, publicId, systemId) + self.doctype = doctype + + def insertCommentInitial(self, data, parent=None): + self.initial_comments.append(data) + + def insertRoot(self, token): + """Create the document root""" + #Because of the way libxml2 works, it doesn't seem to be possible to + #alter information like the doctype after the tree has been parsed. 
+ #Therefore we need to use the built-in parser to create our iniial + #tree, after which we can add elements like normal + docStr = "" + if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'): + docStr += "<!DOCTYPE %s"%self.doctype.name + if (self.doctype.publicId is not None or + self.doctype.systemId is not None): + docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "", + self.doctype.systemId or "") + docStr += ">" + docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>" + + try: + root = etree.fromstring(docStr) + except etree.XMLSyntaxError: + print docStr + raise + + #Append the initial comments: + for comment_token in self.initial_comments: + root.addprevious(etree.Comment(comment_token["data"])) + + #Create the root document and add the ElementTree to it + self.document = self.documentClass() + self.document._elementTree = root.getroottree() + + # Give the root element the right name + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + if namespace is None: + etree_tag = name + else: + etree_tag = "{%s}%s"%(namespace, name) + root.tag = etree_tag + + #Add the root element to the internal child/open data structures + root_element = self.elementClass(name, namespace) + root_element._element = root + self.document._childNodes.append(root_element) + self.openElements.append(root_element) + + #Reset to the default insert comment function + self.insertComment = super(TreeBuilder, self).insertComment diff --git a/src/html5lib/treebuilders/simpletree.py b/src/html5lib/treebuilders/simpletree.py new file mode 100644 index 0000000000..ff6bfe4003 --- /dev/null +++ b/src/html5lib/treebuilders/simpletree.py @@ -0,0 +1,248 @@ +import _base +from html5lib.constants import voidElements, namespaces, prefixes +from xml.sax.saxutils import escape + +# Really crappy basic implementation of a DOM-core like thing +class Node(_base.Node): + type = -1 + def __init__(self, name): + self.name = name + self.parent = None + 
self.value = None + self.childNodes = [] + self._flags = [] + + def __iter__(self): + for node in self.childNodes: + yield node + for item in node: + yield item + + def __unicode__(self): + return self.name + + def toxml(self): + raise NotImplementedError + + def printTree(self, indent=0): + tree = '\n|%s%s' % (' '* indent, unicode(self)) + for child in self.childNodes: + tree += child.printTree(indent + 2) + return tree + + def appendChild(self, node): + if (isinstance(node, TextNode) and self.childNodes and + isinstance(self.childNodes[-1], TextNode)): + self.childNodes[-1].value += node.value + else: + self.childNodes.append(node) + node.parent = self + + def insertText(self, data, insertBefore=None): + if insertBefore is None: + self.appendChild(TextNode(data)) + else: + self.insertBefore(TextNode(data), insertBefore) + + def insertBefore(self, node, refNode): + index = self.childNodes.index(refNode) + if (isinstance(node, TextNode) and index > 0 and + isinstance(self.childNodes[index - 1], TextNode)): + self.childNodes[index - 1].value += node.value + else: + self.childNodes.insert(index, node) + node.parent = self + + def removeChild(self, node): + try: + self.childNodes.remove(node) + except: + # XXX + raise + node.parent = None + + def cloneNode(self): + raise NotImplementedError + + def hasContent(self): + """Return true if the node has children or text""" + return bool(self.childNodes) + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class Document(Node): + type = 1 + def __init__(self): + Node.__init__(self, None) + + def __unicode__(self): + return "#document" + + def appendChild(self, child): + Node.appendChild(self, child) + + def toxml(self, encoding="utf=8"): + result = "" + for child in self.childNodes: + result += child.toxml() + return result.encode(encoding) + + def hilite(self, encoding="utf-8"): + result = "<pre>" 
+ for child in self.childNodes: + result += child.hilite() + return result.encode(encoding) + "</pre>" + + def printTree(self): + tree = unicode(self) + for child in self.childNodes: + tree += child.printTree(2) + return tree + + def cloneNode(self): + return Document() + +class DocumentFragment(Document): + type = 2 + def __unicode__(self): + return "#document-fragment" + + def cloneNode(self): + return DocumentFragment() + +class DocumentType(Node): + type = 3 + def __init__(self, name, publicId, systemId): + Node.__init__(self, name) + self.publicId = publicId + self.systemId = systemId + + def __unicode__(self): + if self.publicId or self.systemId: + publicId = self.publicId or "" + systemId = self.systemId or "" + return """<!DOCTYPE %s "%s" "%s">"""%( + self.name, publicId, systemId) + + else: + return u"<!DOCTYPE %s>" % self.name + + + toxml = __unicode__ + + def hilite(self): + return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name + + def cloneNode(self): + return DocumentType(self.name, self.publicId, self.systemId) + +class TextNode(Node): + type = 4 + def __init__(self, value): + Node.__init__(self, None) + self.value = value + + def __unicode__(self): + return u"\"%s\"" % self.value + + def toxml(self): + return escape(self.value) + + hilite = toxml + + def cloneNode(self): + return TextNode(self.value) + +class Element(Node): + type = 5 + def __init__(self, name, namespace=None): + Node.__init__(self, name) + self.namespace = namespace + self.attributes = {} + + def __unicode__(self): + if self.namespace == None: + return u"<%s>" % self.name + else: + return u"<%s %s>"%(prefixes[self.namespace], self.name) + + def toxml(self): + result = '<' + self.name + if self.attributes: + for name,value in self.attributes.iteritems(): + result += u' %s="%s"' % (name, escape(value,{'"':'"'})) + if self.childNodes: + result += '>' + for child in self.childNodes: + result += child.toxml() + result += u'</%s>' % self.name + else: + result += u'/>' + 
return result + + def hilite(self): + result = '<<code class="markup element-name">%s</code>' % self.name + if self.attributes: + for name, value in self.attributes.iteritems(): + result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'"'})) + if self.childNodes: + result += ">" + for child in self.childNodes: + result += child.hilite() + elif self.name in voidElements: + return result + ">" + return result + '</<code class="markup element-name">%s</code>>' % self.name + + def printTree(self, indent): + tree = '\n|%s%s' % (' '*indent, unicode(self)) + indent += 2 + if self.attributes: + for name, value in self.attributes.iteritems(): + if isinstance(name, tuple): + name = "%s %s"%(name[0], name[1]) + tree += '\n|%s%s="%s"' % (' ' * indent, name, value) + for child in self.childNodes: + tree += child.printTree(indent) + return tree + + def cloneNode(self): + newNode = Element(self.name) + if hasattr(self, 'namespace'): + newNode.namespace = self.namespace + for attr, value in self.attributes.iteritems(): + newNode.attributes[attr] = value + return newNode + +class CommentNode(Node): + type = 6 + def __init__(self, data): + Node.__init__(self, None) + self.data = data + + def __unicode__(self): + return "<!-- %s -->" % self.data + + def toxml(self): + return "<!--%s-->" % self.data + + def hilite(self): + return '<code class="markup comment"><!--%s--></code>' % escape(self.data) + + def cloneNode(self): + return CommentNode(self.data) + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = Element + commentClass = CommentNode + fragmentClass = DocumentFragment + + def testSerializer(self, node): + return node.printTree() diff --git a/src/html5lib/treebuilders/soup.py b/src/html5lib/treebuilders/soup.py new file mode 100644 index 0000000000..807a998cd1 --- /dev/null +++ b/src/html5lib/treebuilders/soup.py @@ -0,0 +1,228 @@ +import 
warnings + +warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning) + +from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration + +import _base +from html5lib.constants import namespaces, DataLossWarning + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return self.attrs.items().__iter__() + def __setitem__(self, name, value): + "set attr", name, value + self.element[name] = value + def items(self): + return self.attrs.items() + def keys(self): + return self.attrs.keys() + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in self.attrs.keys() + + +class Element(_base.Node): + def __init__(self, element, soup, namespace): + _base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def _nodeIndex(self, node, refNode): + # Finds a node by identity rather than equality + for index in range(len(self.element.contents)): + if id(self.element.contents[index]) == id(refNode.element): + return index + return None + + def appendChild(self, node): + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # Concatenate new text onto old text node + # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...") + newStr = NavigableString(self.element.contents[-1]+node.element) + + # Remove the old text node + # (Can't simply use .extract() by itself, because it fails if + # an equal text node exists within the parent node) + oldElement = self.element.contents[-1] + del self.element.contents[-1] + oldElement.parent = None + oldElement.extract() + + self.element.insert(len(self.element.contents), newStr) + else: + self.element.insert(len(self.element.contents), 
node.element) + node.parent = self + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + self.element[name] = value + + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(NavigableString(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self._nodeIndex(node, refNode) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + newStr = NavigableString(self.element.contents[index-1]+node.element) + oldNode = self.element.contents[index-1] + del self.element.contents[index-1] + oldNode.parent = None + oldNode.extract() + + self.element.insert(index-1, newStr) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + index = self._nodeIndex(node.parent, node) + del node.parent.element.contents[index] + node.element.parent = None + node.element.extract() + node.parent = None + + def reparentChildren(self, newParent): + while self.element.contents: + child = self.element.contents[0] + child.extract() + if isinstance(child, Tag): + newParent.appendChild(Element(child, self.soup, namespaces["html"])) + else: + newParent.appendChild(TextNode(child, self.soup)) + + def cloneNode(self): + node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, 
element, soup): + _base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError + +class TreeBuilder(_base.TreeBuilder): + def __init__(self, namespaceHTMLElements): + if namespaceHTMLElements: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + _base.TreeBuilder.__init__(self, namespaceHTMLElements) + + def documentClass(self): + self.soup = BeautifulSoup("") + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if publicId: + self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or ""))) + elif systemId: + self.soup.insert(0, Declaration("%s SYSTEM \"%s\""% + (name, systemId))) + else: + self.soup.insert(0, Declaration(name)) + + def elementClass(self, name, namespace): + if namespace is not None: + warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) + return Element(Tag(self.soup, name), self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + self.soup.insert(len(self.soup.contents), node.element) + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element + +def testSerializer(element): + import re + rv = [] + def serializeElement(element, indent=0): + if isinstance(element, Declaration): + doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?' 
+ m = re.compile(doctype_regexp).match(element.string) + assert m is not None, "DOCTYPE did not match expected format" + name = m.group('name') + publicId = m.group('publicId') + if publicId is not None: + systemId = m.group('systemId1') or "" + else: + systemId = m.group('systemId2') + + if publicId is not None or systemId is not None: + rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""% + (' '*indent, name, publicId or "", systemId or "")) + else: + rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name)) + + elif isinstance(element, BeautifulSoup): + if element.name == "[document_fragment]": + rv.append("#document-fragment") + else: + rv.append("#document") + + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->"%(' '*indent, element.string)) + elif isinstance(element, unicode): + rv.append("|%s\"%s\"" %(' '*indent, element)) + else: + rv.append("|%s<%s>"%(' '*indent, element.name)) + if element.attrs: + for name, value in element.attrs: + rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) + indent += 2 + if hasattr(element, "contents"): + for child in element.contents: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) diff --git a/src/html5lib/treewalkers/__init__.py b/src/html5lib/treewalkers/__init__.py new file mode 100644 index 0000000000..3a606a8b3c --- /dev/null +++ b/src/html5lib/treewalkers/__init__.py @@ -0,0 +1,52 @@ +"""A collection of modules for iterating through different kinds of +tree, generating tokens identical to those produced by the tokenizer +module. + +To create a tree walker for a new type of tree, you need to do +implement a tree walker object (called TreeWalker by convention) that +implements a 'serialize' method taking a tree as sole argument and +returning an iterator generating tokens. 
+""" + +treeWalkerCache = {} + +def getTreeWalker(treeType, implementation=None, **kwargs): + """Get a TreeWalker class for various types of tree with built-in support + + treeType - the name of the tree type required (case-insensitive). Supported + values are "simpletree", "dom", "etree" and "beautifulsoup" + + "simpletree" - a built-in DOM-ish tree type with support for some + more pythonic idioms. + "dom" - The xml.dom.minidom DOM implementation + "pulldom" - The xml.dom.pulldom event stream + "etree" - A generic walker for tree implementations exposing an + elementtree-like interface (known to work with + ElementTree, cElementTree and lxml.etree). + "lxml" - Optimized walker for lxml.etree + "beautifulsoup" - Beautiful soup (if installed) + "genshi" - a Genshi stream + + implementation - (Currently applies to the "etree" tree type only). A module + implementing the tree type e.g. xml.etree.ElementTree or + cElementTree.""" + + treeType = treeType.lower() + if treeType not in treeWalkerCache: + if treeType in ("dom", "pulldom", "simpletree"): + mod = __import__(treeType, globals()) + treeWalkerCache[treeType] = mod.TreeWalker + elif treeType == "genshi": + import genshistream + treeWalkerCache[treeType] = genshistream.TreeWalker + elif treeType == "beautifulsoup": + import soup + treeWalkerCache[treeType] = soup.TreeWalker + elif treeType == "lxml": + import lxmletree + treeWalkerCache[treeType] = lxmletree.TreeWalker + elif treeType == "etree": + import etree + # XXX: NEVER cache here, caching is done in the etree submodule + return etree.getETreeModule(implementation, **kwargs).TreeWalker + return treeWalkerCache.get(treeType) diff --git a/src/html5lib/treewalkers/_base.py b/src/html5lib/treewalkers/_base.py new file mode 100644 index 0000000000..2c3b819d88 --- /dev/null +++ b/src/html5lib/treewalkers/_base.py @@ -0,0 +1,165 @@ +import gettext +_ = gettext.gettext + +from html5lib.constants import voidElements, spaceCharacters +spaceCharacters = 
# NOTE(review): the spaceCharacters import is above the visible region; upstream
# html5lib _base.py collapses it to a single string for str.strip() use — TODO confirm.
spaceCharacters = u"".join(spaceCharacters)

class TreeWalker(object):
    """Base class for tree walkers.

    Iterating a subclass yields a stream of token dicts (StartTag, EndTag,
    EmptyTag, Characters, SpaceCharacters, Comment, Doctype, SerializeError)
    consumed by the serializers.
    """

    def __init__(self, tree):
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        # Errors are emitted in-band as tokens rather than raised.
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        # Accept None, a mapping, or a sequence of pairs; always return a
        # list of (unicode, unicode) pairs.
        if not attrs:
            attrs = []
        elif hasattr(attrs, 'items'):
            attrs = attrs.items()
        return [(unicode(name), unicode(value)) for name, value in attrs]

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        yield {"type": "EmptyTag", "name": unicode(name),
               "namespace": unicode(namespace),
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            # Void elements may not have children; report but keep walking.
            yield self.error(_("Void element has children"))

    def startTag(self, namespace, name, attrs):
        return {"type": "StartTag",
                "name": unicode(name),
                "namespace": unicode(namespace),
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, namespace, name):
        return {"type": "EndTag",
                "name": unicode(name),
                "namespace": unicode(namespace),
                "data": []}

    def text(self, data):
        """Split *data* into up to three tokens: leading whitespace,
        non-space middle, trailing whitespace."""
        data = unicode(data)
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        return {"type": "Comment", "data": unicode(data)}

    def doctype(self, name, publicId=None, systemId=None, correct=True):
        return {"type": "Doctype",
                "name": name is not None and unicode(name) or u"",
                "publicId": publicId,
                "systemId": systemId,
                "correct": correct}

    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)


class RecursiveTreeWalker(TreeWalker):
    def walkChildren(self, node):
        # BUG FIX: was ``raise NodeImplementedError`` — an undefined name,
        # so callers got a NameError instead of NotImplementedError.
        raise NotImplementedError

    def element(self, node, namespace, name, attrs, hasChildren):
        if name in voidElements:
            for token in self.emptyTag(namespace, name, attrs, hasChildren):
                yield token
        else:
            # BUG FIX: startTag/endTag take (namespace, name, ...); the
            # namespace argument was omitted, shifting `name` into the
            # namespace slot and `attrs` into the name slot.
            yield self.startTag(namespace, name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(namespace, name)


from xml.dom import Node

# Node-kind markers shared by all NonRecursiveTreeWalker subclasses.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
UNKNOWN = "<#UNKNOWN#>"


class NonRecursiveTreeWalker(TreeWalker):
    """Iterative walker: subclasses supply the four navigation primitives
    and this class drives a constant-stack depth-first traversal."""

    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    # Void elements never descend, even if malformed input
                    # gave them children.
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                # No children: emit pending end tags while climbing until a
                # next sibling (or the root) is found.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)

# --- src/html5lib/treewalkers/dom.py ---------------------------------------
from xml.dom import Node

import gettext
_ = gettext.gettext

import _base
from html5lib.constants import voidElements

class TreeWalker(_base.NonRecursiveTreeWalker):
    """Walker over an xml.dom (minidom) tree."""

    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            # BUG FIX: ``node.hasChildNodes`` (no parens) is a bound method
            # and therefore always truthy, so every void element was reported
            # as having children, producing spurious SerializeError tokens.
            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
                    node.attributes.items(), node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)

        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
def getETreeModule(ElementTreeImplementation):
    # Returns (and caches) a synthetic module wrapping the walker built for
    # the given ElementTree implementation.
    name = "_" + ElementTreeImplementation.__name__+"builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
        objs = getETreeBuilder(ElementTreeImplementation)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod

def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element

        2. The index of the element relative to its parent

        3. A stack of ancestor elements

        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents, flag = node
                if flag in ("text", "tail"):
                    return _base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            if not(hasattr(node, "tag")):
                # An ElementTree object rather than an Element: unwrap it.
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (_base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))

            elif type(node.tag) == type(ElementTree.Comment):
                return _base.COMMENT, node.text

            else:
                #This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                # hasChildren is truthy if there are child elements OR text.
                return (_base.ELEMENT, namespace, tag,
                        node.attrib.items(), len(node) or node.text)

        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                # Text pseudo-nodes have no children.
                return None
            else:
                if element.text:
                    # The element's .text comes before its child elements.
                    return element, key, parents, "text"
                elif len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None

        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                # After the .text pseudo-node come the child elements.
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                # An element's .tail is its own next "sibling" in ElementTree.
                if element.tail and flag != "tail":
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key+1], key+1, parents, None
                else:
                    return None

        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    return parent
                else:
                    # Recompute the child index of the parent in ITS parent.
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()

# --- src/html5lib/treewalkers/genshistream.py ------------------------------
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.TreeWalker):
    # Walks a genshi event stream; events are (kind, data, pos) tuples.
    def __iter__(self):
        depth = 0
        ignore_until = None
        previous = None
        for event in self.tree:
            # Tokens are emitted one event late so that the NEXT event is
            # available (needed to detect void elements with children).
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                    if token["type"] == "EmptyTag":
                        # Skip everything until the matching END at this depth.
                        ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
== START: + tag, attrib = data + name = tag.localname + namespace = tag.namespace + if tag in voidElements: + for token in self.emptyTag(namespace, name, list(attrib), + not next or next[0] != END + or next[1] != tag): + yield token + else: + yield self.startTag(namespace, name, list(attrib)) + + elif kind == END: + name = data.localname + namespace = data.namespace + if name not in voidElements: + yield self.endTag(namespace, name) + + elif kind == COMMENT: + yield self.comment(data) + + elif kind == TEXT: + for token in self.text(data): + yield token + + elif kind == DOCTYPE: + yield self.doctype(*data) + + elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \ + START_CDATA, END_CDATA, PI): + pass + + else: + yield self.unknown(kind) diff --git a/src/html5lib/treewalkers/lxmletree.py b/src/html5lib/treewalkers/lxmletree.py new file mode 100644 index 0000000000..3f4de4fcd4 --- /dev/null +++ b/src/html5lib/treewalkers/lxmletree.py @@ -0,0 +1,175 @@ +from lxml import etree +from html5lib.treebuilders.etree import tag_regexp + +from gettext import gettext +_ = gettext + +import _base + +from html5lib.constants import voidElements +from html5lib import ihatexml + +class Root(object): + def __init__(self, et): + self.elementtree = et + self.children = [] + if et.docinfo.internalDTD: + self.children.append(Doctype(self, et.docinfo.root_name, + et.docinfo.public_id, + et.docinfo.system_url)) + root = et.getroot() + node = root + + while node.getprevious() is not None: + node = node.getprevious() + while node is not None: + self.children.append(node) + node = node.getnext() + + self.text = None + self.tail = None + + def __getitem__(self, key): + return self.children[key] + + def getnext(self): + return None + + def __len__(self): + return 1 + +class Doctype(object): + def __init__(self, root_node, name, public_id, system_id): + self.root_node = root_node + self.name = name + self.public_id = public_id + self.system_id = system_id + + self.text = None + self.tail = 
class FragmentRoot(Root):
    # Root substitute for a list of fragment nodes (no doctype, no ElementTree).
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
        self.text = self.tail = None

    def getnext(self):
        return None

class FragmentWrapper(object):
    """Wraps a fragment child (element or string) and adds the sibling
    navigation (getnext) that bare lxml nodes in a list lack."""

    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
            self.text = self.obj.text
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
            self.tail = self.obj.tail
        else:
            self.tail = None
        # Plain strings represent bare text nodes in the fragment.
        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        # Delegate everything else to the wrapped object.
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        else:
            return None

    def __getitem__(self, key):
        return self.obj[key]

    def __nonzero__(self):
        return bool(self.obj)

    def getparent(self):
        return None

    def __str__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)


class TreeWalker(_base.NonRecursiveTreeWalker):
    """Walker over lxml trees. Text is represented as (element, "text"/"tail")
    tuples since lxml stores text on the owning/preceding element."""

    def __init__(self, tree):
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        # Converts html5lib-mangled names back to their original form.
        self.filter = ihatexml.InfosetFilter()
    def getNodeDetails(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)

        elif isinstance(node, Root):
            return (_base.DOCUMENT,)

        elif isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

        elif isinstance(node, FragmentWrapper) and node.isstring:
            return _base.TEXT, node

        elif node.tag == etree.Comment:
            return _base.COMMENT, node.text

        else:
            #This is assumed to be an ordinary element
            match = tag_regexp.match(node.tag)
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = node.tag
            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    [(self.filter.fromXmlName(name), value) for
                     name,value in node.attrib.iteritems()],
                    len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) or node.text, "Node has no children"
        if node.text:
            # The element's own text precedes its child elements.
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else: # tail
                return node.getnext()

        # After an element comes its tail text (if any), then its sibling.
        return node.tail and (node, "tail") or node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                return node
            # else: fallback to "normal" processing

        return node.getparent()
# --- src/html5lib/treewalkers/pulldom.py -----------------------------------
class TreeWalker(_base.TreeWalker):
    """Walker over an xml.dom.pulldom event stream (reproduced unchanged;
    the fix in this hunk is in the simpletree walker below)."""

    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            # One-event lookahead, mirroring the genshi walker: the next
            # event is needed to detect void elements with children.
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            if name in voidElements:
                for token in self.emptyTag(namespace,
                                           name,
                                           node.attributes.items(),
                                           not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(namespace, name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            if name not in voidElements:
                yield self.endTag(namespace, name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)

# --- src/html5lib/treewalkers/simpletree.py --------------------------------
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.type == 4: # TextNode
            return _base.TEXT, node.value

        elif node.type == 5: # Element
            return (_base.ELEMENT, node.namespace, node.name,
                    node.attributes.items(), node.hasContent())

        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data

        else:
            # BUG FIX: was ``_node.UNKNOWN`` — ``_node`` is undefined, so any
            # unknown node type raised NameError instead of yielding an
            # UNKNOWN marker for the walker to report.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None
class TreeWalker(_base.NonRecursiveTreeWalker):
    # Walker over a BeautifulSoup tree. The doctype must be re-parsed from
    # its string form because BeautifulSoup stores it as one Declaration.
    doctype_regexp = re.compile(
        r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
    def getNodeDetails(self, node):
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            string = unicode(node.string)
            #Slice needed to remove markup added during unicode conversion,
            #but only in some versions of BeautifulSoup/Python
            if string.startswith('<!') and string.endswith('>'):
                string = string[2:-1]
            m = self.doctype_regexp.match(string)
            #This regexp approach seems wrong and fragile
            #but beautiful soup stores the doctype as a single thing and we want the separate bits
            #It should work as long as the tree is created by html5lib itself but may be wrong if it's
            #been modified at all
            #We could just feed to it a html5lib tokenizer, I guess...
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1')
            else:
                systemId = m.group('systemId2')
            return _base.DOCTYPE, name, publicId or "", systemId or ""

        elif isinstance(node, Comment):
            string = unicode(node.string)
            # Strip the comment delimiters that BeautifulSoup keeps inline.
            if string.startswith('<!--') and string.endswith('-->'):
                string = string[4:-3]
            return _base.COMMENT, string

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            # BeautifulSoup is HTML-only, so the namespace is always html.
            return (_base.ELEMENT, namespaces["html"], node.name,
                    dict(node.attrs).items(), node.contents)
        else:
            return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        # Using _dictEntries instead of directly assigning to self is about
        # twice as fast. Please do careful performance testing before changing
        # anything here.
        _dictEntries = []
        for name, value in items:
            if type(name) in (list, tuple, frozenset, set):
                # Expand a collection key into one entry per member.
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        self.default = None

    def __getitem__(self, key):
        # Missing keys return self.default instead of raising KeyError.
        return dict.get(self, key, self.default)

#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger

class deque(object):
    """Double-ended queue backed by a dict indexed by a sliding
    [left, right) window of integers; all end operations are O(1)."""

    def __init__(self, iterable=(), maxsize=-1):
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)

    def append(self, x):
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()

    def appendleft(self, x):
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()

    def pop(self):
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem

    def popleft(self):
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem

    def clear(self):
        self.data.clear()
        self.left = self.right = 0

    def extend(self, iterable):
        for elem in iterable:
            self.append(elem)

    def extendleft(self, iterable):
        for elem in iterable:
            self.appendleft(elem)

    def rotate(self, n=1):
        if self:
            n %= len(self)
            for i in xrange(n):
                self.appendleft(self.pop())

    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            raise IndexError

    def __setitem__(self, i, value):
        if i < 0:
            i += len(self)
        try:
            self.data[i + self.left] = value
        except KeyError:
            raise IndexError

    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        # Shift everything after i one slot left, then drop the last slot.
        for j in xrange(self.left+i, self.right-1):
            data[j] = data[j+1]
        self.pop()

    def __len__(self):
        return self.right - self.left

    def __cmp__(self, other):
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))

    def __repr__(self, _track=[]):
        # _track is a deliberate shared default: it guards against infinite
        # recursion when a deque (transitively) contains itself.
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        r = 'deque(%r)' % (list(self),)
        _track.remove(id(self))
        return r

    def __getstate__(self):
        return (tuple(self),)

    def __setstate__(self, s):
        self.__init__(s[0])

    def __hash__(self):
        raise TypeError

    def __copy__(self):
        return self.__class__(self)

    def __deepcopy__(self, memo=None):
        # BUG FIX: the default was the mutable ``memo={}``, shared across all
        # top-level deepcopy calls on any deque — keeping every previously
        # copied object alive and aliasing copies between unrelated calls.
        from copy import deepcopy
        if memo is None:
            memo = {}
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result