mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update HTML 5 parser used in calibre (html5lib-python)
This commit is contained in:
parent
b4bf871077
commit
b9421065f9
@ -562,9 +562,9 @@ def entity_to_unicode(match, exceptions=[], encoding='cp1252',
|
|||||||
return check(chr(num).decode(encoding))
|
return check(chr(num).decode(encoding))
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
return check(my_unichr(num))
|
return check(my_unichr(num))
|
||||||
from calibre.utils.html5_entities import entity_map
|
from html5lib.constants import entities
|
||||||
try:
|
try:
|
||||||
return check(entity_map[ent])
|
return check(entities[ent])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
|
@ -81,10 +81,13 @@ def node_depth(node):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
def html5_parse(data, max_nesting_depth=100):
|
def html5_parse(data, max_nesting_depth=100):
|
||||||
import html5lib
|
import html5lib, warnings
|
||||||
# html5lib bug: http://code.google.com/p/html5lib/issues/detail?id=195
|
from html5lib.constants import cdataElements, rcdataElements
|
||||||
data = re.sub(r'<\s*(title|style|script|textarea)\s*[^>]*/\s*>', r'<\1></\1>', data, flags=re.I)
|
# HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
|
||||||
|
data = re.sub(r'<\s*(%s)\s*[^>]*/\s*>' % ('|'.join(cdataElements|rcdataElements)), r'<\1></\1>', data, flags=re.I)
|
||||||
|
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter('ignore')
|
||||||
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
data = html5lib.parse(data, treebuilder='lxml').getroot()
|
||||||
|
|
||||||
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
# Check that the asinine HTML 5 algorithm did not result in a tree with
|
||||||
|
@ -7,6 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from html5lib.constants import cdataElements, rcdataElements
|
||||||
|
|
||||||
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
||||||
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
|
from calibre.ebooks.oeb.base import XPath, XHTML_NS, SVG_NS, XLINK_NS
|
||||||
@ -18,7 +19,7 @@ def nonvoid_cdata_elements(test, parse_function):
|
|||||||
markup = '''
|
markup = '''
|
||||||
<html> <head><{0}/></head> <body id="test"> </html>
|
<html> <head><{0}/></head> <body id="test"> </html>
|
||||||
'''
|
'''
|
||||||
for tag in ('title', 'style', 'script', 'textarea'):
|
for tag in cdataElements | rcdataElements:
|
||||||
for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
|
for x in (tag, tag.upper(), '\n' + tag, tag + ' id="xxx" '):
|
||||||
root = parse_function(markup.format(x))
|
root = parse_function(markup.format(x))
|
||||||
test.assertEqual(
|
test.assertEqual(
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -10,8 +10,14 @@ import html5lib
|
|||||||
f = open("my_document.html")
|
f = open("my_document.html")
|
||||||
tree = html5lib.parse(f)
|
tree = html5lib.parse(f)
|
||||||
"""
|
"""
|
||||||
__version__ = "0.90"
|
|
||||||
from html5parser import HTMLParser, parse, parseFragment
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
from treebuilders import getTreeBuilder
|
|
||||||
from treewalkers import getTreeWalker
|
from .html5parser import HTMLParser, parse, parseFragment
|
||||||
from serializer import serialize
|
from .treebuilders import getTreeBuilder
|
||||||
|
from .treewalkers import getTreeWalker
|
||||||
|
from .serializer import serialize
|
||||||
|
|
||||||
|
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||||
|
"getTreeWalker", "serialize"]
|
||||||
|
__version__ = "0.999-dev"
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,3 +1,5 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
|
||||||
class Filter(object):
|
class Filter(object):
|
||||||
def __init__(self, source):
|
def __init__(self, source):
|
||||||
|
20
src/html5lib/filters/alphabeticalattributes.py
Normal file
20
src/html5lib/filters/alphabeticalattributes.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from . import _base
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections import OrderedDict
|
||||||
|
except ImportError:
|
||||||
|
from ordereddict import OrderedDict
|
||||||
|
|
||||||
|
|
||||||
|
class Filter(_base.Filter):
|
||||||
|
def __iter__(self):
|
||||||
|
for token in _base.Filter.__iter__(self):
|
||||||
|
if token["type"] in ("StartTag", "EmptyTag"):
|
||||||
|
attrs = OrderedDict()
|
||||||
|
for name, value in sorted(token["data"].items(),
|
||||||
|
key=lambda x: x[0]):
|
||||||
|
attrs[name] = value
|
||||||
|
token["data"] = attrs
|
||||||
|
yield token
|
@ -1,127 +0,0 @@
|
|||||||
#
|
|
||||||
# The goal is to finally have a form filler where you pass data for
|
|
||||||
# each form, using the algorithm for "Seeding a form with initial values"
|
|
||||||
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
|
|
||||||
#
|
|
||||||
|
|
||||||
import _base
|
|
||||||
|
|
||||||
from html5lib.constants import spaceCharacters
|
|
||||||
spaceCharacters = u"".join(spaceCharacters)
|
|
||||||
|
|
||||||
class SimpleFilter(_base.Filter):
|
|
||||||
def __init__(self, source, fieldStorage):
|
|
||||||
_base.Filter.__init__(self, source)
|
|
||||||
self.fieldStorage = fieldStorage
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
field_indices = {}
|
|
||||||
state = None
|
|
||||||
field_name = None
|
|
||||||
for token in _base.Filter.__iter__(self):
|
|
||||||
type = token["type"]
|
|
||||||
if type in ("StartTag", "EmptyTag"):
|
|
||||||
name = token["name"].lower()
|
|
||||||
if name == "input":
|
|
||||||
field_name = None
|
|
||||||
field_type = None
|
|
||||||
input_value_index = -1
|
|
||||||
input_checked_index = -1
|
|
||||||
for i,(n,v) in enumerate(token["data"]):
|
|
||||||
n = n.lower()
|
|
||||||
if n == u"name":
|
|
||||||
field_name = v.strip(spaceCharacters)
|
|
||||||
elif n == u"type":
|
|
||||||
field_type = v.strip(spaceCharacters)
|
|
||||||
elif n == u"checked":
|
|
||||||
input_checked_index = i
|
|
||||||
elif n == u"value":
|
|
||||||
input_value_index = i
|
|
||||||
|
|
||||||
value_list = self.fieldStorage.getlist(field_name)
|
|
||||||
field_index = field_indices.setdefault(field_name, 0)
|
|
||||||
if field_index < len(value_list):
|
|
||||||
value = value_list[field_index]
|
|
||||||
else:
|
|
||||||
value = ""
|
|
||||||
|
|
||||||
if field_type in (u"checkbox", u"radio"):
|
|
||||||
if value_list:
|
|
||||||
if token["data"][input_value_index][1] == value:
|
|
||||||
if input_checked_index < 0:
|
|
||||||
token["data"].append((u"checked", u""))
|
|
||||||
field_indices[field_name] = field_index + 1
|
|
||||||
elif input_checked_index >= 0:
|
|
||||||
del token["data"][input_checked_index]
|
|
||||||
|
|
||||||
elif field_type not in (u"button", u"submit", u"reset"):
|
|
||||||
if input_value_index >= 0:
|
|
||||||
token["data"][input_value_index] = (u"value", value)
|
|
||||||
else:
|
|
||||||
token["data"].append((u"value", value))
|
|
||||||
field_indices[field_name] = field_index + 1
|
|
||||||
|
|
||||||
field_type = None
|
|
||||||
field_name = None
|
|
||||||
|
|
||||||
elif name == "textarea":
|
|
||||||
field_type = "textarea"
|
|
||||||
field_name = dict((token["data"])[::-1])["name"]
|
|
||||||
|
|
||||||
elif name == "select":
|
|
||||||
field_type = "select"
|
|
||||||
attributes = dict(token["data"][::-1])
|
|
||||||
field_name = attributes.get("name")
|
|
||||||
is_select_multiple = "multiple" in attributes
|
|
||||||
is_selected_option_found = False
|
|
||||||
|
|
||||||
elif field_type == "select" and field_name and name == "option":
|
|
||||||
option_selected_index = -1
|
|
||||||
option_value = None
|
|
||||||
for i,(n,v) in enumerate(token["data"]):
|
|
||||||
n = n.lower()
|
|
||||||
if n == "selected":
|
|
||||||
option_selected_index = i
|
|
||||||
elif n == "value":
|
|
||||||
option_value = v.strip(spaceCharacters)
|
|
||||||
if option_value is None:
|
|
||||||
raise NotImplementedError("<option>s without a value= attribute")
|
|
||||||
else:
|
|
||||||
value_list = self.fieldStorage.getlist(field_name)
|
|
||||||
if value_list:
|
|
||||||
field_index = field_indices.setdefault(field_name, 0)
|
|
||||||
if field_index < len(value_list):
|
|
||||||
value = value_list[field_index]
|
|
||||||
else:
|
|
||||||
value = ""
|
|
||||||
if (is_select_multiple or not is_selected_option_found) and option_value == value:
|
|
||||||
if option_selected_index < 0:
|
|
||||||
token["data"].append((u"selected", u""))
|
|
||||||
field_indices[field_name] = field_index + 1
|
|
||||||
is_selected_option_found = True
|
|
||||||
elif option_selected_index >= 0:
|
|
||||||
del token["data"][option_selected_index]
|
|
||||||
|
|
||||||
elif field_type is not None and field_name and type == "EndTag":
|
|
||||||
name = token["name"].lower()
|
|
||||||
if name == field_type:
|
|
||||||
if name == "textarea":
|
|
||||||
value_list = self.fieldStorage.getlist(field_name)
|
|
||||||
if value_list:
|
|
||||||
field_index = field_indices.setdefault(field_name, 0)
|
|
||||||
if field_index < len(value_list):
|
|
||||||
value = value_list[field_index]
|
|
||||||
else:
|
|
||||||
value = ""
|
|
||||||
yield {"type": "Characters", "data": value}
|
|
||||||
field_indices[field_name] = field_index + 1
|
|
||||||
|
|
||||||
field_name = None
|
|
||||||
|
|
||||||
elif name == "option" and field_type == "select":
|
|
||||||
pass # TODO: part of "option without value= attribute" processing
|
|
||||||
|
|
||||||
elif field_type == "textarea":
|
|
||||||
continue # ignore token
|
|
||||||
|
|
||||||
yield token
|
|
@ -1,4 +1,7 @@
|
|||||||
import _base
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from . import _base
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
class Filter(_base.Filter):
|
||||||
def __init__(self, source, encoding):
|
def __init__(self, source, encoding):
|
||||||
@ -20,19 +23,18 @@ class Filter(_base.Filter):
|
|||||||
if token["name"].lower() == "meta":
|
if token["name"].lower() == "meta":
|
||||||
# replace charset with actual encoding
|
# replace charset with actual encoding
|
||||||
has_http_equiv_content_type = False
|
has_http_equiv_content_type = False
|
||||||
content_index = -1
|
for (namespace, name), value in token["data"].items():
|
||||||
for i,(name,value) in enumerate(token["data"]):
|
if namespace is not None:
|
||||||
if name.lower() == 'charset':
|
continue
|
||||||
token["data"][i] = (u'charset', self.encoding)
|
elif name.lower() == 'charset':
|
||||||
|
token["data"][(namespace, name)] = self.encoding
|
||||||
meta_found = True
|
meta_found = True
|
||||||
break
|
break
|
||||||
elif name == 'http-equiv' and value.lower() == 'content-type':
|
elif name == 'http-equiv' and value.lower() == 'content-type':
|
||||||
has_http_equiv_content_type = True
|
has_http_equiv_content_type = True
|
||||||
elif name == 'content':
|
|
||||||
content_index = i
|
|
||||||
else:
|
else:
|
||||||
if has_http_equiv_content_type and content_index >= 0:
|
if has_http_equiv_content_type and (None, "content") in token["data"]:
|
||||||
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
|
token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
|
||||||
meta_found = True
|
meta_found = True
|
||||||
|
|
||||||
elif token["name"].lower() == "head" and not meta_found:
|
elif token["name"].lower() == "head" and not meta_found:
|
||||||
@ -40,7 +42,7 @@ class Filter(_base.Filter):
|
|||||||
yield {"type": "StartTag", "name": "head",
|
yield {"type": "StartTag", "name": "head",
|
||||||
"data": token["data"]}
|
"data": token["data"]}
|
||||||
yield {"type": "EmptyTag", "name": "meta",
|
yield {"type": "EmptyTag", "name": "meta",
|
||||||
"data": [["charset", self.encoding]]}
|
"data": {(None, "charset"): self.encoding}}
|
||||||
yield {"type": "EndTag", "name": "head"}
|
yield {"type": "EndTag", "name": "head"}
|
||||||
meta_found = True
|
meta_found = True
|
||||||
continue
|
continue
|
||||||
@ -51,7 +53,7 @@ class Filter(_base.Filter):
|
|||||||
yield pending.pop(0)
|
yield pending.pop(0)
|
||||||
if not meta_found:
|
if not meta_found:
|
||||||
yield {"type": "EmptyTag", "name": "meta",
|
yield {"type": "EmptyTag", "name": "meta",
|
||||||
"data": [["charset", self.encoding]]}
|
"data": {(None, "charset"): self.encoding}}
|
||||||
while pending:
|
while pending:
|
||||||
yield pending.pop(0)
|
yield pending.pop(0)
|
||||||
meta_found = True
|
meta_found = True
|
||||||
|
@ -1,13 +1,18 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from gettext import gettext
|
from gettext import gettext
|
||||||
_ = gettext
|
_ = gettext
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
from html5lib.constants import cdataElements, rcdataElements, voidElements
|
from ..constants import cdataElements, rcdataElements, voidElements
|
||||||
|
|
||||||
from html5lib.constants import spaceCharacters
|
from ..constants import spaceCharacters
|
||||||
spaceCharacters = u"".join(spaceCharacters)
|
spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
|
|
||||||
|
class LintError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
class LintError(Exception): pass
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
class Filter(_base.Filter):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
@ -18,24 +23,24 @@ class Filter(_base.Filter):
|
|||||||
if type in ("StartTag", "EmptyTag"):
|
if type in ("StartTag", "EmptyTag"):
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if contentModelFlag != "PCDATA":
|
if contentModelFlag != "PCDATA":
|
||||||
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
|
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
|
||||||
if not isinstance(name, unicode):
|
if not isinstance(name, str):
|
||||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_(u"Empty tag name"))
|
raise LintError(_("Empty tag name"))
|
||||||
if type == "StartTag" and name in voidElements:
|
if type == "StartTag" and name in voidElements:
|
||||||
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
|
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
|
||||||
elif type == "EmptyTag" and name not in voidElements:
|
elif type == "EmptyTag" and name not in voidElements:
|
||||||
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
|
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
|
||||||
if type == "StartTag":
|
if type == "StartTag":
|
||||||
open_elements.append(name)
|
open_elements.append(name)
|
||||||
for name, value in token["data"]:
|
for name, value in token["data"]:
|
||||||
if not isinstance(name, unicode):
|
if not isinstance(name, str):
|
||||||
raise LintError(_("Attribute name is not a string: %r") % name)
|
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_(u"Empty attribute name"))
|
raise LintError(_("Empty attribute name"))
|
||||||
if not isinstance(value, unicode):
|
if not isinstance(value, str):
|
||||||
raise LintError(_("Attribute value is not a string: %r") % value)
|
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
|
||||||
if name in cdataElements:
|
if name in cdataElements:
|
||||||
contentModelFlag = "CDATA"
|
contentModelFlag = "CDATA"
|
||||||
elif name in rcdataElements:
|
elif name in rcdataElements:
|
||||||
@ -45,15 +50,15 @@ class Filter(_base.Filter):
|
|||||||
|
|
||||||
elif type == "EndTag":
|
elif type == "EndTag":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if not isinstance(name, unicode):
|
if not isinstance(name, str):
|
||||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||||
if not name:
|
if not name:
|
||||||
raise LintError(_(u"Empty tag name"))
|
raise LintError(_("Empty tag name"))
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
|
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
|
||||||
start_name = open_elements.pop()
|
start_name = open_elements.pop()
|
||||||
if start_name != name:
|
if start_name != name:
|
||||||
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
|
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
|
||||||
contentModelFlag = "PCDATA"
|
contentModelFlag = "PCDATA"
|
||||||
|
|
||||||
elif type == "Comment":
|
elif type == "Comment":
|
||||||
@ -62,27 +67,27 @@ class Filter(_base.Filter):
|
|||||||
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
data = token["data"]
|
data = token["data"]
|
||||||
if not isinstance(data, unicode):
|
if not isinstance(data, str):
|
||||||
raise LintError(_("Attribute name is not a string: %r") % data)
|
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
|
||||||
if not data:
|
if not data:
|
||||||
raise LintError(_(u"%s token with empty data") % type)
|
raise LintError(_("%(type)s token with empty data") % {"type": type})
|
||||||
if type == "SpaceCharacters":
|
if type == "SpaceCharacters":
|
||||||
data = data.strip(spaceCharacters)
|
data = data.strip(spaceCharacters)
|
||||||
if data:
|
if data:
|
||||||
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
|
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
|
||||||
|
|
||||||
elif type == "Doctype":
|
elif type == "Doctype":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
if contentModelFlag != "PCDATA":
|
if contentModelFlag != "PCDATA":
|
||||||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
|
||||||
if not isinstance(name, unicode):
|
if not isinstance(name, str):
|
||||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||||
# XXX: what to do with token["data"] ?
|
# XXX: what to do with token["data"] ?
|
||||||
|
|
||||||
elif type in ("ParseError", "SerializeError"):
|
elif type in ("ParseError", "SerializeError"):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise LintError(_(u"Unknown token type: %s") % type)
|
raise LintError(_("Unknown token type: %(type)s") % {"type": type})
|
||||||
|
|
||||||
yield token
|
yield token
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
import _base
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from . import _base
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
class Filter(_base.Filter):
|
||||||
def slider(self):
|
def slider(self):
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
import _base
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
from html5lib.sanitizer import HTMLSanitizerMixin
|
|
||||||
|
from . import _base
|
||||||
|
from ..sanitizer import HTMLSanitizerMixin
|
||||||
|
|
||||||
|
|
||||||
class Filter(_base.Filter, HTMLSanitizerMixin):
|
class Filter(_base.Filter, HTMLSanitizerMixin):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for token in _base.Filter.__iter__(self):
|
for token in _base.Filter.__iter__(self):
|
||||||
token = self.sanitize_token(token)
|
token = self.sanitize_token(token)
|
||||||
if token: yield token
|
if token:
|
||||||
|
yield token
|
||||||
|
@ -1,16 +1,13 @@
|
|||||||
try:
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
frozenset
|
|
||||||
except NameError:
|
|
||||||
# Import from the sets module for python 2.3
|
|
||||||
from sets import ImmutableSet as frozenset
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
from html5lib.constants import rcdataElements, spaceCharacters
|
from ..constants import rcdataElements, spaceCharacters
|
||||||
spaceCharacters = u"".join(spaceCharacters)
|
spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
|
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
|
||||||
|
|
||||||
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
|
|
||||||
|
|
||||||
class Filter(_base.Filter):
|
class Filter(_base.Filter):
|
||||||
|
|
||||||
@ -29,13 +26,13 @@ class Filter(_base.Filter):
|
|||||||
|
|
||||||
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
||||||
# Test on token["data"] above to not introduce spaces where there were not
|
# Test on token["data"] above to not introduce spaces where there were not
|
||||||
token["data"] = u" "
|
token["data"] = " "
|
||||||
|
|
||||||
elif not preserve and type == "Characters":
|
elif not preserve and type == "Characters":
|
||||||
token["data"] = collapse_spaces(token["data"])
|
token["data"] = collapse_spaces(token["data"])
|
||||||
|
|
||||||
yield token
|
yield token
|
||||||
|
|
||||||
|
|
||||||
def collapse_spaces(text):
|
def collapse_spaces(text):
|
||||||
return SPACES_REGEX.sub(' ', text)
|
return SPACES_REGEX.sub(' ', text)
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,14 +1,93 @@
|
|||||||
import re
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from .constants import DataLossWarning
|
||||||
|
|
||||||
|
baseChar = """
|
||||||
|
[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
|
||||||
|
[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
|
||||||
|
[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
|
||||||
|
[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
|
||||||
|
[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
|
||||||
|
[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
|
||||||
|
[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
|
||||||
|
[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
|
||||||
|
[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
|
||||||
|
[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
|
||||||
|
[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
|
||||||
|
[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
|
||||||
|
[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
|
||||||
|
[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
|
||||||
|
[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
|
||||||
|
[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
|
||||||
|
[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
|
||||||
|
[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
|
||||||
|
[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
|
||||||
|
[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
|
||||||
|
[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
|
||||||
|
[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
|
||||||
|
[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
|
||||||
|
[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
|
||||||
|
[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
|
||||||
|
[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
|
||||||
|
[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
|
||||||
|
[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
|
||||||
|
[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
|
||||||
|
[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
|
||||||
|
#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
|
||||||
|
#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
|
||||||
|
#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
|
||||||
|
[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
|
||||||
|
[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
|
||||||
|
#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
|
||||||
|
[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
|
||||||
|
[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
|
||||||
|
[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
|
||||||
|
[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
|
||||||
|
[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
|
||||||
|
#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
|
||||||
|
[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
|
||||||
|
[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
|
||||||
|
[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
|
||||||
|
[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||||
|
|
||||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||||
|
|
||||||
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
|
combiningCharacter = """
|
||||||
|
[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
|
||||||
|
[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
|
||||||
|
[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
|
||||||
|
[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
|
||||||
|
#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
|
||||||
|
[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
|
||||||
|
[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
|
||||||
|
#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
|
||||||
|
[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
|
||||||
|
[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
|
||||||
|
#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
|
||||||
|
[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
|
||||||
|
[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
|
||||||
|
[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
|
||||||
|
[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
|
||||||
|
[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
|
||||||
|
#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
|
||||||
|
[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
|
||||||
|
#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
|
||||||
|
[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
|
||||||
|
[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
|
||||||
|
#x3099 | #x309A"""
|
||||||
|
|
||||||
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
digit = """
|
||||||
|
[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
|
||||||
|
[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
|
||||||
|
[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
|
||||||
|
[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||||
|
|
||||||
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
extender = """
|
||||||
|
#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
|
||||||
|
#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||||
|
|
||||||
letter = " | ".join([baseChar, ideographic])
|
letter = " | ".join([baseChar, ideographic])
|
||||||
|
|
||||||
@ -20,6 +99,7 @@ nameFirst = " | ".join([letter, "_"])
|
|||||||
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
reChar = re.compile(r"#x([\d|A-F]{4,4})")
|
||||||
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
|
||||||
|
|
||||||
|
|
||||||
def charStringToList(chars):
|
def charStringToList(chars):
|
||||||
charRanges = [item.strip() for item in chars.split(" | ")]
|
charRanges = [item.strip() for item in chars.split(" | ")]
|
||||||
rv = []
|
rv = []
|
||||||
@ -40,6 +120,7 @@ def charStringToList(chars):
|
|||||||
rv = normaliseCharList(rv)
|
rv = normaliseCharList(rv)
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
|
|
||||||
def normaliseCharList(charList):
|
def normaliseCharList(charList):
|
||||||
charList = sorted(charList)
|
charList = sorted(charList)
|
||||||
for item in charList:
|
for item in charList:
|
||||||
@ -58,6 +139,7 @@ def normaliseCharList(charList):
|
|||||||
# We don't really support characters above the BMP :(
|
# We don't really support characters above the BMP :(
|
||||||
max_unicode = int("FFFF", 16)
|
max_unicode = int("FFFF", 16)
|
||||||
|
|
||||||
|
|
||||||
def missingRanges(charList):
|
def missingRanges(charList):
|
||||||
rv = []
|
rv = []
|
||||||
if charList[0] != 0:
|
if charList[0] != 0:
|
||||||
@ -68,42 +150,49 @@ def missingRanges(charList):
|
|||||||
rv.append([charList[-1][1] + 1, max_unicode])
|
rv.append([charList[-1][1] + 1, max_unicode])
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
|
|
||||||
def listToRegexpStr(charList):
|
def listToRegexpStr(charList):
|
||||||
rv = []
|
rv = []
|
||||||
for item in charList:
|
for item in charList:
|
||||||
if item[0] == item[1]:
|
if item[0] == item[1]:
|
||||||
rv.append(escapeRegexp(unichr(item[0])))
|
rv.append(escapeRegexp(chr(item[0])))
|
||||||
else:
|
else:
|
||||||
rv.append(escapeRegexp(unichr(item[0])) + "-" +
|
rv.append(escapeRegexp(chr(item[0])) + "-" +
|
||||||
escapeRegexp(unichr(item[1])))
|
escapeRegexp(chr(item[1])))
|
||||||
return "[%s]" % "".join(rv)
|
return "[%s]" % "".join(rv)
|
||||||
|
|
||||||
|
|
||||||
def hexToInt(hex_str):
|
def hexToInt(hex_str):
|
||||||
return int(hex_str, 16)
|
return int(hex_str, 16)
|
||||||
|
|
||||||
|
|
||||||
def escapeRegexp(string):
|
def escapeRegexp(string):
|
||||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||||
"[", "]", "|", "(", ")", "-")
|
"[", "]", "|", "(", ")", "-")
|
||||||
for char in specialCharacters:
|
for char in specialCharacters:
|
||||||
string = string.replace(char, "\\" + char)
|
string = string.replace(char, "\\" + char)
|
||||||
if char in string:
|
|
||||||
print string
|
|
||||||
|
|
||||||
return string
|
return string
|
||||||
|
|
||||||
# output from the above
|
# output from the above
|
||||||
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0
f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||||
|
|
||||||
|
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3
040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||||
|
|
||||||
|
# Simpler things
|
||||||
|
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
|
||||||
|
|
||||||
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
|
||||||
|
|
||||||
class InfosetFilter(object):
|
class InfosetFilter(object):
|
||||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||||
|
|
||||||
def __init__(self, replaceChars=None,
|
def __init__(self, replaceChars=None,
|
||||||
dropXmlnsLocalName=False,
|
dropXmlnsLocalName=False,
|
||||||
dropXmlnsAttrNs=False,
|
dropXmlnsAttrNs=False,
|
||||||
preventDoubleDashComments=False,
|
preventDoubleDashComments=False,
|
||||||
preventDashAtCommentEnd=False,
|
preventDashAtCommentEnd=False,
|
||||||
replaceFormFeedCharacters = True):
|
replaceFormFeedCharacters=True,
|
||||||
|
preventSingleQuotePubid=False):
|
||||||
|
|
||||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||||
@ -113,14 +202,17 @@ class InfosetFilter(object):
|
|||||||
|
|
||||||
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
self.replaceFormFeedCharacters = replaceFormFeedCharacters
|
||||||
|
|
||||||
|
self.preventSingleQuotePubid = preventSingleQuotePubid
|
||||||
|
|
||||||
self.replaceCache = {}
|
self.replaceCache = {}
|
||||||
|
|
||||||
def coerceAttribute(self, name, namespace=None):
|
def coerceAttribute(self, name, namespace=None):
|
||||||
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
|
||||||
#Need a datalosswarning here
|
warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
|
||||||
return None
|
return None
|
||||||
elif (self.dropXmlnsAttrNs and
|
elif (self.dropXmlnsAttrNs and
|
||||||
namespace == "http://www.w3.org/2000/xmlns/"):
|
namespace == "http://www.w3.org/2000/xmlns/"):
|
||||||
|
warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return self.toXmlName(name)
|
return self.toXmlName(name)
|
||||||
@ -131,20 +223,35 @@ class InfosetFilter(object):
|
|||||||
def coerceComment(self, data):
|
def coerceComment(self, data):
|
||||||
if self.preventDoubleDashComments:
|
if self.preventDoubleDashComments:
|
||||||
while "--" in data:
|
while "--" in data:
|
||||||
|
warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
|
||||||
data = data.replace("--", "- -")
|
data = data.replace("--", "- -")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def coerceCharacters(self, data):
|
def coerceCharacters(self, data):
|
||||||
if self.replaceFormFeedCharacters:
|
if self.replaceFormFeedCharacters:
|
||||||
|
for i in range(data.count("\x0C")):
|
||||||
|
warnings.warn("Text cannot contain U+000C", DataLossWarning)
|
||||||
data = data.replace("\x0C", " ")
|
data = data.replace("\x0C", " ")
|
||||||
# Other non-xml characters
|
# Other non-xml characters
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def coercePubid(self, data):
|
||||||
|
dataOutput = data
|
||||||
|
for char in nonPubidCharRegexp.findall(data):
|
||||||
|
warnings.warn("Coercing non-XML pubid", DataLossWarning)
|
||||||
|
replacement = self.getReplacementCharacter(char)
|
||||||
|
dataOutput = dataOutput.replace(char, replacement)
|
||||||
|
if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
|
||||||
|
warnings.warn("Pubid cannot contain single quote", DataLossWarning)
|
||||||
|
dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
|
||||||
|
return dataOutput
|
||||||
|
|
||||||
def toXmlName(self, name):
|
def toXmlName(self, name):
|
||||||
nameFirst = name[0]
|
nameFirst = name[0]
|
||||||
nameRest = name[1:]
|
nameRest = name[1:]
|
||||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||||
if m:
|
if m:
|
||||||
|
warnings.warn("Coercing non-XML name", DataLossWarning)
|
||||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||||
else:
|
else:
|
||||||
nameFirstOutput = nameFirst
|
nameFirstOutput = nameFirst
|
||||||
@ -152,6 +259,7 @@ class InfosetFilter(object):
|
|||||||
nameRestOutput = nameRest
|
nameRestOutput = nameRest
|
||||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||||
for char in replaceChars:
|
for char in replaceChars:
|
||||||
|
warnings.warn("Coercing non-XML name", DataLossWarning)
|
||||||
replacement = self.getReplacementCharacter(char)
|
replacement = self.getReplacementCharacter(char)
|
||||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||||
return nameFirstOutput + nameRestOutput
|
return nameFirstOutput + nameRestOutput
|
||||||
@ -169,9 +277,9 @@ class InfosetFilter(object):
|
|||||||
return name
|
return name
|
||||||
|
|
||||||
def escapeChar(self, char):
|
def escapeChar(self, char):
|
||||||
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
|
replacement = "U%05X" % ord(char)
|
||||||
self.replaceCache[char] = replacement
|
self.replaceCache[char] = replacement
|
||||||
return replacement
|
return replacement
|
||||||
|
|
||||||
def unescapeChar(self, charcode):
|
def unescapeChar(self, charcode):
|
||||||
return unichr(int(charcode[1:], 16))
|
return chr(int(charcode[1:], 16))
|
||||||
|
@ -1,18 +1,33 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
import re
|
import re
|
||||||
import types
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||||
from constants import encodings, ReparseException
|
from .constants import encodings, ReparseException
|
||||||
|
from . import utils
|
||||||
|
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
try:
|
||||||
|
from io import BytesIO
|
||||||
|
except ImportError:
|
||||||
|
BytesIO = StringIO
|
||||||
|
|
||||||
|
try:
|
||||||
|
from io import BufferedIOBase
|
||||||
|
except ImportError:
|
||||||
|
class BufferedIOBase(object):
|
||||||
|
pass
|
||||||
|
|
||||||
# Non-unicode versions of constants for use in the pre-parser
|
# Non-unicode versions of constants for use in the pre-parser
|
||||||
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
|
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
|
||||||
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
|
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||||
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
|
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
|
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||||
|
|
||||||
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||||
|
|
||||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||||
@ -22,12 +37,13 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
|||||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||||
0x10FFFE, 0x10FFFF])
|
0x10FFFE, 0x10FFFF])
|
||||||
|
|
||||||
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
||||||
|
|
||||||
# Cache for charsUntil()
|
# Cache for charsUntil()
|
||||||
charsUntilRegEx = {}
|
charsUntilRegEx = {}
|
||||||
|
|
||||||
class BufferedStream:
|
|
||||||
|
class BufferedStream(object):
|
||||||
"""Buffering for streams that do not have buffering of their own
|
"""Buffering for streams that do not have buffering of their own
|
||||||
|
|
||||||
The buffer is implemented as a list of chunks on the assumption that
|
The buffer is implemented as a list of chunks on the assumption that
|
||||||
@ -47,11 +63,11 @@ class BufferedStream:
|
|||||||
return pos
|
return pos
|
||||||
|
|
||||||
def seek(self, pos):
|
def seek(self, pos):
|
||||||
assert pos < self._bufferedBytes()
|
assert pos <= self._bufferedBytes()
|
||||||
offset = pos
|
offset = pos
|
||||||
i = 0
|
i = 0
|
||||||
while len(self.buffer[i]) < offset:
|
while len(self.buffer[i]) < offset:
|
||||||
offset -= pos
|
offset -= len(self.buffer[i])
|
||||||
i += 1
|
i += 1
|
||||||
self.position = [i, offset]
|
self.position = [i, offset]
|
||||||
|
|
||||||
@ -90,8 +106,7 @@ class BufferedStream:
|
|||||||
bytesToRead = len(bufferedData) - bufferOffset
|
bytesToRead = len(bufferedData) - bufferOffset
|
||||||
self.position = [bufferIndex, len(bufferedData)]
|
self.position = [bufferIndex, len(bufferedData)]
|
||||||
bufferIndex += 1
|
bufferIndex += 1
|
||||||
data = rv.append(bufferedData[bufferOffset:
|
rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
|
||||||
bufferOffset + bytesToRead])
|
|
||||||
remainingBytes -= bytesToRead
|
remainingBytes -= bytesToRead
|
||||||
|
|
||||||
bufferOffset = 0
|
bufferOffset = 0
|
||||||
@ -99,11 +114,25 @@ class BufferedStream:
|
|||||||
if remainingBytes:
|
if remainingBytes:
|
||||||
rv.append(self._readStream(remainingBytes))
|
rv.append(self._readStream(remainingBytes))
|
||||||
|
|
||||||
return "".join(rv)
|
return b"".join(rv)
|
||||||
|
|
||||||
|
|
||||||
|
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
||||||
|
if hasattr(source, "read"):
|
||||||
|
isUnicode = isinstance(source.read(0), text_type)
|
||||||
|
else:
|
||||||
|
isUnicode = isinstance(source, text_type)
|
||||||
|
|
||||||
class HTMLInputStream:
|
if isUnicode:
|
||||||
|
if encoding is not None:
|
||||||
|
raise TypeError("Cannot explicitly set an encoding with a unicode string")
|
||||||
|
|
||||||
|
return HTMLUnicodeInputStream(source)
|
||||||
|
else:
|
||||||
|
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLUnicodeInputStream(object):
|
||||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
This class takes care of character encoding and removing or replacing
|
This class takes care of character encoding and removing or replacing
|
||||||
@ -113,7 +142,7 @@ class HTMLInputStream:
|
|||||||
|
|
||||||
_defaultChunkSize = 10240
|
_defaultChunkSize = 10240
|
||||||
|
|
||||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
def __init__(self, source):
|
||||||
"""Initialises the HTMLInputStream.
|
"""Initialises the HTMLInputStream.
|
||||||
|
|
||||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
@ -131,41 +160,23 @@ class HTMLInputStream:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Craziness
|
# Craziness
|
||||||
if len(u"\U0010FFFF") == 1:
|
if len("\U0010FFFF") == 1:
|
||||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||||
|
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
|
||||||
else:
|
else:
|
||||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||||
|
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||||
|
|
||||||
# List of where new lines occur
|
# List of where new lines occur
|
||||||
self.newLines = [0]
|
self.newLines = [0]
|
||||||
|
|
||||||
self.charEncoding = (codecName(encoding), "certain")
|
self.charEncoding = ("utf-8", "certain")
|
||||||
|
self.dataStream = self.openStream(source)
|
||||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
|
||||||
# self.charEncoding as appropriate
|
|
||||||
self.rawStream = self.openStream(source)
|
|
||||||
|
|
||||||
# Encoding Information
|
|
||||||
#Number of bytes to use when looking for a meta element with
|
|
||||||
#encoding information
|
|
||||||
self.numBytesMeta = 512
|
|
||||||
#Number of bytes to use when using detecting encoding using chardet
|
|
||||||
self.numBytesChardet = 100
|
|
||||||
#Encoding to use if no other information can be found
|
|
||||||
self.defaultEncoding = "windows-1252"
|
|
||||||
|
|
||||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
|
||||||
if (self.charEncoding[0] is None):
|
|
||||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
|
||||||
|
|
||||||
|
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
self.chunk = ""
|
||||||
'replace')
|
|
||||||
|
|
||||||
self.chunk = u""
|
|
||||||
self.chunkSize = 0
|
self.chunkSize = 0
|
||||||
self.chunkOffset = 0
|
self.chunkOffset = 0
|
||||||
self.errors = []
|
self.errors = []
|
||||||
@ -175,8 +186,8 @@ class HTMLInputStream:
|
|||||||
# number of columns in the last line of the previous chunk
|
# number of columns in the last line of the previous chunk
|
||||||
self.prevNumCols = 0
|
self.prevNumCols = 0
|
||||||
|
|
||||||
#Flag to indicate we may have a CR LF broken across a data chunk
|
# Deal with CR LF and surrogates split over chunk boundaries
|
||||||
self._lastChunkEndsWithCR = False
|
self._bufferedCharacter = None
|
||||||
|
|
||||||
def openStream(self, source):
|
def openStream(self, source):
|
||||||
"""Produces a file object from source.
|
"""Produces a file object from source.
|
||||||
@ -188,122 +199,15 @@ class HTMLInputStream:
|
|||||||
if hasattr(source, 'read'):
|
if hasattr(source, 'read'):
|
||||||
stream = source
|
stream = source
|
||||||
else:
|
else:
|
||||||
# Otherwise treat source as a string and convert to a file object
|
stream = StringIO(source)
|
||||||
if isinstance(source, unicode):
|
|
||||||
source = source.encode('utf-8')
|
|
||||||
self.charEncoding = ("utf-8", "certain")
|
|
||||||
import cStringIO
|
|
||||||
stream = cStringIO.StringIO(str(source))
|
|
||||||
|
|
||||||
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
|
|
||||||
stream is sys.stdin):
|
|
||||||
stream = BufferedStream(stream)
|
|
||||||
|
|
||||||
return stream
|
return stream
|
||||||
|
|
||||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
|
||||||
#First look for a BOM
|
|
||||||
#This will also read past the BOM if present
|
|
||||||
encoding = self.detectBOM()
|
|
||||||
confidence = "certain"
|
|
||||||
#If there is no BOM need to look for meta elements with encoding
|
|
||||||
#information
|
|
||||||
if encoding is None and parseMeta:
|
|
||||||
encoding = self.detectEncodingMeta()
|
|
||||||
confidence = "tentative"
|
|
||||||
#Guess with chardet, if avaliable
|
|
||||||
if encoding is None and chardet:
|
|
||||||
confidence = "tentative"
|
|
||||||
try:
|
|
||||||
from chardet.universaldetector import UniversalDetector
|
|
||||||
buffers = []
|
|
||||||
detector = UniversalDetector()
|
|
||||||
while not detector.done:
|
|
||||||
buffer = self.rawStream.read(self.numBytesChardet)
|
|
||||||
if not buffer:
|
|
||||||
break
|
|
||||||
buffers.append(buffer)
|
|
||||||
detector.feed(buffer)
|
|
||||||
detector.close()
|
|
||||||
encoding = detector.result['encoding']
|
|
||||||
self.rawStream.seek(0)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
# If all else fails use the default encoding
|
|
||||||
if encoding is None:
|
|
||||||
confidence="tentative"
|
|
||||||
encoding = self.defaultEncoding
|
|
||||||
|
|
||||||
#Substitute for equivalent encodings:
|
|
||||||
encodingSub = {"iso-8859-1":"windows-1252"}
|
|
||||||
|
|
||||||
if encoding.lower() in encodingSub:
|
|
||||||
encoding = encodingSub[encoding.lower()]
|
|
||||||
|
|
||||||
return encoding, confidence
|
|
||||||
|
|
||||||
def changeEncoding(self, newEncoding):
|
|
||||||
newEncoding = codecName(newEncoding)
|
|
||||||
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
|
||||||
newEncoding = "utf-8"
|
|
||||||
if newEncoding is None:
|
|
||||||
return
|
|
||||||
elif newEncoding == self.charEncoding[0]:
|
|
||||||
self.charEncoding = (self.charEncoding[0], "certain")
|
|
||||||
else:
|
|
||||||
self.rawStream.seek(0)
|
|
||||||
self.reset()
|
|
||||||
self.charEncoding = (newEncoding, "certain")
|
|
||||||
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
|
|
||||||
|
|
||||||
def detectBOM(self):
|
|
||||||
"""Attempts to detect at BOM at the start of the stream. If
|
|
||||||
an encoding can be determined from the BOM return the name of the
|
|
||||||
encoding otherwise return None"""
|
|
||||||
bomDict = {
|
|
||||||
codecs.BOM_UTF8: 'utf-8',
|
|
||||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
|
||||||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
|
||||||
}
|
|
||||||
|
|
||||||
# Go to beginning of file and read in 4 bytes
|
|
||||||
string = self.rawStream.read(4)
|
|
||||||
|
|
||||||
# Try detecting the BOM using bytes from the string
|
|
||||||
encoding = bomDict.get(string[:3]) # UTF-8
|
|
||||||
seek = 3
|
|
||||||
if not encoding:
|
|
||||||
# Need to detect UTF-32 before UTF-16
|
|
||||||
encoding = bomDict.get(string) # UTF-32
|
|
||||||
seek = 4
|
|
||||||
if not encoding:
|
|
||||||
encoding = bomDict.get(string[:2]) # UTF-16
|
|
||||||
seek = 2
|
|
||||||
|
|
||||||
# Set the read position past the BOM if one was found, otherwise
|
|
||||||
# set it to the start of the stream
|
|
||||||
self.rawStream.seek(encoding and seek or 0)
|
|
||||||
|
|
||||||
return encoding
|
|
||||||
|
|
||||||
def detectEncodingMeta(self):
|
|
||||||
"""Report the encoding declared by the meta element
|
|
||||||
"""
|
|
||||||
buffer = self.rawStream.read(self.numBytesMeta)
|
|
||||||
parser = EncodingParser(buffer)
|
|
||||||
self.rawStream.seek(0)
|
|
||||||
encoding = parser.getEncoding()
|
|
||||||
|
|
||||||
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
|
||||||
encoding = "utf-8"
|
|
||||||
|
|
||||||
return encoding
|
|
||||||
|
|
||||||
def _position(self, offset):
|
def _position(self, offset):
|
||||||
chunk = self.chunk
|
chunk = self.chunk
|
||||||
nLines = chunk.count(u'\n', 0, offset)
|
nLines = chunk.count('\n', 0, offset)
|
||||||
positionLine = self.prevNumLines + nLines
|
positionLine = self.prevNumLines + nLines
|
||||||
lastLinePos = chunk.rfind(u'\n', 0, offset)
|
lastLinePos = chunk.rfind('\n', 0, offset)
|
||||||
if lastLinePos == -1:
|
if lastLinePos == -1:
|
||||||
positionColumn = self.prevNumCols + offset
|
positionColumn = self.prevNumCols + offset
|
||||||
else:
|
else:
|
||||||
@ -336,27 +240,34 @@ class HTMLInputStream:
|
|||||||
|
|
||||||
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
||||||
|
|
||||||
self.chunk = u""
|
self.chunk = ""
|
||||||
self.chunkSize = 0
|
self.chunkSize = 0
|
||||||
self.chunkOffset = 0
|
self.chunkOffset = 0
|
||||||
|
|
||||||
data = self.dataStream.read(chunkSize)
|
data = self.dataStream.read(chunkSize)
|
||||||
|
|
||||||
if not data:
|
# Deal with CR LF and surrogates broken across chunks
|
||||||
|
if self._bufferedCharacter:
|
||||||
|
data = self._bufferedCharacter + data
|
||||||
|
self._bufferedCharacter = None
|
||||||
|
elif not data:
|
||||||
|
# We have no more data, bye-bye stream
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
if len(data) > 1:
|
||||||
|
lastv = ord(data[-1])
|
||||||
|
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
|
||||||
|
self._bufferedCharacter = data[-1]
|
||||||
|
data = data[:-1]
|
||||||
|
|
||||||
self.reportCharacterErrors(data)
|
self.reportCharacterErrors(data)
|
||||||
|
|
||||||
data = data.replace(u"\u0000", u"\ufffd")
|
# Replace invalid characters
|
||||||
#Check for CR LF broken across chunks
|
# Note U+0000 is dealt with in the tokenizer
|
||||||
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
|
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||||
data = data[1:]
|
|
||||||
# Stop if the chunk is now empty
|
data = data.replace("\r\n", "\n")
|
||||||
if not data:
|
data = data.replace("\r", "\n")
|
||||||
return False
|
|
||||||
self._lastChunkEndsWithCR = data[-1] == u"\r"
|
|
||||||
data = data.replace(u"\r\n", u"\n")
|
|
||||||
data = data.replace(u"\r", u"\n")
|
|
||||||
|
|
||||||
self.chunk = data
|
self.chunk = data
|
||||||
self.chunkSize = len(data)
|
self.chunkSize = len(data)
|
||||||
@ -364,32 +275,22 @@ class HTMLInputStream:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def characterErrorsUCS4(self, data):
|
def characterErrorsUCS4(self, data):
|
||||||
for i in xrange(data.count(u"\u0000")):
|
for i in range(len(invalid_unicode_re.findall(data))):
|
||||||
self.errors.append("null-character")
|
|
||||||
for i in xrange(len(invalid_unicode_re.findall(data))):
|
|
||||||
self.errors.append("invalid-codepoint")
|
self.errors.append("invalid-codepoint")
|
||||||
|
|
||||||
def characterErrorsUCS2(self, data):
|
def characterErrorsUCS2(self, data):
|
||||||
# Someone picked the wrong compile option
|
# Someone picked the wrong compile option
|
||||||
# You lose
|
# You lose
|
||||||
for i in xrange(data.count(u"\u0000")):
|
|
||||||
self.errors.append("null-character")
|
|
||||||
skip = False
|
skip = False
|
||||||
import sys
|
|
||||||
for match in invalid_unicode_re.finditer(data):
|
for match in invalid_unicode_re.finditer(data):
|
||||||
if skip:
|
if skip:
|
||||||
continue
|
continue
|
||||||
codepoint = ord(match.group())
|
codepoint = ord(match.group())
|
||||||
pos = match.start()
|
pos = match.start()
|
||||||
# Pretty sure there should be endianness issues here
|
# Pretty sure there should be endianness issues here
|
||||||
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
|
if utils.isSurrogatePair(data[pos:pos + 2]):
|
||||||
pos < len(data) - 1 and
|
|
||||||
ord(data[pos + 1]) >= 0xDC00 and
|
|
||||||
ord(data[pos + 1]) <= 0xDFFF):
|
|
||||||
# We have a surrogate pair!
|
# We have a surrogate pair!
|
||||||
#From a perl manpage
|
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
||||||
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
|
|
||||||
(ord(data[pos + 1]) - 0xDC00))
|
|
||||||
if char_val in non_bmp_invalid_codepoints:
|
if char_val in non_bmp_invalid_codepoints:
|
||||||
self.errors.append("invalid-codepoint")
|
self.errors.append("invalid-codepoint")
|
||||||
skip = True
|
skip = True
|
||||||
@ -399,8 +300,6 @@ class HTMLInputStream:
|
|||||||
else:
|
else:
|
||||||
skip = False
|
skip = False
|
||||||
self.errors.append("invalid-codepoint")
|
self.errors.append("invalid-codepoint")
|
||||||
#This is still wrong if it is possible for a surrogate pair to break a
|
|
||||||
#chunk boundary
|
|
||||||
|
|
||||||
def charsUntil(self, characters, opposite=False):
|
def charsUntil(self, characters, opposite=False):
|
||||||
""" Returns a string of characters from the stream up to but not
|
""" Returns a string of characters from the stream up to but not
|
||||||
@ -416,10 +315,10 @@ class HTMLInputStream:
|
|||||||
if __debug__:
|
if __debug__:
|
||||||
for c in characters:
|
for c in characters:
|
||||||
assert(ord(c) < 128)
|
assert(ord(c) < 128)
|
||||||
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
|
regex = "".join(["\\x%02x" % ord(c) for c in characters])
|
||||||
if not opposite:
|
if not opposite:
|
||||||
regex = u"^%s" % regex
|
regex = "^%s" % regex
|
||||||
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
|
chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
|
||||||
|
|
||||||
rv = []
|
rv = []
|
||||||
|
|
||||||
@ -446,27 +345,12 @@ class HTMLInputStream:
|
|||||||
# Reached EOF
|
# Reached EOF
|
||||||
break
|
break
|
||||||
|
|
||||||
r = u"".join(rv)
|
r = "".join(rv)
|
||||||
return r
|
|
||||||
|
|
||||||
def charsUntilEOF(self):
|
|
||||||
""" Returns a string of characters from the stream up to EOF."""
|
|
||||||
|
|
||||||
rv = []
|
|
||||||
|
|
||||||
while True:
|
|
||||||
rv.append(self.chunk[self.chunkOffset:])
|
|
||||||
if not self.readChunk():
|
|
||||||
# Reached EOF
|
|
||||||
break
|
|
||||||
|
|
||||||
r = u"".join(rv)
|
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def unget(self, char):
|
def unget(self, char):
|
||||||
# Only one character is allowed to be ungotten at once - it must
|
# Only one character is allowed to be ungotten at once - it must
|
||||||
# be consumed again before any further call to unget
|
# be consumed again before any further call to unget
|
||||||
|
|
||||||
if char is not None:
|
if char is not None:
|
||||||
if self.chunkOffset == 0:
|
if self.chunkOffset == 0:
|
||||||
# unget is called quite rarely, so it's a good idea to do
|
# unget is called quite rarely, so it's a good idea to do
|
||||||
@ -480,12 +364,192 @@ class HTMLInputStream:
|
|||||||
self.chunkOffset -= 1
|
self.chunkOffset -= 1
|
||||||
assert self.chunk[self.chunkOffset] == char
|
assert self.chunk[self.chunkOffset] == char
|
||||||
|
|
||||||
class EncodingBytes(str):
|
|
||||||
|
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
||||||
|
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
|
This class takes care of character encoding and removing or replacing
|
||||||
|
incorrect byte-sequences and also provides column and line tracking.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||||
|
"""Initialises the HTMLInputStream.
|
||||||
|
|
||||||
|
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
|
for use by html5lib.
|
||||||
|
|
||||||
|
source can be either a file-object, local filename or a string.
|
||||||
|
|
||||||
|
The optional encoding parameter must be a string that indicates
|
||||||
|
the encoding. If specified, that encoding will be used,
|
||||||
|
regardless of any BOM or later declaration (such as in a meta
|
||||||
|
element)
|
||||||
|
|
||||||
|
parseMeta - Look for a <meta> element containing encoding information
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||||
|
# self.charEncoding as appropriate
|
||||||
|
self.rawStream = self.openStream(source)
|
||||||
|
|
||||||
|
HTMLUnicodeInputStream.__init__(self, self.rawStream)
|
||||||
|
|
||||||
|
self.charEncoding = (codecName(encoding), "certain")
|
||||||
|
|
||||||
|
# Encoding Information
|
||||||
|
# Number of bytes to use when looking for a meta element with
|
||||||
|
# encoding information
|
||||||
|
self.numBytesMeta = 512
|
||||||
|
# Number of bytes to use when using detecting encoding using chardet
|
||||||
|
self.numBytesChardet = 100
|
||||||
|
# Encoding to use if no other information can be found
|
||||||
|
self.defaultEncoding = "windows-1252"
|
||||||
|
|
||||||
|
# Detect encoding iff no explicit "transport level" encoding is supplied
|
||||||
|
if (self.charEncoding[0] is None):
|
||||||
|
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||||
|
|
||||||
|
# Call superclass
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
||||||
|
'replace')
|
||||||
|
HTMLUnicodeInputStream.reset(self)
|
||||||
|
|
||||||
|
def openStream(self, source):
|
||||||
|
"""Produces a file object from source.
|
||||||
|
|
||||||
|
source can be either a file object, local filename or a string.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Already a file object
|
||||||
|
if hasattr(source, 'read'):
|
||||||
|
stream = source
|
||||||
|
else:
|
||||||
|
stream = BytesIO(source)
|
||||||
|
|
||||||
|
try:
|
||||||
|
stream.seek(stream.tell())
|
||||||
|
except:
|
||||||
|
stream = BufferedStream(stream)
|
||||||
|
|
||||||
|
return stream
|
||||||
|
|
||||||
|
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||||
|
# First look for a BOM
|
||||||
|
# This will also read past the BOM if present
|
||||||
|
encoding = self.detectBOM()
|
||||||
|
confidence = "certain"
|
||||||
|
# If there is no BOM need to look for meta elements with encoding
|
||||||
|
# information
|
||||||
|
if encoding is None and parseMeta:
|
||||||
|
encoding = self.detectEncodingMeta()
|
||||||
|
confidence = "tentative"
|
||||||
|
# Guess with chardet, if avaliable
|
||||||
|
if encoding is None and chardet:
|
||||||
|
confidence = "tentative"
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
from charade.universaldetector import UniversalDetector
|
||||||
|
except ImportError:
|
||||||
|
from chardet.universaldetector import UniversalDetector
|
||||||
|
buffers = []
|
||||||
|
detector = UniversalDetector()
|
||||||
|
while not detector.done:
|
||||||
|
buffer = self.rawStream.read(self.numBytesChardet)
|
||||||
|
assert isinstance(buffer, bytes)
|
||||||
|
if not buffer:
|
||||||
|
break
|
||||||
|
buffers.append(buffer)
|
||||||
|
detector.feed(buffer)
|
||||||
|
detector.close()
|
||||||
|
encoding = detector.result['encoding']
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
# If all else fails use the default encoding
|
||||||
|
if encoding is None:
|
||||||
|
confidence = "tentative"
|
||||||
|
encoding = self.defaultEncoding
|
||||||
|
|
||||||
|
# Substitute for equivalent encodings:
|
||||||
|
encodingSub = {"iso-8859-1": "windows-1252"}
|
||||||
|
|
||||||
|
if encoding.lower() in encodingSub:
|
||||||
|
encoding = encodingSub[encoding.lower()]
|
||||||
|
|
||||||
|
return encoding, confidence
|
||||||
|
|
||||||
|
def changeEncoding(self, newEncoding):
|
||||||
|
assert self.charEncoding[1] != "certain"
|
||||||
|
newEncoding = codecName(newEncoding)
|
||||||
|
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||||
|
newEncoding = "utf-8"
|
||||||
|
if newEncoding is None:
|
||||||
|
return
|
||||||
|
elif newEncoding == self.charEncoding[0]:
|
||||||
|
self.charEncoding = (self.charEncoding[0], "certain")
|
||||||
|
else:
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
self.reset()
|
||||||
|
self.charEncoding = (newEncoding, "certain")
|
||||||
|
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
||||||
|
|
||||||
|
def detectBOM(self):
|
||||||
|
"""Attempts to detect at BOM at the start of the stream. If
|
||||||
|
an encoding can be determined from the BOM return the name of the
|
||||||
|
encoding otherwise return None"""
|
||||||
|
bomDict = {
|
||||||
|
codecs.BOM_UTF8: 'utf-8',
|
||||||
|
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||||
|
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Go to beginning of file and read in 4 bytes
|
||||||
|
string = self.rawStream.read(4)
|
||||||
|
assert isinstance(string, bytes)
|
||||||
|
|
||||||
|
# Try detecting the BOM using bytes from the string
|
||||||
|
encoding = bomDict.get(string[:3]) # UTF-8
|
||||||
|
seek = 3
|
||||||
|
if not encoding:
|
||||||
|
# Need to detect UTF-32 before UTF-16
|
||||||
|
encoding = bomDict.get(string) # UTF-32
|
||||||
|
seek = 4
|
||||||
|
if not encoding:
|
||||||
|
encoding = bomDict.get(string[:2]) # UTF-16
|
||||||
|
seek = 2
|
||||||
|
|
||||||
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
|
# set it to the start of the stream
|
||||||
|
self.rawStream.seek(encoding and seek or 0)
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def detectEncodingMeta(self):
|
||||||
|
"""Report the encoding declared by the meta element
|
||||||
|
"""
|
||||||
|
buffer = self.rawStream.read(self.numBytesMeta)
|
||||||
|
assert isinstance(buffer, bytes)
|
||||||
|
parser = EncodingParser(buffer)
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
encoding = parser.getEncoding()
|
||||||
|
|
||||||
|
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
|
||||||
|
class EncodingBytes(bytes):
|
||||||
"""String-like object with an associated position and various extra methods
|
"""String-like object with an associated position and various extra methods
|
||||||
If the position is ever greater than the string length then an exception is
|
If the position is ever greater than the string length then an exception is
|
||||||
raised"""
|
raised"""
|
||||||
def __new__(self, value):
|
def __new__(self, value):
|
||||||
return str.__new__(self, value.lower())
|
assert isinstance(value, bytes)
|
||||||
|
return bytes.__new__(self, value.lower())
|
||||||
|
|
||||||
def __init__(self, value):
|
def __init__(self, value):
|
||||||
self._position = -1
|
self._position = -1
|
||||||
@ -493,13 +557,17 @@ class EncodingBytes(str):
|
|||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def next(self):
|
def __next__(self):
|
||||||
p = self._position = self._position + 1
|
p = self._position = self._position + 1
|
||||||
if p >= len(self):
|
if p >= len(self):
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
elif p < 0:
|
elif p < 0:
|
||||||
raise TypeError
|
raise TypeError
|
||||||
return self[p]
|
return self[p:p + 1]
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
# Py2 compat
|
||||||
|
return self.__next__()
|
||||||
|
|
||||||
def previous(self):
|
def previous(self):
|
||||||
p = self._position
|
p = self._position
|
||||||
@ -508,7 +576,7 @@ class EncodingBytes(str):
|
|||||||
elif p < 0:
|
elif p < 0:
|
||||||
raise TypeError
|
raise TypeError
|
||||||
self._position = p = p - 1
|
self._position = p = p - 1
|
||||||
return self[p]
|
return self[p:p + 1]
|
||||||
|
|
||||||
def setPosition(self, position):
|
def setPosition(self, position):
|
||||||
if self._position >= len(self):
|
if self._position >= len(self):
|
||||||
@ -526,7 +594,7 @@ class EncodingBytes(str):
|
|||||||
position = property(getPosition, setPosition)
|
position = property(getPosition, setPosition)
|
||||||
|
|
||||||
def getCurrentByte(self):
|
def getCurrentByte(self):
|
||||||
return self[self.position]
|
return self[self.position:self.position + 1]
|
||||||
|
|
||||||
currentByte = property(getCurrentByte)
|
currentByte = property(getCurrentByte)
|
||||||
|
|
||||||
@ -534,7 +602,7 @@ class EncodingBytes(str):
|
|||||||
"""Skip past a list of characters"""
|
"""Skip past a list of characters"""
|
||||||
p = self.position # use property for the error-checking
|
p = self.position # use property for the error-checking
|
||||||
while p < len(self):
|
while p < len(self):
|
||||||
c = self[p]
|
c = self[p:p + 1]
|
||||||
if c not in chars:
|
if c not in chars:
|
||||||
self._position = p
|
self._position = p
|
||||||
return c
|
return c
|
||||||
@ -545,7 +613,7 @@ class EncodingBytes(str):
|
|||||||
def skipUntil(self, chars):
|
def skipUntil(self, chars):
|
||||||
p = self.position
|
p = self.position
|
||||||
while p < len(self):
|
while p < len(self):
|
||||||
c = self[p]
|
c = self[p:p + 1]
|
||||||
if c in chars:
|
if c in chars:
|
||||||
self._position = p
|
self._position = p
|
||||||
return c
|
return c
|
||||||
@ -577,6 +645,7 @@ class EncodingBytes(str):
|
|||||||
else:
|
else:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
|
|
||||||
class EncodingParser(object):
|
class EncodingParser(object):
|
||||||
"""Mini parser for detecting character encoding from meta elements"""
|
"""Mini parser for detecting character encoding from meta elements"""
|
||||||
|
|
||||||
@ -587,12 +656,12 @@ class EncodingParser(object):
|
|||||||
|
|
||||||
def getEncoding(self):
|
def getEncoding(self):
|
||||||
methodDispatch = (
|
methodDispatch = (
|
||||||
("<!--",self.handleComment),
|
(b"<!--", self.handleComment),
|
||||||
("<meta",self.handleMeta),
|
(b"<meta", self.handleMeta),
|
||||||
("</",self.handlePossibleEndTag),
|
(b"</", self.handlePossibleEndTag),
|
||||||
("<!",self.handleOther),
|
(b"<!", self.handleOther),
|
||||||
("<?",self.handleOther),
|
(b"<?", self.handleOther),
|
||||||
("<",self.handlePossibleStartTag))
|
(b"<", self.handlePossibleStartTag))
|
||||||
for byte in self.data:
|
for byte in self.data:
|
||||||
keepParsing = True
|
keepParsing = True
|
||||||
for key, method in methodDispatch:
|
for key, method in methodDispatch:
|
||||||
@ -610,38 +679,49 @@ class EncodingParser(object):
|
|||||||
|
|
||||||
def handleComment(self):
|
def handleComment(self):
|
||||||
"""Skip over comments"""
|
"""Skip over comments"""
|
||||||
return self.data.jumpTo("-->")
|
return self.data.jumpTo(b"-->")
|
||||||
|
|
||||||
def handleMeta(self):
|
def handleMeta(self):
|
||||||
if self.data.currentByte not in spaceCharactersBytes:
|
if self.data.currentByte not in spaceCharactersBytes:
|
||||||
# if we have <meta not followed by a space so just keep going
|
# if we have <meta not followed by a space so just keep going
|
||||||
return True
|
return True
|
||||||
# We have a valid meta element we want to search for attributes
|
# We have a valid meta element we want to search for attributes
|
||||||
|
hasPragma = False
|
||||||
|
pendingEncoding = None
|
||||||
while True:
|
while True:
|
||||||
# Try to find the next attribute after the current position
|
# Try to find the next attribute after the current position
|
||||||
attr = self.getAttribute()
|
attr = self.getAttribute()
|
||||||
if attr is None:
|
if attr is None:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
if attr[0] == "charset":
|
if attr[0] == b"http-equiv":
|
||||||
|
hasPragma = attr[1] == b"content-type"
|
||||||
|
if hasPragma and pendingEncoding is not None:
|
||||||
|
self.encoding = pendingEncoding
|
||||||
|
return False
|
||||||
|
elif attr[0] == b"charset":
|
||||||
tentativeEncoding = attr[1]
|
tentativeEncoding = attr[1]
|
||||||
codec = codecName(tentativeEncoding)
|
codec = codecName(tentativeEncoding)
|
||||||
if codec is not None:
|
if codec is not None:
|
||||||
self.encoding = codec
|
self.encoding = codec
|
||||||
return False
|
return False
|
||||||
elif attr[0] == "content":
|
elif attr[0] == b"content":
|
||||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||||
tentativeEncoding = contentParser.parse()
|
tentativeEncoding = contentParser.parse()
|
||||||
|
if tentativeEncoding is not None:
|
||||||
codec = codecName(tentativeEncoding)
|
codec = codecName(tentativeEncoding)
|
||||||
if codec is not None:
|
if codec is not None:
|
||||||
|
if hasPragma:
|
||||||
self.encoding = codec
|
self.encoding = codec
|
||||||
return False
|
return False
|
||||||
|
else:
|
||||||
|
pendingEncoding = codec
|
||||||
|
|
||||||
def handlePossibleStartTag(self):
|
def handlePossibleStartTag(self):
|
||||||
return self.handlePossibleTag(False)
|
return self.handlePossibleTag(False)
|
||||||
|
|
||||||
def handlePossibleEndTag(self):
|
def handlePossibleEndTag(self):
|
||||||
self.data.next()
|
next(self.data)
|
||||||
return self.handlePossibleTag(True)
|
return self.handlePossibleTag(True)
|
||||||
|
|
||||||
def handlePossibleTag(self, endTag):
|
def handlePossibleTag(self, endTag):
|
||||||
@ -656,7 +736,7 @@ class EncodingParser(object):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
c = data.skipUntil(spacesAngleBrackets)
|
c = data.skipUntil(spacesAngleBrackets)
|
||||||
if c == "<":
|
if c == b"<":
|
||||||
# return to the first step in the overall "two step" algorithm
|
# return to the first step in the overall "two step" algorithm
|
||||||
# reprocessing the < byte
|
# reprocessing the < byte
|
||||||
data.previous()
|
data.previous()
|
||||||
@ -668,66 +748,66 @@ class EncodingParser(object):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def handleOther(self):
|
def handleOther(self):
|
||||||
return self.data.jumpTo(">")
|
return self.data.jumpTo(b">")
|
||||||
|
|
||||||
def getAttribute(self):
|
def getAttribute(self):
|
||||||
"""Return a name,value pair for the next attribute in the stream,
|
"""Return a name,value pair for the next attribute in the stream,
|
||||||
if one is found, or None"""
|
if one is found, or None"""
|
||||||
data = self.data
|
data = self.data
|
||||||
# Step 1 (skip chars)
|
# Step 1 (skip chars)
|
||||||
c = data.skip(spaceCharactersBytes | frozenset("/"))
|
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
|
||||||
|
assert c is None or len(c) == 1
|
||||||
# Step 2
|
# Step 2
|
||||||
if c in (">", None):
|
if c in (b">", None):
|
||||||
return None
|
return None
|
||||||
# Step 3
|
# Step 3
|
||||||
attrName = []
|
attrName = []
|
||||||
attrValue = []
|
attrValue = []
|
||||||
# Step 4 attribute name
|
# Step 4 attribute name
|
||||||
while True:
|
while True:
|
||||||
if c == "=" and attrName:
|
if c == b"=" and attrName:
|
||||||
break
|
break
|
||||||
elif c in spaceCharactersBytes:
|
elif c in spaceCharactersBytes:
|
||||||
# Step 6!
|
# Step 6!
|
||||||
c = data.skip()
|
c = data.skip()
|
||||||
c = data.next()
|
|
||||||
break
|
break
|
||||||
elif c in ("/", ">"):
|
elif c in (b"/", b">"):
|
||||||
return "".join(attrName), ""
|
return b"".join(attrName), b""
|
||||||
elif c in asciiUppercaseBytes:
|
elif c in asciiUppercaseBytes:
|
||||||
attrName.append(c.lower())
|
attrName.append(c.lower())
|
||||||
elif c == None:
|
elif c is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
attrName.append(c)
|
attrName.append(c)
|
||||||
# Step 5
|
# Step 5
|
||||||
c = data.next()
|
c = next(data)
|
||||||
# Step 7
|
# Step 7
|
||||||
if c != "=":
|
if c != b"=":
|
||||||
data.previous()
|
data.previous()
|
||||||
return "".join(attrName), ""
|
return b"".join(attrName), b""
|
||||||
# Step 8
|
# Step 8
|
||||||
data.next()
|
next(data)
|
||||||
# Step 9
|
# Step 9
|
||||||
c = data.skip()
|
c = data.skip()
|
||||||
# Step 10
|
# Step 10
|
||||||
if c in ("'", '"'):
|
if c in (b"'", b'"'):
|
||||||
# 10.1
|
# 10.1
|
||||||
quoteChar = c
|
quoteChar = c
|
||||||
while True:
|
while True:
|
||||||
# 10.2
|
# 10.2
|
||||||
c = data.next()
|
c = next(data)
|
||||||
# 10.3
|
# 10.3
|
||||||
if c == quoteChar:
|
if c == quoteChar:
|
||||||
data.next()
|
next(data)
|
||||||
return "".join(attrName), "".join(attrValue)
|
return b"".join(attrName), b"".join(attrValue)
|
||||||
# 10.4
|
# 10.4
|
||||||
elif c in asciiUppercaseBytes:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.append(c.lower())
|
attrValue.append(c.lower())
|
||||||
# 10.5
|
# 10.5
|
||||||
else:
|
else:
|
||||||
attrValue.append(c)
|
attrValue.append(c)
|
||||||
elif c == ">":
|
elif c == b">":
|
||||||
return "".join(attrName), ""
|
return b"".join(attrName), b""
|
||||||
elif c in asciiUppercaseBytes:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.append(c.lower())
|
attrValue.append(c.lower())
|
||||||
elif c is None:
|
elif c is None:
|
||||||
@ -736,9 +816,9 @@ class EncodingParser(object):
|
|||||||
attrValue.append(c)
|
attrValue.append(c)
|
||||||
# Step 11
|
# Step 11
|
||||||
while True:
|
while True:
|
||||||
c = data.next()
|
c = next(data)
|
||||||
if c in spacesAngleBrackets:
|
if c in spacesAngleBrackets:
|
||||||
return "".join(attrName), "".join(attrValue)
|
return b"".join(attrName), b"".join(attrValue)
|
||||||
elif c in asciiUppercaseBytes:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.append(c.lower())
|
attrValue.append(c.lower())
|
||||||
elif c is None:
|
elif c is None:
|
||||||
@ -749,21 +829,23 @@ class EncodingParser(object):
|
|||||||
|
|
||||||
class ContentAttrParser(object):
|
class ContentAttrParser(object):
|
||||||
def __init__(self, data):
|
def __init__(self, data):
|
||||||
|
assert isinstance(data, bytes)
|
||||||
self.data = data
|
self.data = data
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
try:
|
try:
|
||||||
# Check if the attr name is charset
|
# Check if the attr name is charset
|
||||||
# otherwise return
|
# otherwise return
|
||||||
self.data.jumpTo("charset")
|
self.data.jumpTo(b"charset")
|
||||||
self.data.position += 1
|
self.data.position += 1
|
||||||
self.data.skip()
|
self.data.skip()
|
||||||
if not self.data.currentByte == "=":
|
if not self.data.currentByte == b"=":
|
||||||
# If there is no = sign keep looking for attrs
|
# If there is no = sign keep looking for attrs
|
||||||
return None
|
return None
|
||||||
self.data.position += 1
|
self.data.position += 1
|
||||||
self.data.skip()
|
self.data.skip()
|
||||||
# Look for an encoding between matching quote marks
|
# Look for an encoding between matching quote marks
|
||||||
if self.data.currentByte in ('"', "'"):
|
if self.data.currentByte in (b'"', b"'"):
|
||||||
quoteMark = self.data.currentByte
|
quoteMark = self.data.currentByte
|
||||||
self.data.position += 1
|
self.data.position += 1
|
||||||
oldPosition = self.data.position
|
oldPosition = self.data.position
|
||||||
@ -787,7 +869,12 @@ class ContentAttrParser(object):
|
|||||||
def codecName(encoding):
|
def codecName(encoding):
|
||||||
"""Return the python codec name corresponding to an encoding or None if the
|
"""Return the python codec name corresponding to an encoding or None if the
|
||||||
string doesn't correspond to a valid encoding."""
|
string doesn't correspond to a valid encoding."""
|
||||||
if (encoding is not None and type(encoding) in types.StringTypes):
|
if isinstance(encoding, bytes):
|
||||||
|
try:
|
||||||
|
encoding = encoding.decode("ascii")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return None
|
||||||
|
if encoding:
|
||||||
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
||||||
return encodings.get(canonicalName, None)
|
return encodings.get(canonicalName, None)
|
||||||
else:
|
else:
|
||||||
|
@ -1,21 +1,28 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from xml.sax.saxutils import escape, unescape
|
from xml.sax.saxutils import escape, unescape
|
||||||
|
|
||||||
from tokenizer import HTMLTokenizer
|
from .tokenizer import HTMLTokenizer
|
||||||
from constants import tokenTypes
|
from .constants import tokenTypes
|
||||||
|
|
||||||
|
|
||||||
class HTMLSanitizerMixin(object):
|
class HTMLSanitizerMixin(object):
|
||||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||||
|
|
||||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
|
||||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
|
||||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
|
||||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
|
||||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
|
||||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
|
||||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
|
||||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
|
||||||
'ul', 'var']
|
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
|
||||||
|
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
|
||||||
|
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
|
||||||
|
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
|
||||||
|
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
|
||||||
|
|
||||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||||
@ -24,24 +31,35 @@ class HTMLSanitizerMixin(object):
|
|||||||
'munderover', 'none']
|
'munderover', 'none']
|
||||||
|
|
||||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
|
||||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||||
|
|
||||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
|
||||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
|
||||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
|
||||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
|
||||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
|
||||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
|
||||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
|
||||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
|
||||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
|
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
|
||||||
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
|
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
|
||||||
'xml:lang']
|
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
|
||||||
|
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
|
||||||
|
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
|
||||||
|
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
|
||||||
|
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
|
||||||
|
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
|
||||||
|
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
|
||||||
|
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
|
||||||
|
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
|
||||||
|
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
|
||||||
|
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
|
||||||
|
'width', 'wrap', 'xml:lang']
|
||||||
|
|
||||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||||
@ -56,41 +74,43 @@ class HTMLSanitizerMixin(object):
|
|||||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
|
||||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
|
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
|
||||||
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
|
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
|
||||||
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
|
||||||
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
|
||||||
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
|
||||||
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
|
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
|
||||||
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
|
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
|
||||||
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
|
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
|
||||||
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
'opacity', 'orient', 'origin', 'overline-position',
|
||||||
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
|
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||||
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
|
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
|
||||||
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
|
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
|
||||||
'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
|
||||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
|
||||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||||
'transform', 'type', 'u1', 'u2', 'underline-position',
|
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
|
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
|
||||||
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
|
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||||
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
'y1', 'y2', 'zoomAndPan']
|
||||||
|
|
||||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
|
||||||
'xlink:href', 'xml:base']
|
'xlink:href', 'xml:base']
|
||||||
|
|
||||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
|
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||||
|
'mask', 'stroke']
|
||||||
|
|
||||||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
|
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
|
||||||
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
|
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
|
||||||
'radialGradient', 'textpath', 'tref', 'set', 'use']
|
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
|
||||||
|
'set', 'use']
|
||||||
|
|
||||||
acceptable_css_properties = ['azimuth', 'background-color',
|
acceptable_css_properties = ['azimuth', 'background-color',
|
||||||
'border-bottom-color', 'border-collapse', 'border-color',
|
'border-bottom-color', 'border-collapse', 'border-color',
|
||||||
@ -140,20 +160,35 @@ class HTMLSanitizerMixin(object):
|
|||||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||||
# => <a>Click here for $100</a>
|
# => <a>Click here for $100</a>
|
||||||
def sanitize_token(self, token):
|
def sanitize_token(self, token):
|
||||||
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
|
||||||
|
# accommodate filters which use token_type differently
|
||||||
|
token_type = token["type"]
|
||||||
|
if token_type in list(tokenTypes.keys()):
|
||||||
|
token_type = tokenTypes[token_type]
|
||||||
|
|
||||||
|
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||||
tokenTypes["EmptyTag"]):
|
tokenTypes["EmptyTag"]):
|
||||||
if token["name"] in self.allowed_elements:
|
if token["name"] in self.allowed_elements:
|
||||||
if token.has_key("data"):
|
return self.allowed_token(token, token_type)
|
||||||
|
else:
|
||||||
|
return self.disallowed_token(token, token_type)
|
||||||
|
elif token_type == tokenTypes["Comment"]:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
return token
|
||||||
|
|
||||||
|
def allowed_token(self, token, token_type):
|
||||||
|
if "data" in token:
|
||||||
attrs = dict([(name, val) for name, val in
|
attrs = dict([(name, val) for name, val in
|
||||||
token["data"][::-1]
|
token["data"][::-1]
|
||||||
if name in self.allowed_attributes])
|
if name in self.allowed_attributes])
|
||||||
for attr in self.attr_val_is_uri:
|
for attr in self.attr_val_is_uri:
|
||||||
if not attrs.has_key(attr):
|
if attr not in attrs:
|
||||||
continue
|
continue
|
||||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||||
unescape(attrs[attr])).lower()
|
unescape(attrs[attr])).lower()
|
||||||
# remove replacement characters from unescaped characters
|
# remove replacement characters from unescaped characters
|
||||||
val_unescaped = val_unescaped.replace(u"\ufffd", "")
|
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
|
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
|
||||||
(val_unescaped.split(':')[0] not in
|
(val_unescaped.split(':')[0] not in
|
||||||
self.allowed_protocols)):
|
self.allowed_protocols)):
|
||||||
@ -167,26 +202,28 @@ class HTMLSanitizerMixin(object):
|
|||||||
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
||||||
attrs['xlink:href'])):
|
attrs['xlink:href'])):
|
||||||
del attrs['xlink:href']
|
del attrs['xlink:href']
|
||||||
if attrs.has_key('style'):
|
if 'style' in attrs:
|
||||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
token["data"] = [[name, val] for name, val in list(attrs.items())]
|
||||||
return token
|
return token
|
||||||
else:
|
|
||||||
if token["type"] == tokenTypes["EndTag"]:
|
def disallowed_token(self, token, token_type):
|
||||||
|
if token_type == tokenTypes["EndTag"]:
|
||||||
token["data"] = "</%s>" % token["name"]
|
token["data"] = "</%s>" % token["name"]
|
||||||
elif token["data"]:
|
elif token["data"]:
|
||||||
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
|
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
|
||||||
token["data"] = "<%s%s>" % (token["name"], attrs)
|
token["data"] = "<%s%s>" % (token["name"], attrs)
|
||||||
else:
|
else:
|
||||||
token["data"] = "<%s>" % token["name"]
|
token["data"] = "<%s>" % token["name"]
|
||||||
if token["selfClosing"]:
|
if token.get("selfClosing"):
|
||||||
token["data"] = token["data"][:-1] + "/>"
|
token["data"] = token["data"][:-1] + "/>"
|
||||||
token["type"] = tokenTypes["Characters"]
|
|
||||||
del token["name"]
|
if token["type"] in list(tokenTypes.keys()):
|
||||||
return token
|
token["type"] = "Characters"
|
||||||
elif token["type"] == tokenTypes["Comment"]:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
|
token["type"] = tokenTypes["Characters"]
|
||||||
|
|
||||||
|
del token["name"]
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def sanitize_css(self, style):
|
def sanitize_css(self, style):
|
||||||
@ -194,12 +231,15 @@ class HTMLSanitizerMixin(object):
|
|||||||
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||||
|
|
||||||
# gauntlet
|
# gauntlet
|
||||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
||||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
|
return ''
|
||||||
|
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||||
|
return ''
|
||||||
|
|
||||||
clean = []
|
clean = []
|
||||||
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
|
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
|
||||||
if not value: continue
|
if not value:
|
||||||
|
continue
|
||||||
if prop.lower() in self.allowed_css_properties:
|
if prop.lower() in self.allowed_css_properties:
|
||||||
clean.append(prop + ': ' + value + ';')
|
clean.append(prop + ': ' + value + ';')
|
||||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||||
@ -215,13 +255,14 @@ class HTMLSanitizerMixin(object):
|
|||||||
|
|
||||||
return ' '.join(clean)
|
return ' '.join(clean)
|
||||||
|
|
||||||
|
|
||||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||||
lowercaseElementName=False, lowercaseAttrName=False):
|
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
|
||||||
# Change case matching defaults as we only output lowercase html anyway
|
# Change case matching defaults as we only output lowercase html anyway
|
||||||
# This solution doesn't seem ideal...
|
# This solution doesn't seem ideal...
|
||||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||||
lowercaseElementName, lowercaseAttrName)
|
lowercaseElementName, lowercaseAttrName, parser=parser)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for token in HTMLTokenizer.__iter__(self):
|
for token in HTMLTokenizer.__iter__(self):
|
||||||
|
@ -1,17 +1,16 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from html5lib import treewalkers
|
from .. import treewalkers
|
||||||
|
|
||||||
from htmlserializer import HTMLSerializer
|
from .htmlserializer import HTMLSerializer
|
||||||
from xhtmlserializer import XHTMLSerializer
|
|
||||||
|
|
||||||
def serialize(input, tree="simpletree", format="html", encoding=None,
|
|
||||||
|
def serialize(input, tree="etree", format="html", encoding=None,
|
||||||
**serializer_opts):
|
**serializer_opts):
|
||||||
# XXX: Should we cache this?
|
# XXX: Should we cache this?
|
||||||
walker = treewalkers.getTreeWalker(tree)
|
walker = treewalkers.getTreeWalker(tree)
|
||||||
if format == "html":
|
if format == "html":
|
||||||
s = HTMLSerializer(**serializer_opts)
|
s = HTMLSerializer(**serializer_opts)
|
||||||
elif format == "xhtml":
|
|
||||||
s = XHTMLSerializer(**serializer_opts)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError, "type must be either html or xhtml"
|
raise ValueError("type must be html")
|
||||||
return s.render(walker(input), encoding)
|
return s.render(walker(input), encoding)
|
||||||
|
@ -1,18 +1,20 @@
|
|||||||
try:
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
frozenset
|
from six import text_type
|
||||||
except NameError:
|
|
||||||
# Import from the sets module for python 2.3
|
|
||||||
from sets import ImmutableSet as frozenset
|
|
||||||
|
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
try:
|
||||||
from html5lib.constants import rcdataElements
|
from functools import reduce
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
from ..constants import voidElements, booleanAttributes, spaceCharacters
|
||||||
|
from ..constants import rcdataElements, entities, xmlEntities
|
||||||
|
from .. import utils
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
spaceCharacters = u"".join(spaceCharacters)
|
spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from codecs import register_error, xmlcharrefreplace_errors
|
from codecs import register_error, xmlcharrefreplace_errors
|
||||||
@ -21,27 +23,48 @@ except ImportError:
|
|||||||
else:
|
else:
|
||||||
unicode_encode_errors = "htmlentityreplace"
|
unicode_encode_errors = "htmlentityreplace"
|
||||||
|
|
||||||
from html5lib.constants import entities
|
|
||||||
|
|
||||||
encode_entity_map = {}
|
encode_entity_map = {}
|
||||||
for k, v in entities.items():
|
is_ucs4 = len("\U0010FFFF") == 1
|
||||||
if v != "&" and encode_entity_map.get(v) != k.lower():
|
for k, v in list(entities.items()):
|
||||||
|
# skip multi-character entities
|
||||||
|
if ((is_ucs4 and len(v) > 1) or
|
||||||
|
(not is_ucs4 and len(v) > 2)):
|
||||||
|
continue
|
||||||
|
if v != "&":
|
||||||
|
if len(v) == 2:
|
||||||
|
v = utils.surrogatePairToCodepoint(v)
|
||||||
|
else:
|
||||||
|
v = ord(v)
|
||||||
|
if not v in encode_entity_map or k.islower():
|
||||||
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
|
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
|
||||||
encode_entity_map[v] = k
|
encode_entity_map[v] = k
|
||||||
|
|
||||||
def htmlentityreplace_errors(exc):
|
def htmlentityreplace_errors(exc):
|
||||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||||
res = []
|
res = []
|
||||||
for c in exc.object[exc.start:exc.end]:
|
codepoints = []
|
||||||
e = encode_entity_map.get(c)
|
skip = False
|
||||||
|
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||||
|
if skip:
|
||||||
|
skip = False
|
||||||
|
continue
|
||||||
|
index = i + exc.start
|
||||||
|
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
|
||||||
|
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
|
||||||
|
skip = True
|
||||||
|
else:
|
||||||
|
codepoint = ord(c)
|
||||||
|
codepoints.append(codepoint)
|
||||||
|
for cp in codepoints:
|
||||||
|
e = encode_entity_map.get(cp)
|
||||||
if e:
|
if e:
|
||||||
res.append("&")
|
res.append("&")
|
||||||
res.append(e)
|
res.append(e)
|
||||||
if not e.endswith(";"):
|
if not e.endswith(";"):
|
||||||
res.append(";")
|
res.append(";")
|
||||||
else:
|
else:
|
||||||
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
res.append("&#x%s;" % (hex(cp)[2:]))
|
||||||
return (u"".join(res), exc.end)
|
return ("".join(res), exc.end)
|
||||||
else:
|
else:
|
||||||
return xmlcharrefreplace_errors(exc)
|
return xmlcharrefreplace_errors(exc)
|
||||||
|
|
||||||
@ -49,125 +72,185 @@ else:
|
|||||||
|
|
||||||
del register_error
|
del register_error
|
||||||
|
|
||||||
def encode(text, encoding):
|
|
||||||
return text.encode(encoding, unicode_encode_errors)
|
|
||||||
|
|
||||||
class HTMLSerializer(object):
|
class HTMLSerializer(object):
|
||||||
|
|
||||||
|
# attribute quoting options
|
||||||
quote_attr_values = False
|
quote_attr_values = False
|
||||||
quote_char = '"'
|
quote_char = '"'
|
||||||
use_best_quote_char = True
|
use_best_quote_char = True
|
||||||
minimize_boolean_attributes = True
|
|
||||||
|
|
||||||
|
# tag syntax options
|
||||||
|
omit_optional_tags = True
|
||||||
|
minimize_boolean_attributes = True
|
||||||
use_trailing_solidus = False
|
use_trailing_solidus = False
|
||||||
space_before_trailing_solidus = True
|
space_before_trailing_solidus = True
|
||||||
|
|
||||||
|
# escaping options
|
||||||
escape_lt_in_attrs = False
|
escape_lt_in_attrs = False
|
||||||
escape_rcdata = False
|
escape_rcdata = False
|
||||||
|
resolve_entities = True
|
||||||
|
|
||||||
|
# miscellaneous options
|
||||||
|
alphabetical_attributes = False
|
||||||
inject_meta_charset = True
|
inject_meta_charset = True
|
||||||
strip_whitespace = False
|
strip_whitespace = False
|
||||||
sanitize = False
|
sanitize = False
|
||||||
omit_optional_tags = True
|
|
||||||
|
|
||||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
"omit_optional_tags", "minimize_boolean_attributes",
|
||||||
"space_before_trailing_solidus", "omit_optional_tags",
|
"use_trailing_solidus", "space_before_trailing_solidus",
|
||||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
"escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
|
||||||
"escape_rcdata", 'use_trailing_solidus', "sanitize")
|
"alphabetical_attributes", "inject_meta_charset",
|
||||||
|
"strip_whitespace", "sanitize")
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
if kwargs.has_key('quote_char'):
|
"""Initialize HTMLSerializer.
|
||||||
|
|
||||||
|
Keyword options (default given first unless specified) include:
|
||||||
|
|
||||||
|
inject_meta_charset=True|False
|
||||||
|
Whether to insert a meta element to define the character set of the
|
||||||
|
document.
|
||||||
|
quote_attr_values=True|False
|
||||||
|
Whether to quote attribute values that don't require quoting
|
||||||
|
per HTML5 parsing rules.
|
||||||
|
quote_char=u'"'|u"'"
|
||||||
|
Use given quote character for attribute quoting. Default is to
|
||||||
|
use double quote unless attribute value contains a double quote,
|
||||||
|
in which case single quotes are used instead.
|
||||||
|
escape_lt_in_attrs=False|True
|
||||||
|
Whether to escape < in attribute values.
|
||||||
|
escape_rcdata=False|True
|
||||||
|
Whether to escape characters that need to be escaped within normal
|
||||||
|
elements within rcdata elements such as style.
|
||||||
|
resolve_entities=True|False
|
||||||
|
Whether to resolve named character entities that appear in the
|
||||||
|
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
|
||||||
|
are unaffected by this setting.
|
||||||
|
strip_whitespace=False|True
|
||||||
|
Whether to remove semantically meaningless whitespace. (This
|
||||||
|
compresses all whitespace to a single space except within pre.)
|
||||||
|
minimize_boolean_attributes=True|False
|
||||||
|
Shortens boolean attributes to give just the attribute value,
|
||||||
|
for example <input disabled="disabled"> becomes <input disabled>.
|
||||||
|
use_trailing_solidus=False|True
|
||||||
|
Includes a close-tag slash at the end of the start tag of void
|
||||||
|
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
|
||||||
|
space_before_trailing_solidus=True|False
|
||||||
|
Places a space immediately before the closing slash in a tag
|
||||||
|
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
|
||||||
|
sanitize=False|True
|
||||||
|
Strip all unsafe or unknown constructs from output.
|
||||||
|
See `html5lib user documentation`_
|
||||||
|
omit_optional_tags=True|False
|
||||||
|
Omit start/end tags that are optional.
|
||||||
|
alphabetical_attributes=False|True
|
||||||
|
Reorder attributes to be in alphabetical order.
|
||||||
|
|
||||||
|
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
||||||
|
"""
|
||||||
|
if 'quote_char' in kwargs:
|
||||||
self.use_best_quote_char = False
|
self.use_best_quote_char = False
|
||||||
for attr in self.options:
|
for attr in self.options:
|
||||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||||
self.errors = []
|
self.errors = []
|
||||||
self.strict = False
|
self.strict = False
|
||||||
|
|
||||||
|
def encode(self, string):
|
||||||
|
assert(isinstance(string, text_type))
|
||||||
|
if self.encoding:
|
||||||
|
return string.encode(self.encoding, unicode_encode_errors)
|
||||||
|
else:
|
||||||
|
return string
|
||||||
|
|
||||||
|
def encodeStrict(self, string):
|
||||||
|
assert(isinstance(string, text_type))
|
||||||
|
if self.encoding:
|
||||||
|
return string.encode(self.encoding, "strict")
|
||||||
|
else:
|
||||||
|
return string
|
||||||
|
|
||||||
def serialize(self, treewalker, encoding=None):
|
def serialize(self, treewalker, encoding=None):
|
||||||
|
self.encoding = encoding
|
||||||
in_cdata = False
|
in_cdata = False
|
||||||
self.errors = []
|
self.errors = []
|
||||||
|
|
||||||
if encoding and self.inject_meta_charset:
|
if encoding and self.inject_meta_charset:
|
||||||
from html5lib.filters.inject_meta_charset import Filter
|
from ..filters.inject_meta_charset import Filter
|
||||||
treewalker = Filter(treewalker, encoding)
|
treewalker = Filter(treewalker, encoding)
|
||||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
# WhitespaceFilter should be used before OptionalTagFilter
|
||||||
# for maximum efficiency of this latter filter
|
# for maximum efficiency of this latter filter
|
||||||
if self.strip_whitespace:
|
if self.strip_whitespace:
|
||||||
from html5lib.filters.whitespace import Filter
|
from ..filters.whitespace import Filter
|
||||||
treewalker = Filter(treewalker)
|
treewalker = Filter(treewalker)
|
||||||
if self.sanitize:
|
if self.sanitize:
|
||||||
from html5lib.filters.sanitizer import Filter
|
from ..filters.sanitizer import Filter
|
||||||
treewalker = Filter(treewalker)
|
treewalker = Filter(treewalker)
|
||||||
if self.omit_optional_tags:
|
if self.omit_optional_tags:
|
||||||
from html5lib.filters.optionaltags import Filter
|
from ..filters.optionaltags import Filter
|
||||||
treewalker = Filter(treewalker)
|
treewalker = Filter(treewalker)
|
||||||
|
# Alphabetical attributes must be last, as other filters
|
||||||
|
# could add attributes and alter the order
|
||||||
|
if self.alphabetical_attributes:
|
||||||
|
from ..filters.alphabeticalattributes import Filter
|
||||||
|
treewalker = Filter(treewalker)
|
||||||
|
|
||||||
for token in treewalker:
|
for token in treewalker:
|
||||||
type = token["type"]
|
type = token["type"]
|
||||||
if type == "Doctype":
|
if type == "Doctype":
|
||||||
doctype = u"<!DOCTYPE %s" % token["name"]
|
doctype = "<!DOCTYPE %s" % token["name"]
|
||||||
|
|
||||||
if token["publicId"]:
|
if token["publicId"]:
|
||||||
doctype += u' PUBLIC "%s"' % token["publicId"]
|
doctype += ' PUBLIC "%s"' % token["publicId"]
|
||||||
elif token["systemId"]:
|
elif token["systemId"]:
|
||||||
doctype += u" SYSTEM"
|
doctype += " SYSTEM"
|
||||||
if token["systemId"]:
|
if token["systemId"]:
|
||||||
if token["systemId"].find(u'"') >= 0:
|
if token["systemId"].find('"') >= 0:
|
||||||
if token["systemId"].find(u"'") >= 0:
|
if token["systemId"].find("'") >= 0:
|
||||||
self.serializeError(_("System identifer contains both single and double quote characters"))
|
self.serializeError(_("System identifer contains both single and double quote characters"))
|
||||||
quote_char = u"'"
|
quote_char = "'"
|
||||||
else:
|
else:
|
||||||
quote_char = u'"'
|
quote_char = '"'
|
||||||
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
|
doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
|
||||||
|
|
||||||
doctype += u">"
|
doctype += ">"
|
||||||
|
yield self.encodeStrict(doctype)
|
||||||
if encoding:
|
|
||||||
yield doctype.encode(encoding)
|
|
||||||
else:
|
|
||||||
yield doctype
|
|
||||||
|
|
||||||
elif type in ("Characters", "SpaceCharacters"):
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
if type == "SpaceCharacters" or in_cdata:
|
if type == "SpaceCharacters" or in_cdata:
|
||||||
if in_cdata and token["data"].find("</") >= 0:
|
if in_cdata and token["data"].find("</") >= 0:
|
||||||
self.serializeError(_("Unexpected </ in CDATA"))
|
self.serializeError(_("Unexpected </ in CDATA"))
|
||||||
if encoding:
|
yield self.encode(token["data"])
|
||||||
yield token["data"].encode(encoding, "strict")
|
|
||||||
else:
|
else:
|
||||||
yield token["data"]
|
yield self.encode(escape(token["data"]))
|
||||||
elif encoding:
|
|
||||||
yield encode(escape(token["data"]), encoding)
|
|
||||||
else:
|
|
||||||
yield escape(token["data"])
|
|
||||||
|
|
||||||
elif type in ("StartTag", "EmptyTag"):
|
elif type in ("StartTag", "EmptyTag"):
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
|
yield self.encodeStrict("<%s" % name)
|
||||||
if name in rcdataElements and not self.escape_rcdata:
|
if name in rcdataElements and not self.escape_rcdata:
|
||||||
in_cdata = True
|
in_cdata = True
|
||||||
elif in_cdata:
|
elif in_cdata:
|
||||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||||
attrs = token["data"]
|
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
||||||
if hasattr(attrs, "items"):
|
# TODO: Add namespace support here
|
||||||
attrs = attrs.items()
|
k = attr_name
|
||||||
attrs.sort()
|
v = attr_value
|
||||||
attributes = []
|
yield self.encodeStrict(' ')
|
||||||
for k,v in attrs:
|
|
||||||
if encoding:
|
|
||||||
k = k.encode(encoding, "strict")
|
|
||||||
attributes.append(' ')
|
|
||||||
|
|
||||||
attributes.append(k)
|
yield self.encodeStrict(k)
|
||||||
if not self.minimize_boolean_attributes or \
|
if not self.minimize_boolean_attributes or \
|
||||||
(k not in booleanAttributes.get(name, tuple()) \
|
(k not in booleanAttributes.get(name, tuple())
|
||||||
and k not in booleanAttributes.get("", tuple())):
|
and k not in booleanAttributes.get("", tuple())):
|
||||||
attributes.append("=")
|
yield self.encodeStrict("=")
|
||||||
if self.quote_attr_values or not v:
|
if self.quote_attr_values or not v:
|
||||||
quote_attr = True
|
quote_attr = True
|
||||||
else:
|
else:
|
||||||
quote_attr = reduce(lambda x, y: x or (y in v),
|
quote_attr = reduce(lambda x, y: x or (y in v),
|
||||||
spaceCharacters + ">\"'=", False)
|
spaceCharacters + ">\"'=", False)
|
||||||
v = v.replace("&", "&")
|
v = v.replace("&", "&")
|
||||||
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
if self.escape_lt_in_attrs:
|
||||||
if encoding:
|
v = v.replace("<", "<")
|
||||||
v = encode(v, encoding)
|
|
||||||
if quote_attr:
|
if quote_attr:
|
||||||
quote_char = self.quote_char
|
quote_char = self.quote_char
|
||||||
if self.use_best_quote_char:
|
if self.use_best_quote_char:
|
||||||
@ -179,20 +262,17 @@ class HTMLSerializer(object):
|
|||||||
v = v.replace("'", "'")
|
v = v.replace("'", "'")
|
||||||
else:
|
else:
|
||||||
v = v.replace('"', """)
|
v = v.replace('"', """)
|
||||||
attributes.append(quote_char)
|
yield self.encodeStrict(quote_char)
|
||||||
attributes.append(v)
|
yield self.encode(v)
|
||||||
attributes.append(quote_char)
|
yield self.encodeStrict(quote_char)
|
||||||
else:
|
else:
|
||||||
attributes.append(v)
|
yield self.encode(v)
|
||||||
if name in voidElements and self.use_trailing_solidus:
|
if name in voidElements and self.use_trailing_solidus:
|
||||||
if self.space_before_trailing_solidus:
|
if self.space_before_trailing_solidus:
|
||||||
attributes.append(" /")
|
yield self.encodeStrict(" /")
|
||||||
else:
|
else:
|
||||||
attributes.append("/")
|
yield self.encodeStrict("/")
|
||||||
if encoding:
|
yield self.encode(">")
|
||||||
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
|
|
||||||
else:
|
|
||||||
yield u"<%s%s>" % (name, u"".join(attributes))
|
|
||||||
|
|
||||||
elif type == "EndTag":
|
elif type == "EndTag":
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
@ -200,28 +280,33 @@ class HTMLSerializer(object):
|
|||||||
in_cdata = False
|
in_cdata = False
|
||||||
elif in_cdata:
|
elif in_cdata:
|
||||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||||
end_tag = u"</%s>" % name
|
yield self.encodeStrict("</%s>" % name)
|
||||||
if encoding:
|
|
||||||
end_tag = end_tag.encode(encoding, "strict")
|
|
||||||
yield end_tag
|
|
||||||
|
|
||||||
elif type == "Comment":
|
elif type == "Comment":
|
||||||
data = token["data"]
|
data = token["data"]
|
||||||
if data.find("--") >= 0:
|
if data.find("--") >= 0:
|
||||||
self.serializeError(_("Comment contains --"))
|
self.serializeError(_("Comment contains --"))
|
||||||
comment = u"<!--%s-->" % token["data"]
|
yield self.encodeStrict("<!--%s-->" % token["data"])
|
||||||
if encoding:
|
|
||||||
comment = comment.encode(encoding, unicode_encode_errors)
|
elif type == "Entity":
|
||||||
yield comment
|
name = token["name"]
|
||||||
|
key = name + ";"
|
||||||
|
if not key in entities:
|
||||||
|
self.serializeError(_("Entity %s not recognized" % name))
|
||||||
|
if self.resolve_entities and key not in xmlEntities:
|
||||||
|
data = entities[key]
|
||||||
|
else:
|
||||||
|
data = "&%s;" % name
|
||||||
|
yield self.encodeStrict(data)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.serializeError(token["data"])
|
self.serializeError(token["data"])
|
||||||
|
|
||||||
def render(self, treewalker, encoding=None):
|
def render(self, treewalker, encoding=None):
|
||||||
if encoding:
|
if encoding:
|
||||||
return "".join(list(self.serialize(treewalker, encoding)))
|
return b"".join(list(self.serialize(treewalker, encoding)))
|
||||||
else:
|
else:
|
||||||
return u"".join(list(self.serialize(treewalker)))
|
return "".join(list(self.serialize(treewalker)))
|
||||||
|
|
||||||
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||||
# XXX The idea is to make data mandatory.
|
# XXX The idea is to make data mandatory.
|
||||||
@ -229,6 +314,7 @@ class HTMLSerializer(object):
|
|||||||
if self.strict:
|
if self.strict:
|
||||||
raise SerializeError
|
raise SerializeError
|
||||||
|
|
||||||
|
|
||||||
def SerializeError(Exception):
|
def SerializeError(Exception):
|
||||||
"""Error in serialized tree"""
|
"""Error in serialized tree"""
|
||||||
pass
|
pass
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
from htmlserializer import HTMLSerializer
|
|
||||||
|
|
||||||
class XHTMLSerializer(HTMLSerializer):
|
|
||||||
quote_attr_values = True
|
|
||||||
minimize_boolean_attributes = False
|
|
||||||
use_trailing_solidus = True
|
|
||||||
escape_lt_in_attrs = True
|
|
||||||
omit_optional_tags = False
|
|
||||||
escape_rcdata = True
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
0
src/html5lib/treeadapters/__init__.py
Normal file
0
src/html5lib/treeadapters/__init__.py
Normal file
44
src/html5lib/treeadapters/sax.py
Normal file
44
src/html5lib/treeadapters/sax.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from xml.sax.xmlreader import AttributesNSImpl
|
||||||
|
|
||||||
|
from ..constants import adjustForeignAttributes, unadjustForeignAttributes
|
||||||
|
|
||||||
|
prefix_mapping = {}
|
||||||
|
for prefix, localName, namespace in adjustForeignAttributes.values():
|
||||||
|
if prefix is not None:
|
||||||
|
prefix_mapping[prefix] = namespace
|
||||||
|
|
||||||
|
|
||||||
|
def to_sax(walker, handler):
|
||||||
|
"""Call SAX-like content handler based on treewalker walker"""
|
||||||
|
handler.startDocument()
|
||||||
|
for prefix, namespace in prefix_mapping.items():
|
||||||
|
handler.startPrefixMapping(prefix, namespace)
|
||||||
|
|
||||||
|
for token in walker:
|
||||||
|
type = token["type"]
|
||||||
|
if type == "Doctype":
|
||||||
|
continue
|
||||||
|
elif type in ("StartTag", "EmptyTag"):
|
||||||
|
attrs = AttributesNSImpl(token["data"],
|
||||||
|
unadjustForeignAttributes)
|
||||||
|
handler.startElementNS((token["namespace"], token["name"]),
|
||||||
|
token["name"],
|
||||||
|
attrs)
|
||||||
|
if type == "EmptyTag":
|
||||||
|
handler.endElementNS((token["namespace"], token["name"]),
|
||||||
|
token["name"])
|
||||||
|
elif type == "EndTag":
|
||||||
|
handler.endElementNS((token["namespace"], token["name"]),
|
||||||
|
token["name"])
|
||||||
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
|
handler.characters(token["data"])
|
||||||
|
elif type == "Comment":
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
assert False, "Unknown token type"
|
||||||
|
|
||||||
|
for prefix, namespace in prefix_mapping.items():
|
||||||
|
handler.endPrefixMapping(prefix)
|
||||||
|
handler.endDocument()
|
@ -7,7 +7,7 @@ implement several things:
|
|||||||
1) A set of classes for various types of elements: Document, Doctype,
|
1) A set of classes for various types of elements: Document, Doctype,
|
||||||
Comment, Element. These must implement the interface of
|
Comment, Element. These must implement the interface of
|
||||||
_base.treebuilders.Node (although comment nodes have a different
|
_base.treebuilders.Node (although comment nodes have a different
|
||||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
signature for their constructor, see treebuilders.etree.Comment)
|
||||||
Textual content may also be implemented as another node type, or not, as
|
Textual content may also be implemented as another node type, or not, as
|
||||||
your tree implementation requires.
|
your tree implementation requires.
|
||||||
|
|
||||||
@ -24,69 +24,53 @@ getDocument - Returns the root node of the complete document tree
|
|||||||
testSerializer method on your treebuilder which accepts a node and
|
testSerializer method on your treebuilder which accepts a node and
|
||||||
returns a string containing Node and its children serialized according
|
returns a string containing Node and its children serialized according
|
||||||
to the format used in the unittests
|
to the format used in the unittests
|
||||||
|
|
||||||
The supplied simpletree module provides a python-only implementation
|
|
||||||
of a full treebuilder and is a useful reference for the semantics of
|
|
||||||
the various methods.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from ..utils import default_etree
|
||||||
|
|
||||||
treeBuilderCache = {}
|
treeBuilderCache = {}
|
||||||
|
|
||||||
|
|
||||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
"""Get a TreeBuilder class for various types of tree with built-in support
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
treeType - the name of the tree type required (case-insensitive). Supported
|
||||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
values are:
|
||||||
|
|
||||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
|
||||||
more pythonic idioms.
|
|
||||||
"dom" - A generic builder for DOM implementations, defaulting to
|
"dom" - A generic builder for DOM implementations, defaulting to
|
||||||
a xml.dom.minidom based implementation for the sake of
|
a xml.dom.minidom based implementation.
|
||||||
backwards compatibility (as releases up until 0.10 had a
|
|
||||||
builder called "dom" that was a minidom implemenation).
|
|
||||||
"etree" - A generic builder for tree implementations exposing an
|
"etree" - A generic builder for tree implementations exposing an
|
||||||
elementtree-like interface (known to work with
|
ElementTree-like interface, defaulting to
|
||||||
ElementTree, cElementTree and lxml.etree).
|
xml.etree.cElementTree if available and
|
||||||
"beautifulsoup" - Beautiful soup (if installed)
|
xml.etree.ElementTree if not.
|
||||||
|
"lxml" - A etree-based builder for lxml.etree, handling
|
||||||
|
limitations of lxml's implementation.
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
||||||
module implementing the tree type e.g.
|
module implementing the tree type e.g.
|
||||||
xml.etree.ElementTree or lxml.etree."""
|
xml.etree.ElementTree or xml.etree.cElementTree."""
|
||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
if treeType not in treeBuilderCache:
|
if treeType not in treeBuilderCache:
|
||||||
if treeType == "dom":
|
if treeType == "dom":
|
||||||
import dom
|
from . import dom
|
||||||
# XXX: Keep backwards compatibility by using minidom if no implementation is given
|
# Come up with a sane default (pref. from the stdlib)
|
||||||
if implementation == None:
|
if implementation is None:
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom
|
||||||
implementation = minidom
|
implementation = minidom
|
||||||
# XXX: NEVER cache here, caching is done in the dom submodule
|
# NEVER cache here, caching is done in the dom submodule
|
||||||
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
||||||
elif treeType == "simpletree":
|
|
||||||
import simpletree
|
|
||||||
treeBuilderCache[treeType] = simpletree.TreeBuilder
|
|
||||||
elif treeType == "beautifulsoup":
|
|
||||||
import soup
|
|
||||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
|
||||||
elif treeType == "lxml":
|
elif treeType == "lxml":
|
||||||
import etree_lxml
|
from . import etree_lxml
|
||||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||||
elif treeType == "etree":
|
elif treeType == "etree":
|
||||||
# Come up with a sane default
|
from . import etree
|
||||||
if implementation == None:
|
if implementation is None:
|
||||||
try:
|
implementation = default_etree
|
||||||
import xml.etree.cElementTree as ET
|
# NEVER cache here, caching is done in the etree submodule
|
||||||
except ImportError:
|
|
||||||
try:
|
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
except ImportError:
|
|
||||||
try:
|
|
||||||
import cElementTree as ET
|
|
||||||
except ImportError:
|
|
||||||
import elementtree.ElementTree as ET
|
|
||||||
implementation = ET
|
|
||||||
import etree
|
|
||||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
|
||||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||||
|
else:
|
||||||
|
raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
|
||||||
return treeBuilderCache.get(treeType)
|
return treeBuilderCache.get(treeType)
|
||||||
|
@ -1,16 +1,25 @@
|
|||||||
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
try:
|
from six import text_type
|
||||||
frozenset
|
|
||||||
except NameError:
|
|
||||||
# Import from the sets module for python 2.3
|
|
||||||
from sets import Set as set
|
|
||||||
from sets import ImmutableSet as frozenset
|
|
||||||
|
|
||||||
# The scope markers are inserted when entering buttons, object elements,
|
from ..constants import scopingElements, tableInsertModeElements, namespaces
|
||||||
|
|
||||||
|
# The scope markers are inserted when entering object elements,
|
||||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
# from "leaking" into tables, object elements, and marquees.
|
||||||
Marker = None
|
Marker = None
|
||||||
|
|
||||||
|
listElementsMap = {
|
||||||
|
None: (frozenset(scopingElements), False),
|
||||||
|
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
|
||||||
|
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
|
||||||
|
(namespaces["html"], "ul")])), False),
|
||||||
|
"table": (frozenset([(namespaces["html"], "html"),
|
||||||
|
(namespaces["html"], "table")]), False),
|
||||||
|
"select": (frozenset([(namespaces["html"], "optgroup"),
|
||||||
|
(namespaces["html"], "option")]), True)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class Node(object):
|
class Node(object):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
"""Node representing an item in the tree.
|
"""Node representing an item in the tree.
|
||||||
@ -30,10 +39,10 @@ class Node(object):
|
|||||||
self.childNodes = []
|
self.childNodes = []
|
||||||
self._flags = []
|
self._flags = []
|
||||||
|
|
||||||
def __unicode__(self):
|
def __str__(self):
|
||||||
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
|
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
|
||||||
for name, value in
|
for name, value in
|
||||||
self.attributes.iteritems()])
|
self.attributes.items()])
|
||||||
if attributesStr:
|
if attributesStr:
|
||||||
return "<%s %s>" % (self.name, attributesStr)
|
return "<%s %s>" % (self.name, attributesStr)
|
||||||
else:
|
else:
|
||||||
@ -80,12 +89,36 @@ class Node(object):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
def hasContent(self):
|
def hasContent(self):
|
||||||
"""Return true if the node has children or text, false otherwise
|
"""Return true if the node has children or text, false otherwise
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ActiveFormattingElements(list):
|
||||||
|
def append(self, node):
|
||||||
|
equalCount = 0
|
||||||
|
if node != Marker:
|
||||||
|
for element in self[::-1]:
|
||||||
|
if element == Marker:
|
||||||
|
break
|
||||||
|
if self.nodesEqual(element, node):
|
||||||
|
equalCount += 1
|
||||||
|
if equalCount == 3:
|
||||||
|
self.remove(element)
|
||||||
|
break
|
||||||
|
list.append(self, node)
|
||||||
|
|
||||||
|
def nodesEqual(self, node1, node2):
|
||||||
|
if not node1.nameTuple == node2.nameTuple:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not node1.attributes == node2.attributes:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilder(object):
|
class TreeBuilder(object):
|
||||||
"""Base treebuilder implementation
|
"""Base treebuilder implementation
|
||||||
documentClass - the class to use for the bottommost node of a document
|
documentClass - the class to use for the bottommost node of a document
|
||||||
@ -118,7 +151,7 @@ class TreeBuilder(object):
|
|||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.openElements = []
|
self.openElements = []
|
||||||
self.activeFormattingElements = []
|
self.activeFormattingElements = ActiveFormattingElements()
|
||||||
|
|
||||||
# XXX - rename these to headElement, formElement
|
# XXX - rename these to headElement, formElement
|
||||||
self.headPointer = None
|
self.headPointer = None
|
||||||
@ -129,20 +162,18 @@ class TreeBuilder(object):
|
|||||||
self.document = self.documentClass()
|
self.document = self.documentClass()
|
||||||
|
|
||||||
def elementInScope(self, target, variant=None):
|
def elementInScope(self, target, variant=None):
|
||||||
# Exit early when possible.
|
|
||||||
listElementsMap = {
|
# If we pass a node in we match that. if we pass a string
|
||||||
None:scopingElements,
|
# match any node with that name
|
||||||
"list":scopingElements | set([(namespaces["html"], "ol"),
|
exactNode = hasattr(target, "nameTuple")
|
||||||
(namespaces["html"], "ul")]),
|
|
||||||
"table":set([(namespaces["html"], "html"),
|
listElements, invert = listElementsMap[variant]
|
||||||
(namespaces["html"], "table")])
|
|
||||||
}
|
|
||||||
listElements = listElementsMap[variant]
|
|
||||||
|
|
||||||
for node in reversed(self.openElements):
|
for node in reversed(self.openElements):
|
||||||
if node.name == target:
|
if (node.name == target and not exactNode or
|
||||||
|
node == target and exactNode):
|
||||||
return True
|
return True
|
||||||
elif node.nameTuple in listElements:
|
elif (invert ^ (node.nameTuple in listElements)):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
assert False # We should never reach this point
|
assert False # We should never reach this point
|
||||||
@ -254,6 +285,7 @@ class TreeBuilder(object):
|
|||||||
|
|
||||||
def insertElementNormal(self, token):
|
def insertElementNormal(self, token):
|
||||||
name = token["name"]
|
name = token["name"]
|
||||||
|
assert isinstance(name, text_type), "Element %s not unicode" % name
|
||||||
namespace = token.get("namespace", self.defaultNamespace)
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
element = self.elementClass(name, namespace)
|
element = self.elementClass(name, namespace)
|
||||||
element.attributes = token["data"]
|
element.attributes = token["data"]
|
||||||
@ -321,7 +353,7 @@ class TreeBuilder(object):
|
|||||||
def generateImpliedEndTags(self, exclude=None):
|
def generateImpliedEndTags(self, exclude=None):
|
||||||
name = self.openElements[-1].name
|
name = self.openElements[-1].name
|
||||||
# XXX td, th and tr are not actually needed
|
# XXX td, th and tr are not actually needed
|
||||||
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
|
||||||
and name != exclude):
|
and name != exclude):
|
||||||
self.openElements.pop()
|
self.openElements.pop()
|
||||||
# XXX This is not entirely what the specification says. We should
|
# XXX This is not entirely what the specification says. We should
|
||||||
|
@ -1,40 +1,38 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
|
||||||
import new
|
from xml.dom import minidom, Node
|
||||||
import re
|
|
||||||
import weakref
|
import weakref
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
from html5lib import constants, ihatexml
|
from .. import constants
|
||||||
from html5lib.constants import namespaces
|
from ..constants import namespaces
|
||||||
|
from ..utils import moduleFactoryFactory
|
||||||
|
|
||||||
moduleCache = {}
|
|
||||||
|
|
||||||
def getDomModule(DomImplementation):
|
|
||||||
name = "_" + DomImplementation.__name__+"builder"
|
|
||||||
if name in moduleCache:
|
|
||||||
return moduleCache[name]
|
|
||||||
else:
|
|
||||||
mod = new.module(name)
|
|
||||||
objs = getDomBuilder(DomImplementation)
|
|
||||||
mod.__dict__.update(objs)
|
|
||||||
moduleCache[name] = mod
|
|
||||||
return mod
|
|
||||||
|
|
||||||
def getDomBuilder(DomImplementation):
|
def getDomBuilder(DomImplementation):
|
||||||
Dom = DomImplementation
|
Dom = DomImplementation
|
||||||
class AttrList:
|
|
||||||
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
self.element = element
|
self.element = element
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self.element.attributes.items().__iter__()
|
return list(self.element.attributes.items()).__iter__()
|
||||||
|
|
||||||
def __setitem__(self, name, value):
|
def __setitem__(self, name, value):
|
||||||
self.element.setAttribute(name, value)
|
self.element.setAttribute(name, value)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(list(self.element.attributes.items()))
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
return [(item[0], item[1]) for item in
|
return [(item[0], item[1]) for item in
|
||||||
self.element.attributes.items()]
|
list(self.element.attributes.items())]
|
||||||
|
|
||||||
def keys(self):
|
def keys(self):
|
||||||
return self.element.attributes.keys()
|
return list(self.element.attributes.keys())
|
||||||
|
|
||||||
def __getitem__(self, name):
|
def __getitem__(self, name):
|
||||||
return self.element.getAttribute(name)
|
return self.element.getAttribute(name)
|
||||||
|
|
||||||
@ -84,7 +82,7 @@ def getDomBuilder(DomImplementation):
|
|||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
if attributes:
|
if attributes:
|
||||||
for name, value in attributes.items():
|
for name, value in list(attributes.items()):
|
||||||
if isinstance(name, tuple):
|
if isinstance(name, tuple):
|
||||||
if name[0] is not None:
|
if name[0] is not None:
|
||||||
qualifiedName = (name[0] + ":" + name[1])
|
qualifiedName = (name[0] + ":" + name[1])
|
||||||
@ -104,7 +102,7 @@ def getDomBuilder(DomImplementation):
|
|||||||
return self.element.hasChildNodes()
|
return self.element.hasChildNodes()
|
||||||
|
|
||||||
def getNameTuple(self):
|
def getNameTuple(self):
|
||||||
if self.namespace == None:
|
if self.namespace is None:
|
||||||
return namespaces["html"], self.name
|
return namespaces["html"], self.name
|
||||||
else:
|
else:
|
||||||
return self.namespace, self.name
|
return self.namespace, self.name
|
||||||
@ -155,7 +153,7 @@ def getDomBuilder(DomImplementation):
|
|||||||
|
|
||||||
def insertText(self, data, parent=None):
|
def insertText(self, data, parent=None):
|
||||||
data = data
|
data = data
|
||||||
if parent <> self:
|
if parent != self:
|
||||||
_base.TreeBuilder.insertText(self, data, parent)
|
_base.TreeBuilder.insertText(self, data, parent)
|
||||||
else:
|
else:
|
||||||
# HACK: allow text nodes as children of the document node
|
# HACK: allow text nodes as children of the document node
|
||||||
@ -165,19 +163,21 @@ def getDomBuilder(DomImplementation):
|
|||||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||||
self.dom.appendChild(self.dom.createTextNode(data))
|
self.dom.appendChild(self.dom.createTextNode(data))
|
||||||
|
|
||||||
|
implementation = DomImplementation
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
def testSerializer(element):
|
def testSerializer(element):
|
||||||
element.normalize()
|
element.normalize()
|
||||||
rv = []
|
rv = []
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
def serializeElement(element, indent=0):
|
||||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||||
if element.name:
|
if element.name:
|
||||||
if element.publicId or element.systemId:
|
if element.publicId or element.systemId:
|
||||||
publicId = element.publicId or ""
|
publicId = element.publicId or ""
|
||||||
systemId = element.systemId or ""
|
systemId = element.systemId or ""
|
||||||
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||||
' '*indent, element.name, publicId, systemId))
|
(' ' * indent, element.name, publicId, systemId))
|
||||||
else:
|
else:
|
||||||
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
|
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
|
||||||
else:
|
else:
|
||||||
@ -192,16 +192,16 @@ def getDomBuilder(DomImplementation):
|
|||||||
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
|
rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
|
||||||
else:
|
else:
|
||||||
if (hasattr(element, "namespaceURI") and
|
if (hasattr(element, "namespaceURI") and
|
||||||
element.namespaceURI != None):
|
element.namespaceURI is not None):
|
||||||
name = "%s %s" % (constants.prefixes[element.namespaceURI],
|
name = "%s %s" % (constants.prefixes[element.namespaceURI],
|
||||||
element.nodeName)
|
element.nodeName)
|
||||||
else:
|
else:
|
||||||
name = element.nodeName
|
name = element.nodeName
|
||||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
if element.hasAttributes():
|
if element.hasAttributes():
|
||||||
i = 0
|
attributes = []
|
||||||
|
for i in range(len(element.attributes)):
|
||||||
attr = element.attributes.item(i)
|
attr = element.attributes.item(i)
|
||||||
while attr:
|
|
||||||
name = attr.nodeName
|
name = attr.nodeName
|
||||||
value = attr.value
|
value = attr.value
|
||||||
ns = attr.namespaceURI
|
ns = attr.namespaceURI
|
||||||
@ -209,9 +209,9 @@ def getDomBuilder(DomImplementation):
|
|||||||
name = "%s %s" % (constants.prefixes[ns], attr.localName)
|
name = "%s %s" % (constants.prefixes[ns], attr.localName)
|
||||||
else:
|
else:
|
||||||
name = attr.nodeName
|
name = attr.nodeName
|
||||||
i += 1
|
attributes.append((name, value))
|
||||||
attr = element.attributes.item(i)
|
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
indent += 2
|
indent += 2
|
||||||
for child in element.childNodes:
|
for child in element.childNodes:
|
||||||
@ -220,67 +220,8 @@ def getDomBuilder(DomImplementation):
|
|||||||
|
|
||||||
return "\n".join(rv)
|
return "\n".join(rv)
|
||||||
|
|
||||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
|
||||||
if node.nodeType == Node.ELEMENT_NODE:
|
|
||||||
if not nsmap:
|
|
||||||
handler.startElement(node.nodeName, node.attributes)
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
handler.endElement(node.nodeName)
|
|
||||||
else:
|
|
||||||
attributes = dict(node.attributes.itemsNS())
|
|
||||||
|
|
||||||
# gather namespace declarations
|
|
||||||
prefixes = []
|
|
||||||
for attrname in node.attributes.keys():
|
|
||||||
attr = node.getAttributeNode(attrname)
|
|
||||||
if (attr.namespaceURI == XMLNS_NAMESPACE or
|
|
||||||
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
|
|
||||||
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
|
|
||||||
handler.startPrefixMapping(prefix, attr.nodeValue)
|
|
||||||
prefixes.append(prefix)
|
|
||||||
nsmap = nsmap.copy()
|
|
||||||
nsmap[prefix] = attr.nodeValue
|
|
||||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
|
||||||
|
|
||||||
# apply namespace declarations
|
|
||||||
for attrname in node.attributes.keys():
|
|
||||||
attr = node.getAttributeNode(attrname)
|
|
||||||
if attr.namespaceURI == None and ':' in attr.nodeName:
|
|
||||||
prefix = attr.nodeName.split(':')[0]
|
|
||||||
if nsmap.has_key(prefix):
|
|
||||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
|
||||||
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
|
|
||||||
|
|
||||||
# SAX events
|
|
||||||
ns = node.namespaceURI or nsmap.get(None,None)
|
|
||||||
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
handler.endElementNS((ns, node.nodeName), node.nodeName)
|
|
||||||
for prefix in prefixes: handler.endPrefixMapping(prefix)
|
|
||||||
|
|
||||||
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
|
|
||||||
handler.characters(node.nodeValue)
|
|
||||||
|
|
||||||
elif node.nodeType == Node.DOCUMENT_NODE:
|
|
||||||
handler.startDocument()
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
handler.endDocument()
|
|
||||||
|
|
||||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
|
|
||||||
else:
|
|
||||||
# ATTRIBUTE_NODE
|
|
||||||
# ENTITY_NODE
|
|
||||||
# PROCESSING_INSTRUCTION_NODE
|
|
||||||
# COMMENT_NODE
|
|
||||||
# DOCUMENT_TYPE_NODE
|
|
||||||
# NOTATION_NODE
|
|
||||||
pass
|
|
||||||
|
|
||||||
return locals()
|
return locals()
|
||||||
|
|
||||||
# Keep backwards compatibility with things that directly load
|
|
||||||
# classes/functions from this module
|
# The actual means to get a module!
|
||||||
for key, value in getDomModule(minidom).__dict__.items():
|
getDomModule = moduleFactoryFactory(getDomBuilder)
|
||||||
globals()[key] = value
|
|
||||||
|
@ -1,28 +1,21 @@
|
|||||||
import new
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
from html5lib import ihatexml
|
from .. import ihatexml
|
||||||
from html5lib import constants
|
from .. import constants
|
||||||
from html5lib.constants import namespaces
|
from ..constants import namespaces
|
||||||
|
from ..utils import moduleFactoryFactory
|
||||||
|
|
||||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||||
|
|
||||||
moduleCache = {}
|
|
||||||
|
|
||||||
def getETreeModule(ElementTreeImplementation, fullTree=False):
|
|
||||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
|
||||||
if name in moduleCache:
|
|
||||||
return moduleCache[name]
|
|
||||||
else:
|
|
||||||
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
|
||||||
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
|
|
||||||
mod.__dict__.update(objs)
|
|
||||||
moduleCache[name] = mod
|
|
||||||
return mod
|
|
||||||
|
|
||||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||||
ElementTree = ElementTreeImplementation
|
ElementTree = ElementTreeImplementation
|
||||||
|
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||||
|
|
||||||
class Element(_base.Node):
|
class Element(_base.Node):
|
||||||
def __init__(self, name, namespace=None):
|
def __init__(self, name, namespace=None):
|
||||||
self._name = name
|
self._name = name
|
||||||
@ -68,9 +61,9 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
def _setAttributes(self, attributes):
|
def _setAttributes(self, attributes):
|
||||||
# Delete existing attributes first
|
# Delete existing attributes first
|
||||||
# XXX - there may be a better way to do this...
|
# XXX - there may be a better way to do this...
|
||||||
for key in self._element.attrib.keys():
|
for key in list(self._element.attrib.keys()):
|
||||||
del self._element.attrib[key]
|
del self._element.attrib[key]
|
||||||
for key, value in attributes.iteritems():
|
for key, value in attributes.items():
|
||||||
if isinstance(key, tuple):
|
if isinstance(key, tuple):
|
||||||
name = "{%s}%s" % (key[2], key[1])
|
name = "{%s}%s" % (key[2], key[1])
|
||||||
else:
|
else:
|
||||||
@ -81,6 +74,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
|
|
||||||
def _getChildNodes(self):
|
def _getChildNodes(self):
|
||||||
return self._childNodes
|
return self._childNodes
|
||||||
|
|
||||||
def _setChildNodes(self, value):
|
def _setChildNodes(self, value):
|
||||||
del self._element[:]
|
del self._element[:]
|
||||||
self._childNodes = []
|
self._childNodes = []
|
||||||
@ -91,7 +85,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
|
|
||||||
def hasContent(self):
|
def hasContent(self):
|
||||||
"""Return true if the node has children or text"""
|
"""Return true if the node has children or text"""
|
||||||
return bool(self._element.text or self._element.getchildren())
|
return bool(self._element.text or len(self._element))
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
self._childNodes.append(node)
|
self._childNodes.append(node)
|
||||||
@ -99,7 +93,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self._element.getchildren().index(refNode._element)
|
index = list(self._element).index(refNode._element)
|
||||||
self._element.insert(index, node._element)
|
self._element.insert(index, node._element)
|
||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
@ -119,7 +113,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
self._element[-1].tail += data
|
self._element[-1].tail += data
|
||||||
else:
|
else:
|
||||||
# Insert the text before the specified node
|
# Insert the text before the specified node
|
||||||
children = self._element.getchildren()
|
children = list(self._element)
|
||||||
index = children.index(insertBefore._element)
|
index = children.index(insertBefore._element)
|
||||||
if index > 0:
|
if index > 0:
|
||||||
if not self._element[index - 1].tail:
|
if not self._element[index - 1].tail:
|
||||||
@ -131,8 +125,8 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
self._element.text += data
|
self._element.text += data
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
element = Element(self.name, self.namespace)
|
element = type(self)(self.name, self.namespace)
|
||||||
for name, value in self.attributes.iteritems():
|
for name, value in self.attributes.items():
|
||||||
element.attributes[name] = value
|
element.attributes[name] = value
|
||||||
return element
|
return element
|
||||||
|
|
||||||
@ -172,34 +166,34 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
self.systemId = systemId
|
self.systemId = systemId
|
||||||
|
|
||||||
def _getPublicId(self):
|
def _getPublicId(self):
|
||||||
return self._element.get(u"publicId", "")
|
return self._element.get("publicId", "")
|
||||||
|
|
||||||
def _setPublicId(self, value):
|
def _setPublicId(self, value):
|
||||||
if value is not None:
|
if value is not None:
|
||||||
self._element.set(u"publicId", value)
|
self._element.set("publicId", value)
|
||||||
|
|
||||||
publicId = property(_getPublicId, _setPublicId)
|
publicId = property(_getPublicId, _setPublicId)
|
||||||
|
|
||||||
def _getSystemId(self):
|
def _getSystemId(self):
|
||||||
return self._element.get(u"systemId", "")
|
return self._element.get("systemId", "")
|
||||||
|
|
||||||
def _setSystemId(self, value):
|
def _setSystemId(self, value):
|
||||||
if value is not None:
|
if value is not None:
|
||||||
self._element.set(u"systemId", value)
|
self._element.set("systemId", value)
|
||||||
|
|
||||||
systemId = property(_getSystemId, _setSystemId)
|
systemId = property(_getSystemId, _setSystemId)
|
||||||
|
|
||||||
class Document(Element):
|
class Document(Element):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Element.__init__(self, "<DOCUMENT_ROOT>")
|
Element.__init__(self, "DOCUMENT_ROOT")
|
||||||
|
|
||||||
class DocumentFragment(Element):
|
class DocumentFragment(Element):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
|
Element.__init__(self, "DOCUMENT_FRAGMENT")
|
||||||
|
|
||||||
def testSerializer(element):
|
def testSerializer(element):
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
|
||||||
def serializeElement(element, indent=0):
|
def serializeElement(element, indent=0):
|
||||||
if not(hasattr(element, "tag")):
|
if not(hasattr(element, "tag")):
|
||||||
element = element.getroot()
|
element = element.getroot()
|
||||||
@ -207,19 +201,23 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
if element.get("publicId") or element.get("systemId"):
|
if element.get("publicId") or element.get("systemId"):
|
||||||
publicId = element.get("publicId") or ""
|
publicId = element.get("publicId") or ""
|
||||||
systemId = element.get("systemId") or ""
|
systemId = element.get("systemId") or ""
|
||||||
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
|
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
|
||||||
element.text, publicId, systemId))
|
(element.text, publicId, systemId))
|
||||||
else:
|
else:
|
||||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
rv.append("<!DOCTYPE %s>" % (element.text,))
|
||||||
elif element.tag == "<DOCUMENT_ROOT>":
|
elif element.tag == "DOCUMENT_ROOT":
|
||||||
rv.append("#document")
|
rv.append("#document")
|
||||||
if element.text:
|
if element.text is not None:
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||||
if element.tail:
|
if element.tail is not None:
|
||||||
finalText = element.tail
|
raise TypeError("Document node cannot have tail")
|
||||||
elif type(element.tag) == type(ElementTree.Comment):
|
if hasattr(element, "attrib") and len(element.attrib):
|
||||||
|
raise TypeError("Document node cannot have attributes")
|
||||||
|
elif element.tag == ElementTreeCommentType:
|
||||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
||||||
else:
|
else:
|
||||||
|
assert isinstance(element.tag, text_type), \
|
||||||
|
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
|
||||||
nsmatch = tag_regexp.match(element.tag)
|
nsmatch = tag_regexp.match(element.tag)
|
||||||
|
|
||||||
if nsmatch is None:
|
if nsmatch is None:
|
||||||
@ -231,54 +229,59 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
|
|
||||||
if hasattr(element, "attrib"):
|
if hasattr(element, "attrib"):
|
||||||
for name, value in element.attrib.iteritems():
|
attributes = []
|
||||||
|
for name, value in element.attrib.items():
|
||||||
nsmatch = tag_regexp.match(name)
|
nsmatch = tag_regexp.match(name)
|
||||||
if nsmatch is not None:
|
if nsmatch is not None:
|
||||||
ns, name = nsmatch.groups()
|
ns, name = nsmatch.groups()
|
||||||
prefix = constants.prefixes[ns]
|
prefix = constants.prefixes[ns]
|
||||||
name = "%s %s"%(prefix, name)
|
attr_string = "%s %s" % (prefix, name)
|
||||||
|
else:
|
||||||
|
attr_string = name
|
||||||
|
attributes.append((attr_string, value))
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||||
indent += 2
|
indent += 2
|
||||||
for child in element.getchildren():
|
for child in element:
|
||||||
serializeElement(child, indent)
|
serializeElement(child, indent)
|
||||||
if element.tail:
|
if element.tail:
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
||||||
serializeElement(element, 0)
|
serializeElement(element, 0)
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
return "\n".join(rv)
|
||||||
|
|
||||||
def tostring(element):
|
def tostring(element):
|
||||||
"""Serialize an element and its child nodes to a string"""
|
"""Serialize an element and its child nodes to a string"""
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
|
||||||
filter = ihatexml.InfosetFilter()
|
filter = ihatexml.InfosetFilter()
|
||||||
|
|
||||||
def serializeElement(element):
|
def serializeElement(element):
|
||||||
if type(element) == type(ElementTree.ElementTree):
|
if isinstance(element, ElementTree.ElementTree):
|
||||||
element = element.getroot()
|
element = element.getroot()
|
||||||
|
|
||||||
if element.tag == "<!DOCTYPE>":
|
if element.tag == "<!DOCTYPE>":
|
||||||
if element.get("publicId") or element.get("systemId"):
|
if element.get("publicId") or element.get("systemId"):
|
||||||
publicId = element.get("publicId") or ""
|
publicId = element.get("publicId") or ""
|
||||||
systemId = element.get("systemId") or ""
|
systemId = element.get("systemId") or ""
|
||||||
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
|
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
|
||||||
element.text, publicId, systemId))
|
(element.text, publicId, systemId))
|
||||||
else:
|
else:
|
||||||
rv.append("<!DOCTYPE %s>" % (element.text,))
|
rv.append("<!DOCTYPE %s>" % (element.text,))
|
||||||
elif element.tag == "<DOCUMENT_ROOT>":
|
elif element.tag == "DOCUMENT_ROOT":
|
||||||
if element.text:
|
if element.text is not None:
|
||||||
rv.append(element.text)
|
rv.append(element.text)
|
||||||
if element.tail:
|
if element.tail is not None:
|
||||||
finalText = element.tail
|
raise TypeError("Document node cannot have tail")
|
||||||
|
if hasattr(element, "attrib") and len(element.attrib):
|
||||||
|
raise TypeError("Document node cannot have attributes")
|
||||||
|
|
||||||
for child in element.getchildren():
|
for child in element:
|
||||||
serializeElement(child)
|
serializeElement(child)
|
||||||
|
|
||||||
elif type(element.tag) == type(ElementTree.Comment):
|
elif element.tag == ElementTreeCommentType:
|
||||||
rv.append("<!--%s-->" % (element.text,))
|
rv.append("<!--%s-->" % (element.text,))
|
||||||
else:
|
else:
|
||||||
# This is assumed to be an ordinary element
|
# This is assumed to be an ordinary element
|
||||||
@ -287,12 +290,12 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
else:
|
else:
|
||||||
attr = " ".join(["%s=\"%s\"" % (
|
attr = " ".join(["%s=\"%s\"" % (
|
||||||
filter.fromXmlName(name), value)
|
filter.fromXmlName(name), value)
|
||||||
for name, value in element.attrib.iteritems()])
|
for name, value in element.attrib.items()])
|
||||||
rv.append("<%s %s>" % (element.tag, attr))
|
rv.append("<%s %s>" % (element.tag, attr))
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append(element.text)
|
rv.append(element.text)
|
||||||
|
|
||||||
for child in element.getchildren():
|
for child in element:
|
||||||
serializeElement(child)
|
serializeElement(child)
|
||||||
|
|
||||||
rv.append("</%s>" % (element.tag,))
|
rv.append("</%s>" % (element.tag,))
|
||||||
@ -302,9 +305,6 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
|
|
||||||
serializeElement(element)
|
serializeElement(element)
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("%s\""%(' '*2, finalText))
|
|
||||||
|
|
||||||
return "".join(rv)
|
return "".join(rv)
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
@ -313,6 +313,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
elementClass = Element
|
elementClass = Element
|
||||||
commentClass = Comment
|
commentClass = Comment
|
||||||
fragmentClass = DocumentFragment
|
fragmentClass = DocumentFragment
|
||||||
|
implementation = ElementTreeImplementation
|
||||||
|
|
||||||
def testSerializer(self, element):
|
def testSerializer(self, element):
|
||||||
return testSerializer(element)
|
return testSerializer(element)
|
||||||
@ -320,6 +321,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
def getDocument(self):
|
def getDocument(self):
|
||||||
if fullTree:
|
if fullTree:
|
||||||
return self.document._element
|
return self.document._element
|
||||||
|
else:
|
||||||
|
if self.defaultNamespace is not None:
|
||||||
|
return self.document._element.find(
|
||||||
|
"{%s}html" % self.defaultNamespace)
|
||||||
else:
|
else:
|
||||||
return self.document._element.find("html")
|
return self.document._element.find("html")
|
||||||
|
|
||||||
@ -327,3 +332,6 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
return _base.TreeBuilder.getFragment(self)._element
|
return _base.TreeBuilder.getFragment(self)._element
|
||||||
|
|
||||||
return locals()
|
return locals()
|
||||||
|
|
||||||
|
|
||||||
|
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
||||||
|
@ -1,20 +1,3 @@
|
|||||||
import new
|
|
||||||
import warnings
|
|
||||||
import re
|
|
||||||
|
|
||||||
import _base
|
|
||||||
from html5lib.constants import DataLossWarning
|
|
||||||
import html5lib.constants as constants
|
|
||||||
import etree as etree_builders
|
|
||||||
from html5lib import ihatexml
|
|
||||||
|
|
||||||
try:
|
|
||||||
import lxml.etree as etree
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
fullTree = True
|
|
||||||
|
|
||||||
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
||||||
of the native library as possible, without using fragile hacks like custom element
|
of the native library as possible, without using fragile hacks like custom element
|
||||||
names that break between releases. The downside of this is that we cannot represent
|
names that break between releases. The downside of this is that we cannot represent
|
||||||
@ -26,12 +9,34 @@ Docypes with no name
|
|||||||
When any of these things occur, we emit a DataLossWarning
|
When any of these things occur, we emit a DataLossWarning
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from . import _base
|
||||||
|
from ..constants import DataLossWarning
|
||||||
|
from .. import constants
|
||||||
|
from . import etree as etree_builders
|
||||||
|
from .. import ihatexml
|
||||||
|
|
||||||
|
import lxml.etree as etree
|
||||||
|
|
||||||
|
|
||||||
|
fullTree = True
|
||||||
|
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||||
|
|
||||||
|
comment_type = etree.Comment("asd").tag
|
||||||
|
|
||||||
|
|
||||||
class DocumentType(object):
|
class DocumentType(object):
|
||||||
def __init__(self, name, publicId, systemId):
|
def __init__(self, name, publicId, systemId):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.publicId = publicId
|
self.publicId = publicId
|
||||||
self.systemId = systemId
|
self.systemId = systemId
|
||||||
|
|
||||||
|
|
||||||
class Document(object):
|
class Document(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._elementTree = None
|
self._elementTree = None
|
||||||
@ -45,10 +50,12 @@ class Document(object):
|
|||||||
|
|
||||||
childNodes = property(_getChildNodes)
|
childNodes = property(_getChildNodes)
|
||||||
|
|
||||||
|
|
||||||
def testSerializer(element):
|
def testSerializer(element):
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
finalText = None
|
||||||
filter = ihatexml.InfosetFilter()
|
infosetFilter = ihatexml.InfosetFilter()
|
||||||
|
|
||||||
def serializeElement(element, indent=0):
|
def serializeElement(element, indent=0):
|
||||||
if not hasattr(element, "tag"):
|
if not hasattr(element, "tag"):
|
||||||
if hasattr(element, "getroot"):
|
if hasattr(element, "getroot"):
|
||||||
@ -70,47 +77,52 @@ def testSerializer(element):
|
|||||||
while next_element is not None:
|
while next_element is not None:
|
||||||
serializeElement(next_element, indent + 2)
|
serializeElement(next_element, indent + 2)
|
||||||
next_element = next_element.getnext()
|
next_element = next_element.getnext()
|
||||||
elif isinstance(element, basestring):
|
elif isinstance(element, str) or isinstance(element, bytes):
|
||||||
# Text in a fragment
|
# Text in a fragment
|
||||||
|
assert isinstance(element, str) or sys.version_info.major == 2
|
||||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
else:
|
else:
|
||||||
# Fragment case
|
# Fragment case
|
||||||
rv.append("#document-fragment")
|
rv.append("#document-fragment")
|
||||||
for next_element in element:
|
for next_element in element:
|
||||||
serializeElement(next_element, indent + 2)
|
serializeElement(next_element, indent + 2)
|
||||||
elif type(element.tag) == type(etree.Comment):
|
elif element.tag == comment_type:
|
||||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
|
||||||
|
if hasattr(element, "tail") and element.tail:
|
||||||
|
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
|
||||||
else:
|
else:
|
||||||
|
assert isinstance(element, etree._Element)
|
||||||
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
||||||
if nsmatch is not None:
|
if nsmatch is not None:
|
||||||
ns = nsmatch.group(1)
|
ns = nsmatch.group(1)
|
||||||
tag = nsmatch.group(2)
|
tag = nsmatch.group(2)
|
||||||
prefix = constants.prefixes[ns]
|
prefix = constants.prefixes[ns]
|
||||||
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
|
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
|
||||||
filter.fromXmlName(tag)))
|
infosetFilter.fromXmlName(tag)))
|
||||||
else:
|
else:
|
||||||
rv.append("|%s<%s>" % (' ' * indent,
|
rv.append("|%s<%s>" % (' ' * indent,
|
||||||
filter.fromXmlName(element.tag)))
|
infosetFilter.fromXmlName(element.tag)))
|
||||||
|
|
||||||
if hasattr(element, "attrib"):
|
if hasattr(element, "attrib"):
|
||||||
for name, value in element.attrib.iteritems():
|
attributes = []
|
||||||
nsmatch = etree_builders.tag_regexp.match(name)
|
for name, value in element.attrib.items():
|
||||||
if nsmatch:
|
nsmatch = tag_regexp.match(name)
|
||||||
ns = nsmatch.group(1)
|
if nsmatch is not None:
|
||||||
name = nsmatch.group(2)
|
ns, name = nsmatch.groups()
|
||||||
|
name = infosetFilter.fromXmlName(name)
|
||||||
prefix = constants.prefixes[ns]
|
prefix = constants.prefixes[ns]
|
||||||
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
|
attr_string = "%s %s" % (prefix, name)
|
||||||
prefix,
|
|
||||||
filter.fromXmlName(name),
|
|
||||||
value))
|
|
||||||
else:
|
else:
|
||||||
rv.append('|%s%s="%s"' % (' '*(indent+2),
|
attr_string = infosetFilter.fromXmlName(name)
|
||||||
filter.fromXmlName(name),
|
attributes.append((attr_string, value))
|
||||||
value))
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
|
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
|
||||||
indent += 2
|
indent += 2
|
||||||
for child in element.getchildren():
|
for child in element:
|
||||||
serializeElement(child, indent)
|
serializeElement(child, indent)
|
||||||
if hasattr(element, "tail") and element.tail:
|
if hasattr(element, "tail") and element.tail:
|
||||||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
||||||
@ -121,10 +133,12 @@ def testSerializer(element):
|
|||||||
|
|
||||||
return "\n".join(rv)
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
|
||||||
def tostring(element):
|
def tostring(element):
|
||||||
"""Serialize an element and its child nodes to a string"""
|
"""Serialize an element and its child nodes to a string"""
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
finalText = None
|
||||||
|
|
||||||
def serializeElement(element):
|
def serializeElement(element):
|
||||||
if not hasattr(element, "tag"):
|
if not hasattr(element, "tag"):
|
||||||
if element.docinfo.internalDTD:
|
if element.docinfo.internalDTD:
|
||||||
@ -135,7 +149,7 @@ def tostring(element):
|
|||||||
rv.append(dtd_str)
|
rv.append(dtd_str)
|
||||||
serializeElement(element.getroot())
|
serializeElement(element.getroot())
|
||||||
|
|
||||||
elif type(element.tag) == type(etree.Comment):
|
elif element.tag == comment_type:
|
||||||
rv.append("<!--%s-->" % (element.text,))
|
rv.append("<!--%s-->" % (element.text,))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -144,12 +158,12 @@ def tostring(element):
|
|||||||
rv.append("<%s>" % (element.tag,))
|
rv.append("<%s>" % (element.tag,))
|
||||||
else:
|
else:
|
||||||
attr = " ".join(["%s=\"%s\"" % (name, value)
|
attr = " ".join(["%s=\"%s\"" % (name, value)
|
||||||
for name, value in element.attrib.iteritems()])
|
for name, value in element.attrib.items()])
|
||||||
rv.append("<%s %s>" % (element.tag, attr))
|
rv.append("<%s %s>" % (element.tag, attr))
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append(element.text)
|
rv.append(element.text)
|
||||||
|
|
||||||
for child in element.getchildren():
|
for child in element:
|
||||||
serializeElement(child)
|
serializeElement(child)
|
||||||
|
|
||||||
rv.append("</%s>" % (element.tag,))
|
rv.append("</%s>" % (element.tag,))
|
||||||
@ -171,44 +185,45 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
elementClass = None
|
elementClass = None
|
||||||
commentClass = None
|
commentClass = None
|
||||||
fragmentClass = Document
|
fragmentClass = Document
|
||||||
|
implementation = etree
|
||||||
|
|
||||||
def __init__(self, namespaceHTMLElements, fullTree=False):
|
def __init__(self, namespaceHTMLElements, fullTree=False):
|
||||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||||
filter = self.filter = ihatexml.InfosetFilter()
|
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
|
||||||
self.namespaceHTMLElements = namespaceHTMLElements
|
self.namespaceHTMLElements = namespaceHTMLElements
|
||||||
|
|
||||||
class Attributes(dict):
|
class Attributes(dict):
|
||||||
def __init__(self, element, value={}):
|
def __init__(self, element, value={}):
|
||||||
self._element = element
|
self._element = element
|
||||||
dict.__init__(self, value)
|
dict.__init__(self, value)
|
||||||
for key, value in self.iteritems():
|
for key, value in self.items():
|
||||||
if isinstance(key, tuple):
|
if isinstance(key, tuple):
|
||||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
||||||
else:
|
else:
|
||||||
name = filter.coerceAttribute(key)
|
name = infosetFilter.coerceAttribute(key)
|
||||||
self._element._element.attrib[name] = value
|
self._element._element.attrib[name] = value
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
def __setitem__(self, key, value):
|
||||||
dict.__setitem__(self, key, value)
|
dict.__setitem__(self, key, value)
|
||||||
if isinstance(key, tuple):
|
if isinstance(key, tuple):
|
||||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
||||||
else:
|
else:
|
||||||
name = filter.coerceAttribute(key)
|
name = infosetFilter.coerceAttribute(key)
|
||||||
self._element._element.attrib[name] = value
|
self._element._element.attrib[name] = value
|
||||||
|
|
||||||
class Element(builder.Element):
|
class Element(builder.Element):
|
||||||
def __init__(self, name, namespace):
|
def __init__(self, name, namespace):
|
||||||
name = filter.coerceElement(name)
|
name = infosetFilter.coerceElement(name)
|
||||||
builder.Element.__init__(self, name, namespace=namespace)
|
builder.Element.__init__(self, name, namespace=namespace)
|
||||||
self._attributes = Attributes(self)
|
self._attributes = Attributes(self)
|
||||||
|
|
||||||
def _setName(self, name):
|
def _setName(self, name):
|
||||||
self._name = filter.coerceElement(name)
|
self._name = infosetFilter.coerceElement(name)
|
||||||
self._element.tag = self._getETreeTag(
|
self._element.tag = self._getETreeTag(
|
||||||
self._name, self._namespace)
|
self._name, self._namespace)
|
||||||
|
|
||||||
def _getName(self):
|
def _getName(self):
|
||||||
return filter.fromXmlName(self._name)
|
return infosetFilter.fromXmlName(self._name)
|
||||||
|
|
||||||
name = property(_getName, _setName)
|
name = property(_getName, _setName)
|
||||||
|
|
||||||
@ -221,20 +236,19 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
attributes = property(_getAttributes, _setAttributes)
|
attributes = property(_getAttributes, _setAttributes)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
data = filter.coerceCharacters(data)
|
data = infosetFilter.coerceCharacters(data)
|
||||||
builder.Element.insertText(self, data, insertBefore)
|
builder.Element.insertText(self, data, insertBefore)
|
||||||
|
|
||||||
def appendChild(self, child):
|
def appendChild(self, child):
|
||||||
builder.Element.appendChild(self, child)
|
builder.Element.appendChild(self, child)
|
||||||
|
|
||||||
|
|
||||||
class Comment(builder.Comment):
|
class Comment(builder.Comment):
|
||||||
def __init__(self, data):
|
def __init__(self, data):
|
||||||
data = filter.coerceComment(data)
|
data = infosetFilter.coerceComment(data)
|
||||||
builder.Comment.__init__(self, data)
|
builder.Comment.__init__(self, data)
|
||||||
|
|
||||||
def _setData(self, data):
|
def _setData(self, data):
|
||||||
data = filter.coerceComment(data)
|
data = infosetFilter.coerceComment(data)
|
||||||
self._element.text = data
|
self._element.text = data
|
||||||
|
|
||||||
def _getData(self):
|
def _getData(self):
|
||||||
@ -267,7 +281,7 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
element = self.openElements[0]._element
|
element = self.openElements[0]._element
|
||||||
if element.text:
|
if element.text:
|
||||||
fragment.append(element.text)
|
fragment.append(element.text)
|
||||||
fragment.extend(element.getchildren())
|
fragment.extend(list(element))
|
||||||
if element.tail:
|
if element.tail:
|
||||||
fragment.append(element.tail)
|
fragment.append(element.tail)
|
||||||
return fragment
|
return fragment
|
||||||
@ -277,15 +291,26 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
publicId = token["publicId"]
|
publicId = token["publicId"]
|
||||||
systemId = token["systemId"]
|
systemId = token["systemId"]
|
||||||
|
|
||||||
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
|
if not name:
|
||||||
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
|
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
|
||||||
|
self.doctype = None
|
||||||
|
else:
|
||||||
|
coercedName = self.infosetFilter.coerceElement(name)
|
||||||
|
if coercedName != name:
|
||||||
|
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
|
||||||
|
|
||||||
doctype = self.doctypeClass(name, publicId, systemId)
|
doctype = self.doctypeClass(coercedName, publicId, systemId)
|
||||||
self.doctype = doctype
|
self.doctype = doctype
|
||||||
|
|
||||||
def insertCommentInitial(self, data, parent=None):
|
def insertCommentInitial(self, data, parent=None):
|
||||||
self.initial_comments.append(data)
|
self.initial_comments.append(data)
|
||||||
|
|
||||||
|
def insertCommentMain(self, data, parent=None):
|
||||||
|
if (parent == self.document and
|
||||||
|
self.document._elementTree.getroot()[-1].tag == comment_type):
|
||||||
|
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
|
||||||
|
super(TreeBuilder, self).insertComment(data, parent)
|
||||||
|
|
||||||
def insertRoot(self, token):
|
def insertRoot(self, token):
|
||||||
"""Create the document root"""
|
"""Create the document root"""
|
||||||
# Because of the way libxml2 works, it doesn't seem to be possible to
|
# Because of the way libxml2 works, it doesn't seem to be possible to
|
||||||
@ -293,20 +318,29 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
# Therefore we need to use the built-in parser to create our iniial
|
# Therefore we need to use the built-in parser to create our iniial
|
||||||
# tree, after which we can add elements like normal
|
# tree, after which we can add elements like normal
|
||||||
docStr = ""
|
docStr = ""
|
||||||
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
|
if self.doctype:
|
||||||
|
assert self.doctype.name
|
||||||
docStr += "<!DOCTYPE %s" % self.doctype.name
|
docStr += "<!DOCTYPE %s" % self.doctype.name
|
||||||
if (self.doctype.publicId is not None or
|
if (self.doctype.publicId is not None or
|
||||||
self.doctype.systemId is not None):
|
self.doctype.systemId is not None):
|
||||||
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
|
docStr += (' PUBLIC "%s" ' %
|
||||||
self.doctype.systemId or "")
|
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
|
||||||
|
if self.doctype.systemId:
|
||||||
|
sysid = self.doctype.systemId
|
||||||
|
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
|
||||||
|
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
|
||||||
|
sysid = sysid.replace("'", 'U00027')
|
||||||
|
if sysid.find("'") >= 0:
|
||||||
|
docStr += '"%s"' % sysid
|
||||||
|
else:
|
||||||
|
docStr += "'%s'" % sysid
|
||||||
|
else:
|
||||||
|
docStr += "''"
|
||||||
docStr += ">"
|
docStr += ">"
|
||||||
|
if self.doctype.name != token["name"]:
|
||||||
|
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
|
||||||
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
||||||
|
|
||||||
try:
|
|
||||||
root = etree.fromstring(docStr)
|
root = etree.fromstring(docStr)
|
||||||
except etree.XMLSyntaxError:
|
|
||||||
print docStr
|
|
||||||
raise
|
|
||||||
|
|
||||||
# Append the initial comments:
|
# Append the initial comments:
|
||||||
for comment_token in self.initial_comments:
|
for comment_token in self.initial_comments:
|
||||||
@ -332,4 +366,4 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
self.openElements.append(root_element)
|
self.openElements.append(root_element)
|
||||||
|
|
||||||
# Reset to the default insert comment function
|
# Reset to the default insert comment function
|
||||||
self.insertComment = super(TreeBuilder, self).insertComment
|
self.insertComment = self.insertCommentMain
|
||||||
|
@ -1,248 +0,0 @@
|
|||||||
import _base
|
|
||||||
from html5lib.constants import voidElements, namespaces, prefixes
|
|
||||||
from xml.sax.saxutils import escape
|
|
||||||
|
|
||||||
# Really crappy basic implementation of a DOM-core like thing
|
|
||||||
class Node(_base.Node):
|
|
||||||
type = -1
|
|
||||||
def __init__(self, name):
|
|
||||||
self.name = name
|
|
||||||
self.parent = None
|
|
||||||
self.value = None
|
|
||||||
self.childNodes = []
|
|
||||||
self._flags = []
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
for node in self.childNodes:
|
|
||||||
yield node
|
|
||||||
for item in node:
|
|
||||||
yield item
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return self.name
|
|
||||||
|
|
||||||
def toxml(self):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def printTree(self, indent=0):
|
|
||||||
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
|
||||||
for child in self.childNodes:
|
|
||||||
tree += child.printTree(indent + 2)
|
|
||||||
return tree
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
if (isinstance(node, TextNode) and self.childNodes and
|
|
||||||
isinstance(self.childNodes[-1], TextNode)):
|
|
||||||
self.childNodes[-1].value += node.value
|
|
||||||
else:
|
|
||||||
self.childNodes.append(node)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
if insertBefore is None:
|
|
||||||
self.appendChild(TextNode(data))
|
|
||||||
else:
|
|
||||||
self.insertBefore(TextNode(data), insertBefore)
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
index = self.childNodes.index(refNode)
|
|
||||||
if (isinstance(node, TextNode) and index > 0 and
|
|
||||||
isinstance(self.childNodes[index - 1], TextNode)):
|
|
||||||
self.childNodes[index - 1].value += node.value
|
|
||||||
else:
|
|
||||||
self.childNodes.insert(index, node)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
try:
|
|
||||||
self.childNodes.remove(node)
|
|
||||||
except:
|
|
||||||
# XXX
|
|
||||||
raise
|
|
||||||
node.parent = None
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
"""Return true if the node has children or text"""
|
|
||||||
return bool(self.childNodes)
|
|
||||||
|
|
||||||
def getNameTuple(self):
|
|
||||||
if self.namespace == None:
|
|
||||||
return namespaces["html"], self.name
|
|
||||||
else:
|
|
||||||
return self.namespace, self.name
|
|
||||||
|
|
||||||
nameTuple = property(getNameTuple)
|
|
||||||
|
|
||||||
class Document(Node):
|
|
||||||
type = 1
|
|
||||||
def __init__(self):
|
|
||||||
Node.__init__(self, None)
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return "#document"
|
|
||||||
|
|
||||||
def appendChild(self, child):
|
|
||||||
Node.appendChild(self, child)
|
|
||||||
|
|
||||||
def toxml(self, encoding="utf=8"):
|
|
||||||
result = ""
|
|
||||||
for child in self.childNodes:
|
|
||||||
result += child.toxml()
|
|
||||||
return result.encode(encoding)
|
|
||||||
|
|
||||||
def hilite(self, encoding="utf-8"):
|
|
||||||
result = "<pre>"
|
|
||||||
for child in self.childNodes:
|
|
||||||
result += child.hilite()
|
|
||||||
return result.encode(encoding) + "</pre>"
|
|
||||||
|
|
||||||
def printTree(self):
|
|
||||||
tree = unicode(self)
|
|
||||||
for child in self.childNodes:
|
|
||||||
tree += child.printTree(2)
|
|
||||||
return tree
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return Document()
|
|
||||||
|
|
||||||
class DocumentFragment(Document):
|
|
||||||
type = 2
|
|
||||||
def __unicode__(self):
|
|
||||||
return "#document-fragment"
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return DocumentFragment()
|
|
||||||
|
|
||||||
class DocumentType(Node):
|
|
||||||
type = 3
|
|
||||||
def __init__(self, name, publicId, systemId):
|
|
||||||
Node.__init__(self, name)
|
|
||||||
self.publicId = publicId
|
|
||||||
self.systemId = systemId
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
if self.publicId or self.systemId:
|
|
||||||
publicId = self.publicId or ""
|
|
||||||
systemId = self.systemId or ""
|
|
||||||
return """<!DOCTYPE %s "%s" "%s">"""%(
|
|
||||||
self.name, publicId, systemId)
|
|
||||||
|
|
||||||
else:
|
|
||||||
return u"<!DOCTYPE %s>" % self.name
|
|
||||||
|
|
||||||
|
|
||||||
toxml = __unicode__
|
|
||||||
|
|
||||||
def hilite(self):
|
|
||||||
return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return DocumentType(self.name, self.publicId, self.systemId)
|
|
||||||
|
|
||||||
class TextNode(Node):
|
|
||||||
type = 4
|
|
||||||
def __init__(self, value):
|
|
||||||
Node.__init__(self, None)
|
|
||||||
self.value = value
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return u"\"%s\"" % self.value
|
|
||||||
|
|
||||||
def toxml(self):
|
|
||||||
return escape(self.value)
|
|
||||||
|
|
||||||
hilite = toxml
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return TextNode(self.value)
|
|
||||||
|
|
||||||
class Element(Node):
|
|
||||||
type = 5
|
|
||||||
def __init__(self, name, namespace=None):
|
|
||||||
Node.__init__(self, name)
|
|
||||||
self.namespace = namespace
|
|
||||||
self.attributes = {}
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
if self.namespace == None:
|
|
||||||
return u"<%s>" % self.name
|
|
||||||
else:
|
|
||||||
return u"<%s %s>"%(prefixes[self.namespace], self.name)
|
|
||||||
|
|
||||||
def toxml(self):
|
|
||||||
result = '<' + self.name
|
|
||||||
if self.attributes:
|
|
||||||
for name,value in self.attributes.iteritems():
|
|
||||||
result += u' %s="%s"' % (name, escape(value,{'"':'"'}))
|
|
||||||
if self.childNodes:
|
|
||||||
result += '>'
|
|
||||||
for child in self.childNodes:
|
|
||||||
result += child.toxml()
|
|
||||||
result += u'</%s>' % self.name
|
|
||||||
else:
|
|
||||||
result += u'/>'
|
|
||||||
return result
|
|
||||||
|
|
||||||
def hilite(self):
|
|
||||||
result = '<<code class="markup element-name">%s</code>' % self.name
|
|
||||||
if self.attributes:
|
|
||||||
for name, value in self.attributes.iteritems():
|
|
||||||
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'"'}))
|
|
||||||
if self.childNodes:
|
|
||||||
result += ">"
|
|
||||||
for child in self.childNodes:
|
|
||||||
result += child.hilite()
|
|
||||||
elif self.name in voidElements:
|
|
||||||
return result + ">"
|
|
||||||
return result + '</<code class="markup element-name">%s</code>>' % self.name
|
|
||||||
|
|
||||||
def printTree(self, indent):
|
|
||||||
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
|
||||||
indent += 2
|
|
||||||
if self.attributes:
|
|
||||||
for name, value in self.attributes.iteritems():
|
|
||||||
if isinstance(name, tuple):
|
|
||||||
name = "%s %s"%(name[0], name[1])
|
|
||||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
|
||||||
for child in self.childNodes:
|
|
||||||
tree += child.printTree(indent)
|
|
||||||
return tree
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
newNode = Element(self.name)
|
|
||||||
if hasattr(self, 'namespace'):
|
|
||||||
newNode.namespace = self.namespace
|
|
||||||
for attr, value in self.attributes.iteritems():
|
|
||||||
newNode.attributes[attr] = value
|
|
||||||
return newNode
|
|
||||||
|
|
||||||
class CommentNode(Node):
|
|
||||||
type = 6
|
|
||||||
def __init__(self, data):
|
|
||||||
Node.__init__(self, None)
|
|
||||||
self.data = data
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return "<!-- %s -->" % self.data
|
|
||||||
|
|
||||||
def toxml(self):
|
|
||||||
return "<!--%s-->" % self.data
|
|
||||||
|
|
||||||
def hilite(self):
|
|
||||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return CommentNode(self.data)
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
documentClass = Document
|
|
||||||
doctypeClass = DocumentType
|
|
||||||
elementClass = Element
|
|
||||||
commentClass = CommentNode
|
|
||||||
fragmentClass = DocumentFragment
|
|
||||||
|
|
||||||
def testSerializer(self, node):
|
|
||||||
return node.printTree()
|
|
@ -1,228 +0,0 @@
|
|||||||
import warnings
|
|
||||||
|
|
||||||
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
|
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
|
||||||
|
|
||||||
import _base
|
|
||||||
from html5lib.constants import namespaces, DataLossWarning
|
|
||||||
|
|
||||||
class AttrList(object):
|
|
||||||
def __init__(self, element):
|
|
||||||
self.element = element
|
|
||||||
self.attrs = dict(self.element.attrs)
|
|
||||||
def __iter__(self):
|
|
||||||
return self.attrs.items().__iter__()
|
|
||||||
def __setitem__(self, name, value):
|
|
||||||
"set attr", name, value
|
|
||||||
self.element[name] = value
|
|
||||||
def items(self):
|
|
||||||
return self.attrs.items()
|
|
||||||
def keys(self):
|
|
||||||
return self.attrs.keys()
|
|
||||||
def __getitem__(self, name):
|
|
||||||
return self.attrs[name]
|
|
||||||
def __contains__(self, name):
|
|
||||||
return name in self.attrs.keys()
|
|
||||||
|
|
||||||
|
|
||||||
class Element(_base.Node):
|
|
||||||
def __init__(self, element, soup, namespace):
|
|
||||||
_base.Node.__init__(self, element.name)
|
|
||||||
self.element = element
|
|
||||||
self.soup = soup
|
|
||||||
self.namespace = namespace
|
|
||||||
|
|
||||||
def _nodeIndex(self, node, refNode):
|
|
||||||
# Finds a node by identity rather than equality
|
|
||||||
for index in range(len(self.element.contents)):
|
|
||||||
if id(self.element.contents[index]) == id(refNode.element):
|
|
||||||
return index
|
|
||||||
return None
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
if (node.element.__class__ == NavigableString and self.element.contents
|
|
||||||
and self.element.contents[-1].__class__ == NavigableString):
|
|
||||||
# Concatenate new text onto old text node
|
|
||||||
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
|
|
||||||
newStr = NavigableString(self.element.contents[-1]+node.element)
|
|
||||||
|
|
||||||
# Remove the old text node
|
|
||||||
# (Can't simply use .extract() by itself, because it fails if
|
|
||||||
# an equal text node exists within the parent node)
|
|
||||||
oldElement = self.element.contents[-1]
|
|
||||||
del self.element.contents[-1]
|
|
||||||
oldElement.parent = None
|
|
||||||
oldElement.extract()
|
|
||||||
|
|
||||||
self.element.insert(len(self.element.contents), newStr)
|
|
||||||
else:
|
|
||||||
self.element.insert(len(self.element.contents), node.element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def getAttributes(self):
|
|
||||||
return AttrList(self.element)
|
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
|
||||||
if attributes:
|
|
||||||
for name, value in attributes.items():
|
|
||||||
self.element[name] = value
|
|
||||||
|
|
||||||
attributes = property(getAttributes, setAttributes)
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
text = TextNode(NavigableString(data), self.soup)
|
|
||||||
if insertBefore:
|
|
||||||
self.insertBefore(text, insertBefore)
|
|
||||||
else:
|
|
||||||
self.appendChild(text)
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
index = self._nodeIndex(node, refNode)
|
|
||||||
if (node.element.__class__ == NavigableString and self.element.contents
|
|
||||||
and self.element.contents[index-1].__class__ == NavigableString):
|
|
||||||
# (See comments in appendChild)
|
|
||||||
newStr = NavigableString(self.element.contents[index-1]+node.element)
|
|
||||||
oldNode = self.element.contents[index-1]
|
|
||||||
del self.element.contents[index-1]
|
|
||||||
oldNode.parent = None
|
|
||||||
oldNode.extract()
|
|
||||||
|
|
||||||
self.element.insert(index-1, newStr)
|
|
||||||
else:
|
|
||||||
self.element.insert(index, node.element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
index = self._nodeIndex(node.parent, node)
|
|
||||||
del node.parent.element.contents[index]
|
|
||||||
node.element.parent = None
|
|
||||||
node.element.extract()
|
|
||||||
node.parent = None
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
while self.element.contents:
|
|
||||||
child = self.element.contents[0]
|
|
||||||
child.extract()
|
|
||||||
if isinstance(child, Tag):
|
|
||||||
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
|
|
||||||
else:
|
|
||||||
newParent.appendChild(TextNode(child, self.soup))
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
|
|
||||||
for key,value in self.attributes:
|
|
||||||
node.attributes[key] = value
|
|
||||||
return node
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
return self.element.contents
|
|
||||||
|
|
||||||
def getNameTuple(self):
|
|
||||||
if self.namespace == None:
|
|
||||||
return namespaces["html"], self.name
|
|
||||||
else:
|
|
||||||
return self.namespace, self.name
|
|
||||||
|
|
||||||
nameTuple = property(getNameTuple)
|
|
||||||
|
|
||||||
class TextNode(Element):
|
|
||||||
def __init__(self, element, soup):
|
|
||||||
_base.Node.__init__(self, None)
|
|
||||||
self.element = element
|
|
||||||
self.soup = soup
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
def __init__(self, namespaceHTMLElements):
|
|
||||||
if namespaceHTMLElements:
|
|
||||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
|
||||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
|
||||||
|
|
||||||
def documentClass(self):
|
|
||||||
self.soup = BeautifulSoup("")
|
|
||||||
return Element(self.soup, self.soup, None)
|
|
||||||
|
|
||||||
def insertDoctype(self, token):
|
|
||||||
name = token["name"]
|
|
||||||
publicId = token["publicId"]
|
|
||||||
systemId = token["systemId"]
|
|
||||||
|
|
||||||
if publicId:
|
|
||||||
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
|
||||||
elif systemId:
|
|
||||||
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
|
|
||||||
(name, systemId)))
|
|
||||||
else:
|
|
||||||
self.soup.insert(0, Declaration(name))
|
|
||||||
|
|
||||||
def elementClass(self, name, namespace):
|
|
||||||
if namespace is not None:
|
|
||||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
|
||||||
return Element(Tag(self.soup, name), self.soup, namespace)
|
|
||||||
|
|
||||||
def commentClass(self, data):
|
|
||||||
return TextNode(Comment(data), self.soup)
|
|
||||||
|
|
||||||
def fragmentClass(self):
|
|
||||||
self.soup = BeautifulSoup("")
|
|
||||||
self.soup.name = "[document_fragment]"
|
|
||||||
return Element(self.soup, self.soup, None)
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
self.soup.insert(len(self.soup.contents), node.element)
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
return self.soup
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
return _base.TreeBuilder.getFragment(self).element
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
import re
|
|
||||||
rv = []
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if isinstance(element, Declaration):
|
|
||||||
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
|
||||||
m = re.compile(doctype_regexp).match(element.string)
|
|
||||||
assert m is not None, "DOCTYPE did not match expected format"
|
|
||||||
name = m.group('name')
|
|
||||||
publicId = m.group('publicId')
|
|
||||||
if publicId is not None:
|
|
||||||
systemId = m.group('systemId1') or ""
|
|
||||||
else:
|
|
||||||
systemId = m.group('systemId2')
|
|
||||||
|
|
||||||
if publicId is not None or systemId is not None:
|
|
||||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
|
|
||||||
(' '*indent, name, publicId or "", systemId or ""))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
|
|
||||||
|
|
||||||
elif isinstance(element, BeautifulSoup):
|
|
||||||
if element.name == "[document_fragment]":
|
|
||||||
rv.append("#document-fragment")
|
|
||||||
else:
|
|
||||||
rv.append("#document")
|
|
||||||
|
|
||||||
elif isinstance(element, Comment):
|
|
||||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
|
|
||||||
elif isinstance(element, unicode):
|
|
||||||
rv.append("|%s\"%s\"" %(' '*indent, element))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<%s>"%(' '*indent, element.name))
|
|
||||||
if element.attrs:
|
|
||||||
for name, value in element.attrs:
|
|
||||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
|
||||||
indent += 2
|
|
||||||
if hasattr(element, "contents"):
|
|
||||||
for child in element.contents:
|
|
||||||
serializeElement(child, indent)
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
@ -8,23 +8,27 @@ implements a 'serialize' method taking a tree as sole argument and
|
|||||||
returning an iterator generating tokens.
|
returning an iterator generating tokens.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from ..utils import default_etree
|
||||||
|
|
||||||
treeWalkerCache = {}
|
treeWalkerCache = {}
|
||||||
|
|
||||||
|
|
||||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||||
"""Get a TreeWalker class for various types of tree with built-in support
|
"""Get a TreeWalker class for various types of tree with built-in support
|
||||||
|
|
||||||
treeType - the name of the tree type required (case-insensitive). Supported
|
treeType - the name of the tree type required (case-insensitive). Supported
|
||||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
values are:
|
||||||
|
|
||||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
|
||||||
more pythonic idioms.
|
|
||||||
"dom" - The xml.dom.minidom DOM implementation
|
"dom" - The xml.dom.minidom DOM implementation
|
||||||
"pulldom" - The xml.dom.pulldom event stream
|
"pulldom" - The xml.dom.pulldom event stream
|
||||||
"etree" - A generic walker for tree implementations exposing an
|
"etree" - A generic walker for tree implementations exposing an
|
||||||
elementtree-like interface (known to work with
|
elementtree-like interface (known to work with
|
||||||
ElementTree, cElementTree and lxml.etree).
|
ElementTree, cElementTree and lxml.etree).
|
||||||
"lxml" - Optimized walker for lxml.etree
|
"lxml" - Optimized walker for lxml.etree
|
||||||
"beautifulsoup" - Beautiful soup (if installed)
|
|
||||||
"genshi" - a Genshi stream
|
"genshi" - a Genshi stream
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" tree type only). A module
|
implementation - (Currently applies to the "etree" tree type only). A module
|
||||||
@ -33,20 +37,21 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
|||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
if treeType not in treeWalkerCache:
|
if treeType not in treeWalkerCache:
|
||||||
if treeType in ("dom", "pulldom", "simpletree"):
|
if treeType in ("dom", "pulldom"):
|
||||||
mod = __import__(treeType, globals())
|
name = "%s.%s" % (__name__, treeType)
|
||||||
|
__import__(name)
|
||||||
|
mod = sys.modules[name]
|
||||||
treeWalkerCache[treeType] = mod.TreeWalker
|
treeWalkerCache[treeType] = mod.TreeWalker
|
||||||
elif treeType == "genshi":
|
elif treeType == "genshi":
|
||||||
import genshistream
|
from . import genshistream
|
||||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||||
elif treeType == "beautifulsoup":
|
|
||||||
import soup
|
|
||||||
treeWalkerCache[treeType] = soup.TreeWalker
|
|
||||||
elif treeType == "lxml":
|
elif treeType == "lxml":
|
||||||
import lxmletree
|
from . import lxmletree
|
||||||
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
||||||
elif treeType == "etree":
|
elif treeType == "etree":
|
||||||
import etree
|
from . import etree
|
||||||
|
if implementation is None:
|
||||||
|
implementation = default_etree
|
||||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||||
return treeWalkerCache.get(treeType)
|
return treeWalkerCache.get(treeType)
|
||||||
|
@ -1,8 +1,40 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
from six import text_type, string_types
|
||||||
|
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
from html5lib.constants import voidElements, spaceCharacters
|
from xml.dom import Node
|
||||||
spaceCharacters = u"".join(spaceCharacters)
|
|
||||||
|
DOCUMENT = Node.DOCUMENT_NODE
|
||||||
|
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||||
|
TEXT = Node.TEXT_NODE
|
||||||
|
ELEMENT = Node.ELEMENT_NODE
|
||||||
|
COMMENT = Node.COMMENT_NODE
|
||||||
|
ENTITY = Node.ENTITY_NODE
|
||||||
|
UNKNOWN = "<#UNKNOWN#>"
|
||||||
|
|
||||||
|
from ..constants import voidElements, spaceCharacters
|
||||||
|
spaceCharacters = "".join(spaceCharacters)
|
||||||
|
|
||||||
|
|
||||||
|
def to_text(s, blank_if_none=True):
|
||||||
|
"""Wrapper around six.text_type to convert None to empty string"""
|
||||||
|
if s is None:
|
||||||
|
if blank_if_none:
|
||||||
|
return ""
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
elif isinstance(s, text_type):
|
||||||
|
return s
|
||||||
|
else:
|
||||||
|
return text_type(s)
|
||||||
|
|
||||||
|
|
||||||
|
def is_text_or_none(string):
|
||||||
|
"""Wrapper around isinstance(string_types) or is None"""
|
||||||
|
return string is None or isinstance(string, string_types)
|
||||||
|
|
||||||
|
|
||||||
class TreeWalker(object):
|
class TreeWalker(object):
|
||||||
def __init__(self, tree):
|
def __init__(self, tree):
|
||||||
@ -14,34 +46,48 @@ class TreeWalker(object):
|
|||||||
def error(self, msg):
|
def error(self, msg):
|
||||||
return {"type": "SerializeError", "data": msg}
|
return {"type": "SerializeError", "data": msg}
|
||||||
|
|
||||||
def normalizeAttrs(self, attrs):
|
|
||||||
if not attrs:
|
|
||||||
attrs = []
|
|
||||||
elif hasattr(attrs, 'items'):
|
|
||||||
attrs = attrs.items()
|
|
||||||
return [(unicode(name),unicode(value)) for name,value in attrs]
|
|
||||||
|
|
||||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||||
yield {"type": "EmptyTag", "name": unicode(name),
|
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||||
"namespace":unicode(namespace),
|
assert isinstance(name, string_types), type(name)
|
||||||
"data": self.normalizeAttrs(attrs)}
|
assert all((namespace is None or isinstance(namespace, string_types)) and
|
||||||
|
isinstance(name, string_types) and
|
||||||
|
isinstance(value, string_types)
|
||||||
|
for (namespace, name), value in attrs.items())
|
||||||
|
|
||||||
|
yield {"type": "EmptyTag", "name": to_text(name, False),
|
||||||
|
"namespace": to_text(namespace),
|
||||||
|
"data": attrs}
|
||||||
if hasChildren:
|
if hasChildren:
|
||||||
yield self.error(_("Void element has children"))
|
yield self.error(_("Void element has children"))
|
||||||
|
|
||||||
def startTag(self, namespace, name, attrs):
|
def startTag(self, namespace, name, attrs):
|
||||||
|
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||||
|
assert isinstance(name, string_types), type(name)
|
||||||
|
assert all((namespace is None or isinstance(namespace, string_types)) and
|
||||||
|
isinstance(name, string_types) and
|
||||||
|
isinstance(value, string_types)
|
||||||
|
for (namespace, name), value in attrs.items())
|
||||||
|
|
||||||
return {"type": "StartTag",
|
return {"type": "StartTag",
|
||||||
"name": unicode(name),
|
"name": text_type(name),
|
||||||
"namespace":unicode(namespace),
|
"namespace": to_text(namespace),
|
||||||
"data": self.normalizeAttrs(attrs)}
|
"data": dict(((to_text(namespace, False), to_text(name)),
|
||||||
|
to_text(value, False))
|
||||||
|
for (namespace, name), value in attrs.items())}
|
||||||
|
|
||||||
def endTag(self, namespace, name):
|
def endTag(self, namespace, name):
|
||||||
|
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||||
|
assert isinstance(name, string_types), type(namespace)
|
||||||
|
|
||||||
return {"type": "EndTag",
|
return {"type": "EndTag",
|
||||||
"name": unicode(name),
|
"name": to_text(name, False),
|
||||||
"namespace":unicode(namespace),
|
"namespace": to_text(namespace),
|
||||||
"data": []}
|
"data": {}}
|
||||||
|
|
||||||
def text(self, data):
|
def text(self, data):
|
||||||
data = unicode(data)
|
assert isinstance(data, string_types), type(data)
|
||||||
|
|
||||||
|
data = to_text(data)
|
||||||
middle = data.lstrip(spaceCharacters)
|
middle = data.lstrip(spaceCharacters)
|
||||||
left = data[:len(data) - len(middle)]
|
left = data[:len(data) - len(middle)]
|
||||||
if left:
|
if left:
|
||||||
@ -55,41 +101,29 @@ class TreeWalker(object):
|
|||||||
yield {"type": "SpaceCharacters", "data": right}
|
yield {"type": "SpaceCharacters", "data": right}
|
||||||
|
|
||||||
def comment(self, data):
|
def comment(self, data):
|
||||||
return {"type": "Comment", "data": unicode(data)}
|
assert isinstance(data, string_types), type(data)
|
||||||
|
|
||||||
|
return {"type": "Comment", "data": text_type(data)}
|
||||||
|
|
||||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
||||||
|
assert is_text_or_none(name), type(name)
|
||||||
|
assert is_text_or_none(publicId), type(publicId)
|
||||||
|
assert is_text_or_none(systemId), type(systemId)
|
||||||
|
|
||||||
return {"type": "Doctype",
|
return {"type": "Doctype",
|
||||||
"name": name is not None and unicode(name) or u"",
|
"name": to_text(name),
|
||||||
"publicId": publicId,
|
"publicId": to_text(publicId),
|
||||||
"systemId": systemId,
|
"systemId": to_text(systemId),
|
||||||
"correct": correct}
|
"correct": to_text(correct)}
|
||||||
|
|
||||||
|
def entity(self, name):
|
||||||
|
assert isinstance(name, string_types), type(name)
|
||||||
|
|
||||||
|
return {"type": "Entity", "name": text_type(name)}
|
||||||
|
|
||||||
def unknown(self, nodeType):
|
def unknown(self, nodeType):
|
||||||
return self.error(_("Unknown node type: ") + nodeType)
|
return self.error(_("Unknown node type: ") + nodeType)
|
||||||
|
|
||||||
class RecursiveTreeWalker(TreeWalker):
|
|
||||||
def walkChildren(self, node):
|
|
||||||
raise NodeImplementedError
|
|
||||||
|
|
||||||
def element(self, node, namespace, name, attrs, hasChildren):
|
|
||||||
if name in voidElements:
|
|
||||||
for token in self.emptyTag(namespace, name, attrs, hasChildren):
|
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
yield self.startTag(name, attrs)
|
|
||||||
if hasChildren:
|
|
||||||
for token in self.walkChildren(node):
|
|
||||||
yield token
|
|
||||||
yield self.endTag(name)
|
|
||||||
|
|
||||||
from xml.dom import Node
|
|
||||||
|
|
||||||
DOCUMENT = Node.DOCUMENT_NODE
|
|
||||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
|
||||||
TEXT = Node.TEXT_NODE
|
|
||||||
ELEMENT = Node.ELEMENT_NODE
|
|
||||||
COMMENT = Node.COMMENT_NODE
|
|
||||||
UNKNOWN = "<#UNKNOWN#>"
|
|
||||||
|
|
||||||
class NonRecursiveTreeWalker(TreeWalker):
|
class NonRecursiveTreeWalker(TreeWalker):
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
@ -110,7 +144,6 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||||||
details = self.getNodeDetails(currentNode)
|
details = self.getNodeDetails(currentNode)
|
||||||
type, details = details[0], details[1:]
|
type, details = details[0], details[1:]
|
||||||
hasChildren = False
|
hasChildren = False
|
||||||
endTag = None
|
|
||||||
|
|
||||||
if type == DOCTYPE:
|
if type == DOCTYPE:
|
||||||
yield self.doctype(*details)
|
yield self.doctype(*details)
|
||||||
@ -127,12 +160,14 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||||||
yield token
|
yield token
|
||||||
hasChildren = False
|
hasChildren = False
|
||||||
else:
|
else:
|
||||||
endTag = name
|
|
||||||
yield self.startTag(namespace, name, attributes)
|
yield self.startTag(namespace, name, attributes)
|
||||||
|
|
||||||
elif type == COMMENT:
|
elif type == COMMENT:
|
||||||
yield self.comment(details[0])
|
yield self.comment(details[0])
|
||||||
|
|
||||||
|
elif type == ENTITY:
|
||||||
|
yield self.entity(details[0])
|
||||||
|
|
||||||
elif type == DOCUMENT:
|
elif type == DOCUMENT:
|
||||||
hasChildren = True
|
hasChildren = True
|
||||||
|
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from xml.dom import Node
|
from xml.dom import Node
|
||||||
|
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
from html5lib.constants import voidElements
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
@ -15,8 +17,15 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
return _base.TEXT, node.nodeValue
|
return _base.TEXT, node.nodeValue
|
||||||
|
|
||||||
elif node.nodeType == Node.ELEMENT_NODE:
|
elif node.nodeType == Node.ELEMENT_NODE:
|
||||||
|
attrs = {}
|
||||||
|
for attr in list(node.attributes.keys()):
|
||||||
|
attr = node.getAttributeNode(attr)
|
||||||
|
if attr.namespaceURI:
|
||||||
|
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
||||||
|
else:
|
||||||
|
attrs[(None, attr.name)] = attr.value
|
||||||
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||||
node.attributes.items(), node.hasChildNodes)
|
attrs, node.hasChildNodes())
|
||||||
|
|
||||||
elif node.nodeType == Node.COMMENT_NODE:
|
elif node.nodeType == Node.COMMENT_NODE:
|
||||||
return _base.COMMENT, node.nodeValue
|
return _base.COMMENT, node.nodeValue
|
||||||
|
@ -1,30 +1,28 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections import OrderedDict
|
||||||
|
except ImportError:
|
||||||
|
try:
|
||||||
|
from ordereddict import OrderedDict
|
||||||
|
except ImportError:
|
||||||
|
OrderedDict = dict
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
import new
|
|
||||||
import copy
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import _base
|
from six import text_type
|
||||||
from html5lib.constants import voidElements
|
|
||||||
|
from . import _base
|
||||||
|
from ..utils import moduleFactoryFactory
|
||||||
|
|
||||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||||
|
|
||||||
moduleCache = {}
|
|
||||||
|
|
||||||
def getETreeModule(ElementTreeImplementation):
|
|
||||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
|
||||||
if name in moduleCache:
|
|
||||||
return moduleCache[name]
|
|
||||||
else:
|
|
||||||
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
|
||||||
objs = getETreeBuilder(ElementTreeImplementation)
|
|
||||||
mod.__dict__.update(objs)
|
|
||||||
moduleCache[name] = mod
|
|
||||||
return mod
|
|
||||||
|
|
||||||
def getETreeBuilder(ElementTreeImplementation):
|
def getETreeBuilder(ElementTreeImplementation):
|
||||||
ElementTree = ElementTreeImplementation
|
ElementTree = ElementTreeImplementation
|
||||||
|
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
"""Given the particular ElementTree representation, this implementation,
|
"""Given the particular ElementTree representation, this implementation,
|
||||||
@ -51,17 +49,18 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||||||
if not(hasattr(node, "tag")):
|
if not(hasattr(node, "tag")):
|
||||||
node = node.getroot()
|
node = node.getroot()
|
||||||
|
|
||||||
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
|
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
|
||||||
return (_base.DOCUMENT,)
|
return (_base.DOCUMENT,)
|
||||||
|
|
||||||
elif node.tag == "<!DOCTYPE>":
|
elif node.tag == "<!DOCTYPE>":
|
||||||
return (_base.DOCTYPE, node.text,
|
return (_base.DOCTYPE, node.text,
|
||||||
node.get("publicId"), node.get("systemId"))
|
node.get("publicId"), node.get("systemId"))
|
||||||
|
|
||||||
elif type(node.tag) == type(ElementTree.Comment):
|
elif node.tag == ElementTreeCommentType:
|
||||||
return _base.COMMENT, node.text
|
return _base.COMMENT, node.text
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
assert type(node.tag) == text_type, type(node.tag)
|
||||||
# This is assumed to be an ordinary element
|
# This is assumed to be an ordinary element
|
||||||
match = tag_regexp.match(node.tag)
|
match = tag_regexp.match(node.tag)
|
||||||
if match:
|
if match:
|
||||||
@ -69,8 +68,15 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||||||
else:
|
else:
|
||||||
namespace = None
|
namespace = None
|
||||||
tag = node.tag
|
tag = node.tag
|
||||||
|
attrs = OrderedDict()
|
||||||
|
for name, value in list(node.attrib.items()):
|
||||||
|
match = tag_regexp.match(name)
|
||||||
|
if match:
|
||||||
|
attrs[(match.group(1), match.group(2))] = value
|
||||||
|
else:
|
||||||
|
attrs[(None, name)] = value
|
||||||
return (_base.ELEMENT, namespace, tag,
|
return (_base.ELEMENT, namespace, tag,
|
||||||
node.attrib.items(), len(node) or node.text)
|
attrs, len(node) or node.text)
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
def getFirstChild(self, node):
|
||||||
if isinstance(node, tuple):
|
if isinstance(node, tuple):
|
||||||
@ -128,3 +134,5 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||||||
return parent, list(parents[-1]).index(parent), parents, None
|
return parent, list(parents[-1]).index(parent), parents, None
|
||||||
|
|
||||||
return locals()
|
return locals()
|
||||||
|
|
||||||
|
getETreeModule = moduleFactoryFactory(getETreeBuilder)
|
||||||
|
@ -1,50 +1,49 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from genshi.core import QName
|
||||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
||||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||||
from genshi.output import NamespaceFlattener
|
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
|
|
||||||
|
from ..constants import voidElements, namespaces
|
||||||
|
|
||||||
from html5lib.constants import voidElements
|
|
||||||
|
|
||||||
class TreeWalker(_base.TreeWalker):
|
class TreeWalker(_base.TreeWalker):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
depth = 0
|
# Buffer the events so we can pass in the following one
|
||||||
ignore_until = None
|
|
||||||
previous = None
|
previous = None
|
||||||
for event in self.tree:
|
for event in self.tree:
|
||||||
if previous is not None:
|
if previous is not None:
|
||||||
if previous[0] == START:
|
|
||||||
depth += 1
|
|
||||||
if ignore_until <= depth:
|
|
||||||
ignore_until = None
|
|
||||||
if ignore_until is None:
|
|
||||||
for token in self.tokens(previous, event):
|
for token in self.tokens(previous, event):
|
||||||
yield token
|
yield token
|
||||||
if token["type"] == "EmptyTag":
|
|
||||||
ignore_until = depth
|
|
||||||
if previous[0] == END:
|
|
||||||
depth -= 1
|
|
||||||
previous = event
|
previous = event
|
||||||
|
|
||||||
|
# Don't forget the final event!
|
||||||
if previous is not None:
|
if previous is not None:
|
||||||
if ignore_until is None or ignore_until <= depth:
|
|
||||||
for token in self.tokens(previous, None):
|
for token in self.tokens(previous, None):
|
||||||
yield token
|
yield token
|
||||||
elif ignore_until is not None:
|
|
||||||
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
|
|
||||||
|
|
||||||
def tokens(self, event, next):
|
def tokens(self, event, next):
|
||||||
kind, data, pos = event
|
kind, data, pos = event
|
||||||
if kind == START:
|
if kind == START:
|
||||||
tag, attrib = data
|
tag, attribs = data
|
||||||
name = tag.localname
|
name = tag.localname
|
||||||
namespace = tag.namespace
|
namespace = tag.namespace
|
||||||
if tag in voidElements:
|
converted_attribs = {}
|
||||||
for token in self.emptyTag(namespace, name, list(attrib),
|
for k, v in attribs:
|
||||||
|
if isinstance(k, QName):
|
||||||
|
converted_attribs[(k.namespace, k.localname)] = v
|
||||||
|
else:
|
||||||
|
converted_attribs[(None, k)] = v
|
||||||
|
|
||||||
|
if namespace == namespaces["html"] and name in voidElements:
|
||||||
|
for token in self.emptyTag(namespace, name, converted_attribs,
|
||||||
not next or next[0] != END
|
not next or next[0] != END
|
||||||
or next[1] != tag):
|
or next[1] != tag):
|
||||||
yield token
|
yield token
|
||||||
else:
|
else:
|
||||||
yield self.startTag(namespace, name, list(attrib))
|
yield self.startTag(namespace, name, converted_attribs)
|
||||||
|
|
||||||
elif kind == END:
|
elif kind == END:
|
||||||
name = data.localname
|
name = data.localname
|
||||||
@ -62,7 +61,7 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
elif kind == DOCTYPE:
|
elif kind == DOCTYPE:
|
||||||
yield self.doctype(*data)
|
yield self.doctype(*data)
|
||||||
|
|
||||||
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
|
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
|
||||||
START_CDATA, END_CDATA, PI):
|
START_CDATA, END_CDATA, PI):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -1,22 +1,35 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from html5lib.treebuilders.etree import tag_regexp
|
from ..treebuilders.etree import tag_regexp
|
||||||
|
|
||||||
from gettext import gettext
|
from gettext import gettext
|
||||||
_ = gettext
|
_ = gettext
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
|
|
||||||
|
from .. import ihatexml
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_str(s):
    """Return *s* as text, decoding byte strings as strict UTF-8.

    ``None`` and values that are already ``text_type`` are handed back
    unchanged; anything else is decoded with ``.decode("utf-8", "strict")``.
    """
    # Nothing to convert: either there is no value or it is already text.
    if s is None or isinstance(s, text_type):
        return s
    return s.decode("utf-8", "strict")
|
||||||
|
|
||||||
from html5lib.constants import voidElements
|
|
||||||
from html5lib import ihatexml
|
|
||||||
|
|
||||||
class Root(object):
|
class Root(object):
|
||||||
def __init__(self, et):
|
def __init__(self, et):
|
||||||
self.elementtree = et
|
self.elementtree = et
|
||||||
self.children = []
|
self.children = []
|
||||||
if et.docinfo.internalDTD:
|
if et.docinfo.internalDTD:
|
||||||
self.children.append(Doctype(self, et.docinfo.root_name,
|
self.children.append(Doctype(self,
|
||||||
et.docinfo.public_id,
|
ensure_str(et.docinfo.root_name),
|
||||||
et.docinfo.system_url))
|
ensure_str(et.docinfo.public_id),
|
||||||
|
ensure_str(et.docinfo.system_url)))
|
||||||
root = et.getroot()
|
root = et.getroot()
|
||||||
node = root
|
node = root
|
||||||
|
|
||||||
@ -38,6 +51,7 @@ class Root(object):
|
|||||||
def __len__(self):
|
def __len__(self):
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class Doctype(object):
|
class Doctype(object):
|
||||||
def __init__(self, root_node, name, public_id, system_id):
|
def __init__(self, root_node, name, public_id, system_id):
|
||||||
self.root_node = root_node
|
self.root_node = root_node
|
||||||
@ -51,6 +65,7 @@ class Doctype(object):
|
|||||||
def getnext(self):
|
def getnext(self):
|
||||||
return self.root_node.children[1]
|
return self.root_node.children[1]
|
||||||
|
|
||||||
|
|
||||||
class FragmentRoot(Root):
|
class FragmentRoot(Root):
|
||||||
def __init__(self, children):
|
def __init__(self, children):
|
||||||
self.children = [FragmentWrapper(self, child) for child in children]
|
self.children = [FragmentWrapper(self, child) for child in children]
|
||||||
@ -59,19 +74,23 @@ class FragmentRoot(Root):
|
|||||||
def getnext(self):
|
def getnext(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class FragmentWrapper(object):
|
class FragmentWrapper(object):
|
||||||
def __init__(self, fragment_root, obj):
|
def __init__(self, fragment_root, obj):
|
||||||
self.root_node = fragment_root
|
self.root_node = fragment_root
|
||||||
self.obj = obj
|
self.obj = obj
|
||||||
if hasattr(self.obj, 'text'):
|
if hasattr(self.obj, 'text'):
|
||||||
self.text = self.obj.text
|
self.text = ensure_str(self.obj.text)
|
||||||
else:
|
else:
|
||||||
self.text = None
|
self.text = None
|
||||||
if hasattr(self.obj, 'tail'):
|
if hasattr(self.obj, 'tail'):
|
||||||
self.tail = self.obj.tail
|
self.tail = ensure_str(self.obj.tail)
|
||||||
else:
|
else:
|
||||||
self.tail = None
|
self.tail = None
|
||||||
self.isstring = isinstance(obj, basestring)
|
self.isstring = isinstance(obj, str) or isinstance(obj, bytes)
|
||||||
|
# Support for bytes here is Py2
|
||||||
|
if self.isstring:
|
||||||
|
self.obj = ensure_str(self.obj)
|
||||||
|
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
return getattr(self.obj, name)
|
return getattr(self.obj, name)
|
||||||
@ -87,7 +106,7 @@ class FragmentWrapper(object):
|
|||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
return self.obj[key]
|
return self.obj[key]
|
||||||
|
|
||||||
def __nonzero__(self):
|
def __bool__(self):
|
||||||
return bool(self.obj)
|
return bool(self.obj)
|
||||||
|
|
||||||
def getparent(self):
|
def getparent(self):
|
||||||
@ -96,6 +115,9 @@ class FragmentWrapper(object):
|
|||||||
def __str__(self):
|
def __str__(self):
|
||||||
return str(self.obj)
|
return str(self.obj)
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return str(self.obj)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.obj)
|
return len(self.obj)
|
||||||
|
|
||||||
@ -108,11 +130,12 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
tree = FragmentRoot(tree)
|
tree = FragmentRoot(tree)
|
||||||
_base.NonRecursiveTreeWalker.__init__(self, tree)
|
_base.NonRecursiveTreeWalker.__init__(self, tree)
|
||||||
self.filter = ihatexml.InfosetFilter()
|
self.filter = ihatexml.InfosetFilter()
|
||||||
|
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
if isinstance(node, tuple): # Text node
|
if isinstance(node, tuple): # Text node
|
||||||
node, key = node
|
node, key = node
|
||||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||||
return _base.TEXT, getattr(node, key)
|
return _base.TEXT, ensure_str(getattr(node, key))
|
||||||
|
|
||||||
elif isinstance(node, Root):
|
elif isinstance(node, Root):
|
||||||
return (_base.DOCUMENT,)
|
return (_base.DOCUMENT,)
|
||||||
@ -121,23 +144,33 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
return _base.DOCTYPE, node.name, node.public_id, node.system_id
|
return _base.DOCTYPE, node.name, node.public_id, node.system_id
|
||||||
|
|
||||||
elif isinstance(node, FragmentWrapper) and node.isstring:
|
elif isinstance(node, FragmentWrapper) and node.isstring:
|
||||||
return _base.TEXT, node
|
return _base.TEXT, node.obj
|
||||||
|
|
||||||
elif node.tag == etree.Comment:
|
elif node.tag == etree.Comment:
|
||||||
return _base.COMMENT, node.text
|
return _base.COMMENT, ensure_str(node.text)
|
||||||
|
|
||||||
|
elif node.tag == etree.Entity:
|
||||||
|
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# This is assumed to be an ordinary element
|
# This is assumed to be an ordinary element
|
||||||
match = tag_regexp.match(node.tag)
|
match = tag_regexp.match(ensure_str(node.tag))
|
||||||
if match:
|
if match:
|
||||||
namespace, tag = match.groups()
|
namespace, tag = match.groups()
|
||||||
else:
|
else:
|
||||||
namespace = None
|
namespace = None
|
||||||
tag = node.tag
|
tag = ensure_str(node.tag)
|
||||||
|
attrs = {}
|
||||||
|
for name, value in list(node.attrib.items()):
|
||||||
|
name = ensure_str(name)
|
||||||
|
value = ensure_str(value)
|
||||||
|
match = tag_regexp.match(name)
|
||||||
|
if match:
|
||||||
|
attrs[(match.group(1), match.group(2))] = value
|
||||||
|
else:
|
||||||
|
attrs[(None, name)] = value
|
||||||
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
||||||
[(self.filter.fromXmlName(name), value) for
|
attrs, len(node) > 0 or node.text)
|
||||||
name,value in node.attrib.iteritems()],
|
|
||||||
len(node) > 0 or node.text)
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
def getFirstChild(self, node):
|
||||||
assert not isinstance(node, tuple), _("Text nodes have no children")
|
assert not isinstance(node, tuple), _("Text nodes have no children")
|
||||||
@ -162,7 +195,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
else: # tail
|
else: # tail
|
||||||
return node.getnext()
|
return node.getnext()
|
||||||
|
|
||||||
return node.tail and (node, "tail") or node.getnext()
|
return (node, "tail") if node.tail else node.getnext()
|
||||||
|
|
||||||
def getParentNode(self, node):
|
def getParentNode(self, node):
|
||||||
if isinstance(node, tuple): # Text node
|
if isinstance(node, tuple): # Text node
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
||||||
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
||||||
|
|
||||||
import _base
|
from . import _base
|
||||||
|
|
||||||
|
from ..constants import voidElements
|
||||||
|
|
||||||
from html5lib.constants import voidElements
|
|
||||||
|
|
||||||
class TreeWalker(_base.TreeWalker):
|
class TreeWalker(_base.TreeWalker):
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
@ -30,14 +33,18 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
if type == START_ELEMENT:
|
if type == START_ELEMENT:
|
||||||
name = node.nodeName
|
name = node.nodeName
|
||||||
namespace = node.namespaceURI
|
namespace = node.namespaceURI
|
||||||
|
attrs = {}
|
||||||
|
for attr in list(node.attributes.keys()):
|
||||||
|
attr = node.getAttributeNode(attr)
|
||||||
|
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
for token in self.emptyTag(namespace,
|
for token in self.emptyTag(namespace,
|
||||||
name,
|
name,
|
||||||
node.attributes.items(),
|
attrs,
|
||||||
not next or next[1] is not node):
|
not next or next[1] is not node):
|
||||||
yield token
|
yield token
|
||||||
else:
|
else:
|
||||||
yield self.startTag(namespace, name, node.attributes.items())
|
yield self.startTag(namespace, name, attrs)
|
||||||
|
|
||||||
elif type == END_ELEMENT:
|
elif type == END_ELEMENT:
|
||||||
name = node.nodeName
|
name = node.nodeName
|
||||||
|
@ -1,72 +0,0 @@
|
|||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
import _base
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
|
||||||
"""Given that simpletree has no performant way of getting a node's
|
|
||||||
next sibling, this implementation returns "nodes" as tuples with the
|
|
||||||
following content:
|
|
||||||
|
|
||||||
1. The parent Node (Element, Document or DocumentFragment)
|
|
||||||
|
|
||||||
2. The child index of the current node in its parent's children list
|
|
||||||
|
|
||||||
3. A list used as a stack of all ancestors. It is a pair tuple whose
|
|
||||||
first item is a parent Node and second item is a child index.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
if isinstance(node, tuple): # It might be the root Node
|
|
||||||
parent, idx, parents = node
|
|
||||||
node = parent.childNodes[idx]
|
|
||||||
|
|
||||||
# testing node.type allows us not to import treebuilders.simpletree
|
|
||||||
if node.type in (1, 2): # Document or DocumentFragment
|
|
||||||
return (_base.DOCUMENT,)
|
|
||||||
|
|
||||||
elif node.type == 3: # DocumentType
|
|
||||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
|
||||||
|
|
||||||
elif node.type == 4: # TextNode
|
|
||||||
return _base.TEXT, node.value
|
|
||||||
|
|
||||||
elif node.type == 5: # Element
|
|
||||||
return (_base.ELEMENT, node.namespace, node.name,
|
|
||||||
node.attributes.items(), node.hasContent())
|
|
||||||
|
|
||||||
elif node.type == 6: # CommentNode
|
|
||||||
return _base.COMMENT, node.data
|
|
||||||
|
|
||||||
else:
|
|
||||||
return _node.UNKNOWN, node.type
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
if isinstance(node, tuple): # It might be the root Node
|
|
||||||
parent, idx, parents = node
|
|
||||||
parents.append((parent, idx))
|
|
||||||
node = parent.childNodes[idx]
|
|
||||||
else:
|
|
||||||
parents = []
|
|
||||||
|
|
||||||
assert node.hasContent(), "Node has no children"
|
|
||||||
return (node, 0, parents)
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
|
||||||
parent, idx, parents = node
|
|
||||||
idx += 1
|
|
||||||
if len(parent.childNodes) > idx:
|
|
||||||
return (parent, idx, parents)
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
assert isinstance(node, tuple)
|
|
||||||
parent, idx, parents = node
|
|
||||||
if parents:
|
|
||||||
parent, idx = parents.pop()
|
|
||||||
return parent, idx, parents
|
|
||||||
else:
|
|
||||||
# HACK: We could return ``parent`` but None will stop the algorithm the same way
|
|
||||||
return None
|
|
@ -1,59 +0,0 @@
|
|||||||
import re
|
|
||||||
import gettext
|
|
||||||
_ = gettext.gettext
|
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
|
||||||
from html5lib.constants import namespaces
|
|
||||||
import _base
|
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
|
||||||
doctype_regexp = re.compile(
|
|
||||||
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
|
||||||
def getNodeDetails(self, node):
|
|
||||||
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
|
||||||
return (_base.DOCUMENT,)
|
|
||||||
|
|
||||||
elif isinstance(node, Declaration): # DocumentType
|
|
||||||
string = unicode(node.string)
|
|
||||||
#Slice needed to remove markup added during unicode conversion,
|
|
||||||
#but only in some versions of BeautifulSoup/Python
|
|
||||||
if string.startswith('<!') and string.endswith('>'):
|
|
||||||
string = string[2:-1]
|
|
||||||
m = self.doctype_regexp.match(string)
|
|
||||||
#This regexp approach seems wrong and fragile
|
|
||||||
#but beautiful soup stores the doctype as a single thing and we want the seperate bits
|
|
||||||
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
|
|
||||||
#been modified at all
|
|
||||||
#We could just feed to it a html5lib tokenizer, I guess...
|
|
||||||
assert m is not None, "DOCTYPE did not match expected format"
|
|
||||||
name = m.group('name')
|
|
||||||
publicId = m.group('publicId')
|
|
||||||
if publicId is not None:
|
|
||||||
systemId = m.group('systemId1')
|
|
||||||
else:
|
|
||||||
systemId = m.group('systemId2')
|
|
||||||
return _base.DOCTYPE, name, publicId or "", systemId or ""
|
|
||||||
|
|
||||||
elif isinstance(node, Comment):
|
|
||||||
string = unicode(node.string)
|
|
||||||
if string.startswith('<!--') and string.endswith('-->'):
|
|
||||||
string = string[4:-3]
|
|
||||||
return _base.COMMENT, string
|
|
||||||
|
|
||||||
elif isinstance(node, unicode): # TextNode
|
|
||||||
return _base.TEXT, node
|
|
||||||
|
|
||||||
elif isinstance(node, Tag): # Element
|
|
||||||
return (_base.ELEMENT, namespaces["html"], node.name,
|
|
||||||
dict(node.attrs).items(), node.contents)
|
|
||||||
else:
|
|
||||||
return _base.UNKNOWN, node.__class__.__name__
|
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
|
||||||
return node.contents[0]
|
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
|
||||||
return node.nextSibling
|
|
||||||
|
|
||||||
def getParentNode(self, node):
|
|
||||||
return node.parent
|
|
12
src/html5lib/trie/__init__.py
Normal file
12
src/html5lib/trie/__init__.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from .py import Trie as PyTrie
|
||||||
|
|
||||||
|
Trie = PyTrie
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .datrie import Trie as DATrie
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
Trie = DATrie
|
37
src/html5lib/trie/_base.py
Normal file
37
src/html5lib/trie/_base.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from collections import Mapping
|
||||||
|
|
||||||
|
|
||||||
|
class Trie(Mapping):
    """Abstract base class for tries.

    Subclasses supply the ``Mapping`` primitives (``__getitem__``,
    ``__iter__`` and ``__len__``); the prefix helpers below are generic
    fallbacks built only on those primitives.
    """

    def keys(self, prefix=None):
        """Return the set of keys, optionally only those starting with *prefix*."""
        # Spell out the arguments to super(): the zero-argument form exists
        # only on Python 3, and this module is written to run on Python 2
        # as well.
        keys = super(Trie, self).keys()

        if prefix is None:
            return set(keys)

        # Python 2.6: no set comprehensions
        return set([x for x in keys if x.startswith(prefix)])

    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        for key in self.keys():
            if key.startswith(prefix):
                return True

        return False

    def longest_prefix(self, prefix):
        """Return the longest key that is a leading substring of *prefix*.

        Raises ``KeyError`` (carrying *prefix*) when no key matches.
        """
        if prefix in self:
            return prefix

        # Drop one trailing character at a time; the final candidate is the
        # empty string.
        for i in range(1, len(prefix) + 1):
            if prefix[:-i] in self:
                return prefix[:-i]

        raise KeyError(prefix)

    def longest_prefix_item(self, prefix):
        """Return ``(key, value)`` for the key found by ``longest_prefix``."""
        lprefix = self.longest_prefix(prefix)
        return (lprefix, self[lprefix])
|
44
src/html5lib/trie/datrie.py
Normal file
44
src/html5lib/trie/datrie.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from datrie import Trie as DATrie
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
|
from ._base import Trie as ABCTrie
|
||||||
|
|
||||||
|
|
||||||
|
class Trie(ABCTrie):
    """Trie that stores its entries in a ``datrie`` trie (``self._data``).

    Apart from iteration, every operation is forwarded to the wrapped
    object.
    """

    def __init__(self, data):
        """Build the wrapped trie from the mapping *data*.

        Raises ``TypeError`` if any key is not a text string.
        """
        # Collect every character used by any key; the joined set is what
        # gets passed to the DATrie constructor.
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            alphabet.update(key)

        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        # Iteration is deliberately unsupported by this implementation.
        raise NotImplementedError()

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        """Forward to the wrapped trie's ``keys``."""
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        """Forward to the wrapped trie's ``has_keys_with_prefix``."""
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        """Forward to the wrapped trie's ``longest_prefix``."""
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        """Forward to the wrapped trie's ``longest_prefix_item``."""
        return self._data.longest_prefix_item(prefix)
|
67
src/html5lib/trie/py.py
Normal file
67
src/html5lib/trie/py.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
|
from bisect import bisect_left
|
||||||
|
|
||||||
|
from ._base import Trie as ABCTrie
|
||||||
|
|
||||||
|
|
||||||
|
class Trie(ABCTrie):
    """Pure-Python trie built on a dict plus a sorted list of its keys.

    Prefix queries binary-search the sorted key list.  The most recent
    ``keys(prefix)`` query is remembered in ``_cachestr`` / ``_cachepoints``
    so that a following query for a longer prefix only searches the slice
    that matched last time.
    """

    def __init__(self, data):
        """Wrap the mapping *data*; raise ``TypeError`` on non-text keys."""
        if not all(isinstance(x, text_type) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
        self._keys = sorted(data.keys())
        self._cachestr = ""
        self._cachepoints = (0, len(data))

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        """Return the set of keys, optionally only those starting with *prefix*."""
        if prefix is None or prefix == "" or not self._keys:
            return set(self._keys)

        if prefix.startswith(self._cachestr):
            # Every match for the longer prefix lies inside the range that
            # matched the cached prefix.
            lo, hi = self._cachepoints
            start = i = bisect_left(self._keys, prefix, lo, hi)
        else:
            start = i = bisect_left(self._keys, prefix)

        keys = set()
        total = len(self._keys)
        if start == total:
            return keys

        # The bound check matters: when the matching run reaches the end of
        # the sorted list, indexing one past it would raise IndexError.
        while i < total and self._keys[i].startswith(prefix):
            keys.add(self._keys[i])
            i += 1

        self._cachestr = prefix
        self._cachepoints = (start, i)

        return keys

    def has_keys_with_prefix(self, prefix):
        """Return True if at least one key starts with *prefix*."""
        if prefix in self._data:
            return True

        if prefix.startswith(self._cachestr):
            lo, hi = self._cachepoints
            i = bisect_left(self._keys, prefix, lo, hi)
        else:
            i = bisect_left(self._keys, prefix)

        if i == len(self._keys):
            return False

        return self._keys[i].startswith(prefix)
|
@ -1,9 +1,16 @@
|
|||||||
|
from __future__ import absolute_import, division, unicode_literals
|
||||||
|
|
||||||
|
from types import ModuleType
|
||||||
|
|
||||||
try:
|
try:
|
||||||
frozenset
|
import xml.etree.cElementTree as default_etree
|
||||||
except NameError:
|
except ImportError:
|
||||||
#Import from the sets module for python 2.3
|
import xml.etree.ElementTree as default_etree
|
||||||
from sets import Set as set
|
|
||||||
from sets import ImmutableSet as frozenset
|
|
||||||
|
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||||
|
"surrogatePairToCodepoint", "moduleFactoryFactory"]
|
||||||
|
|
||||||
|
|
||||||
class MethodDispatcher(dict):
|
class MethodDispatcher(dict):
|
||||||
"""Dict with 2 special properties:
|
"""Dict with 2 special properties:
|
||||||
@ -35,122 +42,41 @@ class MethodDispatcher(dict):
|
|||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
return dict.get(self, key, self.default)
|
return dict.get(self, key, self.default)
|
||||||
|
|
||||||
#Pure python implementation of deque taken from the ASPN Python Cookbook
|
|
||||||
#Original code by Raymond Hettinger
|
|
||||||
|
|
||||||
class deque(object):
|
# Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||||
|
# python builds
|
||||||
|
|
||||||
def __init__(self, iterable=(), maxsize=-1):
|
def isSurrogatePair(data):
|
||||||
if not hasattr(self, 'data'):
|
return (len(data) == 2 and
|
||||||
self.left = self.right = 0
|
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
|
||||||
self.data = {}
|
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
|
||||||
self.maxsize = maxsize
|
|
||||||
self.extend(iterable)
|
|
||||||
|
|
||||||
def append(self, x):
|
|
||||||
self.data[self.right] = x
|
|
||||||
self.right += 1
|
|
||||||
if self.maxsize != -1 and len(self) > self.maxsize:
|
|
||||||
self.popleft()
|
|
||||||
|
|
||||||
def appendleft(self, x):
|
def surrogatePairToCodepoint(data):
|
||||||
self.left -= 1
|
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
|
||||||
self.data[self.left] = x
|
(ord(data[1]) - 0xDC00))
|
||||||
if self.maxsize != -1 and len(self) > self.maxsize:
|
return char_val
|
||||||
self.pop()
|
|
||||||
|
|
||||||
def pop(self):
|
# Module Factory Factory (no, this isn't Java, I know)
|
||||||
if self.left == self.right:
|
# Here to stop this being duplicated all over the place.
|
||||||
raise IndexError('cannot pop from empty deque')
|
|
||||||
self.right -= 1
|
|
||||||
elem = self.data[self.right]
|
|
||||||
del self.data[self.right]
|
|
||||||
return elem
|
|
||||||
|
|
||||||
def popleft(self):
|
|
||||||
if self.left == self.right:
|
|
||||||
raise IndexError('cannot pop from empty deque')
|
|
||||||
elem = self.data[self.left]
|
|
||||||
del self.data[self.left]
|
|
||||||
self.left += 1
|
|
||||||
return elem
|
|
||||||
|
|
||||||
def clear(self):
|
def moduleFactoryFactory(factory):
|
||||||
self.data.clear()
|
moduleCache = {}
|
||||||
self.left = self.right = 0
|
|
||||||
|
|
||||||
def extend(self, iterable):
|
def moduleFactory(baseModule, *args, **kwargs):
|
||||||
for elem in iterable:
|
if isinstance(ModuleType.__name__, type("")):
|
||||||
self.append(elem)
|
name = "_%s_factory" % baseModule.__name__
|
||||||
|
else:
|
||||||
|
name = b"_%s_factory" % baseModule.__name__
|
||||||
|
|
||||||
def extendleft(self, iterable):
|
if name in moduleCache:
|
||||||
for elem in iterable:
|
return moduleCache[name]
|
||||||
self.appendleft(elem)
|
else:
|
||||||
|
mod = ModuleType(name)
|
||||||
|
objs = factory(baseModule, *args, **kwargs)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
moduleCache[name] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
def rotate(self, n=1):
|
return moduleFactory
|
||||||
if self:
|
|
||||||
n %= len(self)
|
|
||||||
for i in xrange(n):
|
|
||||||
self.appendleft(self.pop())
|
|
||||||
|
|
||||||
def __getitem__(self, i):
|
|
||||||
if i < 0:
|
|
||||||
i += len(self)
|
|
||||||
try:
|
|
||||||
return self.data[i + self.left]
|
|
||||||
except KeyError:
|
|
||||||
raise IndexError
|
|
||||||
|
|
||||||
def __setitem__(self, i, value):
|
|
||||||
if i < 0:
|
|
||||||
i += len(self)
|
|
||||||
try:
|
|
||||||
self.data[i + self.left] = value
|
|
||||||
except KeyError:
|
|
||||||
raise IndexError
|
|
||||||
|
|
||||||
def __delitem__(self, i):
|
|
||||||
size = len(self)
|
|
||||||
if not (-size <= i < size):
|
|
||||||
raise IndexError
|
|
||||||
data = self.data
|
|
||||||
if i < 0:
|
|
||||||
i += size
|
|
||||||
for j in xrange(self.left+i, self.right-1):
|
|
||||||
data[j] = data[j+1]
|
|
||||||
self.pop()
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return self.right - self.left
|
|
||||||
|
|
||||||
def __cmp__(self, other):
|
|
||||||
if type(self) != type(other):
|
|
||||||
return cmp(type(self), type(other))
|
|
||||||
return cmp(list(self), list(other))
|
|
||||||
|
|
||||||
def __repr__(self, _track=[]):
|
|
||||||
if id(self) in _track:
|
|
||||||
return '...'
|
|
||||||
_track.append(id(self))
|
|
||||||
r = 'deque(%r)' % (list(self),)
|
|
||||||
_track.remove(id(self))
|
|
||||||
return r
|
|
||||||
|
|
||||||
def __getstate__(self):
|
|
||||||
return (tuple(self),)
|
|
||||||
|
|
||||||
def __setstate__(self, s):
|
|
||||||
self.__init__(s[0])
|
|
||||||
|
|
||||||
def __hash__(self):
|
|
||||||
raise TypeError
|
|
||||||
|
|
||||||
def __copy__(self):
|
|
||||||
return self.__class__(self)
|
|
||||||
|
|
||||||
def __deepcopy__(self, memo={}):
|
|
||||||
from copy import deepcopy
|
|
||||||
result = self.__class__()
|
|
||||||
memo[id(self)] = result
|
|
||||||
result.__init__(deepcopy(tuple(self), memo))
|
|
||||||
return result
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user