From 7459e67a2a6fca4b6ca8ba62f9bfcc41a51d71b0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 4 Nov 2015 20:30:35 +0530
Subject: [PATCH] Get rid of bleach

html5lib's sanitizer actually turns out to be good enough for our use
case.
---
 src/bleach/__init__.py          | 383 --------------------------------
 src/bleach/callbacks.py         |  20 --
 src/bleach/encoding.py          |  69 ------
 src/bleach/sanitizer.py         | 147 ------------
 src/calibre/library/comments.py |  18 +-
 src/calibre/test_build.py       |   4 +-
 6 files changed, 17 insertions(+), 624 deletions(-)
 delete mode 100644 src/bleach/__init__.py
 delete mode 100644 src/bleach/callbacks.py
 delete mode 100644 src/bleach/encoding.py
 delete mode 100644 src/bleach/sanitizer.py

diff --git a/src/bleach/__init__.py b/src/bleach/__init__.py
deleted file mode 100644
index 1d8caa2e94..0000000000
--- a/src/bleach/__init__.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-
-
-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011\013\014\016-\177])*"
-        # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:{0}))  # domain
-    """.format('|'.join(TLDS)),
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
[...]
-                if attrs is None:
-                    # <a> tag replaced by the text within it
-                    adj = replace_nodes(tree, _text, node,
-                                        current_child)
-                    current_child -= 1
-                    # pull back current_child by 1 to scan the
-                    # new nodes again.
-                else:
-                    text = force_unicode(attrs.pop('_text'))
-                    for attr_key, attr_val in attrs.items():
-                        node.set(attr_key, attr_val)
-
-                    for n in reversed(list(node)):
-                        node.remove(n)
-                    text = parser.parseFragment(text)
-                    node.text = text.text
-                    for n in text:
-                        node.append(n)
-                    _seen.add(node)
-
-            elif current_child >= 0:
-                if node.tag == ETREE_TAG('pre') and skip_pre:
-                    linkify_nodes(node, False)
-                elif not (node in _seen):
-                    linkify_nodes(node, True)
-
-            current_child += 1
-
-    def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
-        link = {
-            '_text': addr,
-            'href': 'mailto:{0!s}'.format(addr),
-        }
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return addr
-
-        _href = link.pop('href')
-        _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-        return repl.format(_href, attribs, _text)
-
-    def link_repl(match):
-        url = match.group(0)
-        open_brackets = close_brackets = 0
-        if url.startswith('('):
-            _wrapping = strip_wrapping_parentheses(url)
-            url, open_brackets, close_brackets = _wrapping
-        end = ''
-        m = re.search(punct_re, url)
-        if m:
-            end = m.group(0)
-            url = url[0:m.start()]
-        if re.search(proto_re, url):
-            href = url
-        else:
-            href = ''.join(['http://', url])
-
-        link = {
-            '_text': url,
-            'href': href,
-        }
-
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return '(' * open_brackets + url + ')' * close_brackets
-
-        _text = link.pop('_text')
-        _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
-        return repl.format('(' * open_brackets,
-                           _href, attribs, _text, end,
-                           ')' * close_brackets)
-
-    try:
-        linkify_nodes(forest)
-    except RuntimeError as e:
-        # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: {0!r}'.format(e))
-
-    return _render(forest)
-
-
-def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('etree')
-    stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
-                                alphabetical_attributes=True,
-                                omit_optional_tags=False)
-    return serializer.render(stream)
diff --git a/src/bleach/callbacks.py b/src/bleach/callbacks.py
deleted file mode 100644
index 3cb82c255a..0000000000
--- a/src/bleach/callbacks.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
-
-
-def nofollow(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
-        return attrs
-    rel = [x for x in attrs.get('rel', '').split(' ') if x]
-    if 'nofollow' not in [x.lower() for x in rel]:
-        rel.append('nofollow')
-    attrs['rel'] = ' '.join(rel)
-
-    return attrs
-
-
-def target_blank(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
-        return attrs
-    attrs['target'] = '_blank'
-    return attrs
diff --git a/src/bleach/encoding.py b/src/bleach/encoding.py
deleted file mode 100644
index c52079ecf8..0000000000
--- a/src/bleach/encoding.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import datetime
-from decimal import Decimal
-import types
-import sys
-from collections import namedtuple
-if sys.version_info[0] == 2:
-    six = namedtuple('Six', 'integer_types string_types text_type PY3')(
-        (int, long), (basestring,), unicode, False)  # noqa
-else:
-    six = namedtuple('Six', 'integer_types string_types text_type PY3')(
-        (int,), (str,), str, True)
-
-
-def is_protected_type(obj):
-    """Determine if the object instance is of a protected type.
-
-    Objects of protected types are preserved as-is when passed to
-    force_unicode(strings_only=True).
-    """
-    return isinstance(obj, (
-        six.integer_types +
-        (types.NoneType,
-         datetime.datetime, datetime.date, datetime.time,
-         float, Decimal))
-    )
-
-
-def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
-    """
-    Similar to smart_text, except that lazy instances are resolved to
-    strings, rather than kept as lazy objects.
-
-    If strings_only is True, don't convert (some) non-string-like objects.
-    """
-    # Handle the common case first, saves 30-40% when s is an instance of
-    # six.text_type. This function gets called often in that setting.
-    if isinstance(s, six.text_type):
-        return s
-    if strings_only and is_protected_type(s):
-        return s
-    try:
-        if not isinstance(s, six.string_types):
-            if hasattr(s, '__unicode__'):
-                s = s.__unicode__()
-            else:
-                if six.PY3:
-                    if isinstance(s, bytes):
-                        s = six.text_type(s, encoding, errors)
-                    else:
-                        s = six.text_type(s)
-                else:
-                    s = six.text_type(bytes(s), encoding, errors)
-        else:
-            # Note: We use .decode() here, instead of six.text_type(s,
-            # encoding, errors), so that if s is a SafeBytes, it ends up being
-            # a SafeText at the end.
-            s = s.decode(encoding, errors)
-    except UnicodeDecodeError as e:
-        if not isinstance(s, Exception):
-            raise UnicodeDecodeError(*e.args)
-        else:
-            # If we get to here, the caller has passed in an Exception
-            # subclass populated with non-ASCII bytestring data without a
-            # working unicode method. Try to handle this without raising a
-            # further exception by individually forcing the exception args
-            # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                          errors) for arg in s])
-    return s
diff --git a/src/bleach/sanitizer.py b/src/bleach/sanitizer.py
deleted file mode 100644
index eec6659b3c..0000000000
--- a/src/bleach/sanitizer.py
+++ /dev/null
@@ -1,147 +0,0 @@
-from __future__ import unicode_literals
-import re
-from xml.sax.saxutils import escape, unescape
-
-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
-
-
-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
-
-
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
-
-    allowed_svg_properties = []
-
-    def sanitize_token(self, token):
-        """Sanitize a token either by HTML-encoding or dropping.
-
-        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
-        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
-
-        Here callable is a function with two arguments of attribute name
-        and value. It should return true of false.
-
-        Also gives the option to strip tags instead of encoding.
-
-        """
-        if (getattr(self, 'wildcard_attributes', None) is None and
-                isinstance(self.allowed_attributes, dict)):
-            self.wildcard_attributes = self.allowed_attributes.get('*', [])
-
-        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
-                             tokenTypes['EmptyTag']):
-            if token['name'] in self.allowed_elements:
-                if 'data' in token:
-                    if isinstance(self.allowed_attributes, dict):
-                        allowed_attributes = self.allowed_attributes.get(
-                            token['name'], [])
-                        if not callable(allowed_attributes):
-                            allowed_attributes += self.wildcard_attributes
-                    else:
-                        allowed_attributes = self.allowed_attributes
-                    attrs = dict([(name, val) for name, val in
-                                  token['data'][::-1]
-                                  if (allowed_attributes(name, val)
-                                      if callable(allowed_attributes)
-                                      else name in allowed_attributes)])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        # Remove replacement characters from unescaped
-                        # characters.
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
-                            and (val_unescaped.split(':')[0] not in
-                                 self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token['name'] in self.svg_allow_local_href and
-                            'xlink:href' in attrs and
-                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token['data'] = [(name, val) for name, val in
-                                     attrs.items()]
-                return token
-            elif self.strip_disallowed_elements:
-                pass
-            else:
-                if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
-                elif token['data']:
-                    attr = ' {0!s}="{1!s}"'
-                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
-                                     token['data']])
-                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
-                else:
-                    token['data'] = '<{0!s}>'.format(token['name'])
-                if token['selfClosing']:
-                    token['data'] = token['data'][:-1] + '/>'
-                token['type'] = tokenTypes['Characters']
-                del token["name"]
-                return token
-        elif token['type'] == tokenTypes['Comment']:
-            if not self.strip_html_comments:
-                return token
-        else:
-            return token
-
-    def sanitize_css(self, style):
-        """HTMLSanitizerMixin.sanitize_css replacement.
-
-        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
-        border-*, margin-*, and padding-*. We only whitelist what's in
-        the whitelist.
-
-        """
-        # disallow urls
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # gauntlet
-        # TODO: Make sure this does what it's meant to - I *think* it wants to
-        # validate style attribute contents.
-        parts = style.split(';')
-        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
-                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
-        for part in parts:
-            if not gauntlet.match(part):
-                return ''
-
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
-            if not value:
-                continue
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName,
-                               **kwargs)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 83becd8be9..7edfef2d75 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -133,14 +133,26 @@ def comments_to_html(comments):
 def merge_comments(one, two):
     return comments_to_html(one) + '\n\n' + comments_to_html(two)
 
+def sanitize_html(html):
+    if isinstance(html, bytes):
+        html = html.decode('utf-8', 'replace')
+    import html5lib
+    from html5lib.sanitizer import HTMLSanitizer
+    from html5lib.serializer.htmlserializer import HTMLSerializer
+    from html5lib.treebuilders.etree_lxml import TreeBuilder
+    from html5lib.treewalkers.lxmletree import TreeWalker
+    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
+    tree = parser.parseFragment(html)
+    serializer = HTMLSerializer(quote_attr_values=True, alphabetical_attributes=False, omit_optional_tags=False)
+    stream = TreeWalker(tree)
+    return serializer.render(stream)
+
 def sanitize_comments_html(html):
     from calibre.ebooks.markdown import Markdown
-    import bleach
     text = html2text(html)
     md = Markdown()
     html = md.convert(text)
-    cleansed = re.sub(u'\n+', u'', bleach.clean(html))
-    return cleansed
+    return sanitize_html(html)
 
 def test():
     for pat, val in [
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 65bf8b6a52..16ad1b46b1 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -236,8 +236,8 @@ def test_terminal():
 def test_markdown():
     from calibre.ebooks.markdown import Markdown
     Markdown(extensions=['extra'])
-    import bleach
-    bleach.clean(u'xxx')
+    from calibre.library.comments import sanitize_html
+    sanitize_html(b'''xxx''')
     print('Markdown OK!')
 
 def test():
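
Note: the sanitizing approach this patch switches to can be exercised on its
own. Below is a minimal standalone sketch of the same technique used by the
new sanitize_html() above, written against the pre-1.0 html5lib API that this
patch targets; the function name and sample input here are illustrative, not
part of the patch:

    # Parse with html5lib's sanitizing tokenizer, then re-serialize the tree.
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    def sanitize_fragment(html):
        if isinstance(html, bytes):
            html = html.decode('utf-8', 'replace')
        # HTMLSanitizer drops disallowed attributes (e.g. onclick) and
        # escapes disallowed tags to plain text as it tokenizes.
        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
        tree = parser.parseFragment(html)
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        return serializer.render(TreeWalker(tree))

    print(sanitize_fragment('<p onclick="evil()">ok <script>alert(1)</script></p>'))
    # The onclick attribute is removed and the <script> markup comes back
    # escaped as text rather than as an executable element.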