diff --git a/src/bleach/__init__.py b/src/bleach/__init__.py
deleted file mode 100644
index 1d8caa2e94..0000000000
--- a/src/bleach/__init__.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-
-
-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
- 'a',
- 'abbr',
- 'acronym',
- 'b',
- 'blockquote',
- 'code',
- 'em',
- 'i',
- 'li',
- 'ol',
- 'strong',
- 'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
- 'a': ['href', 'title'],
- 'abbr': ['title'],
- 'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
- ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
- cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
- dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
- gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
- im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
- kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
- ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
- net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
- pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
- sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
- tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
- xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"\x80-\xFF\x00-\x1F\x7F]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
-        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
-    """.format('|'.join(TLDS)),
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
-
-                    attrs = apply_callbacks(attrs, False)
-
-                    if attrs is None:
-                        # <a> tag replaced by the text within it
- adj = replace_nodes(tree, _text, node,
- current_child)
- current_child -= 1
- # pull back current_child by 1 to scan the
- # new nodes again.
- else:
- text = force_unicode(attrs.pop('_text'))
- for attr_key, attr_val in attrs.items():
- node.set(attr_key, attr_val)
-
- for n in reversed(list(node)):
- node.remove(n)
- text = parser.parseFragment(text)
- node.text = text.text
- for n in text:
- node.append(n)
- _seen.add(node)
-
- elif current_child >= 0:
- if node.tag == ETREE_TAG('pre') and skip_pre:
- linkify_nodes(node, False)
- elif not (node in _seen):
- linkify_nodes(node, True)
-
- current_child += 1
-
- def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
- link = {
- '_text': addr,
- 'href': 'mailto:{0!s}'.format(addr),
- }
- link = apply_callbacks(link, True)
-
- if link is None:
- return addr
-
- _href = link.pop('href')
- _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
- attr = '{0!s}="{1!s}"'
- attribs = ' '.join(attr.format(k, v) for k, v in link.items())
- return repl.format(_href, attribs, _text)
-
- def link_repl(match):
- url = match.group(0)
- open_brackets = close_brackets = 0
- if url.startswith('('):
- _wrapping = strip_wrapping_parentheses(url)
- url, open_brackets, close_brackets = _wrapping
- end = ''
- m = re.search(punct_re, url)
- if m:
- end = m.group(0)
- url = url[0:m.start()]
- if re.search(proto_re, url):
- href = url
- else:
- href = ''.join(['http://', url])
-
- link = {
- '_text': url,
- 'href': href,
- }
-
- link = apply_callbacks(link, True)
-
- if link is None:
- return '(' * open_brackets + url + ')' * close_brackets
-
- _text = link.pop('_text')
- _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
- attr = '{0!s}="{1!s}"'
- attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
- return repl.format('(' * open_brackets,
- _href, attribs, _text, end,
- ')' * close_brackets)
-
- try:
- linkify_nodes(forest)
- except RuntimeError as e:
- # If we hit the max recursion depth, just return what we've got.
- log.exception('Probable recursion error: {0!r}'.format(e))
-
- return _render(forest)
-
-
-def _render(tree):
- """Try rendering as HTML, then XML, then give up."""
- return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
- walker = html5lib.treewalkers.getTreeWalker('etree')
- stream = walker(domtree)
- serializer = HTMLSerializer(quote_attr_values=True,
- alphabetical_attributes=True,
- omit_optional_tags=False)
- return serializer.render(stream)
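
For reference, the public API this module provided was clean() and linkify(). A minimal sketch of how callers used them, assuming bleach 1.4.x defaults (disallowed tags are escaped rather than dropped, and linkified URLs pick up rel="nofollow" from the default callback):

    import bleach

    # clean() keeps whitelisted tags and HTML-escapes everything else.
    bleach.clean(u'<script>evil()</script><b>bold</b>')
    # -> u'&lt;script&gt;evil()&lt;/script&gt;<b>bold</b>'

    # linkify() wraps bare URLs (matched by url_re above) in <a> tags.
    bleach.linkify(u'visit example.com')
    # -> u'visit <a href="http://example.com" rel="nofollow">example.com</a>'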
diff --git a/src/bleach/callbacks.py b/src/bleach/callbacks.py
deleted file mode 100644
index 3cb82c255a..0000000000
--- a/src/bleach/callbacks.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
-
-
-def nofollow(attrs, new=False):
- if attrs['href'].startswith('mailto:'):
- return attrs
- rel = [x for x in attrs.get('rel', '').split(' ') if x]
- if 'nofollow' not in [x.lower() for x in rel]:
- rel.append('nofollow')
- attrs['rel'] = ' '.join(rel)
-
- return attrs
-
-
-def target_blank(attrs, new=False):
- if attrs['href'].startswith('mailto:'):
- return attrs
- attrs['target'] = '_blank'
- return attrs
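
Both callbacks follow the linkify callback contract: they receive the prospective <a> tag's attribute dict (plus a `new` flag indicating whether the link was just created) and return the modified dict, or None to veto the link entirely. A hypothetical invocation, assuming the bleach 1.4.x linkify() signature:

    import bleach
    from bleach.callbacks import nofollow, target_blank

    bleach.linkify(u'see example.com', callbacks=[nofollow, target_blank])
    # Roughly:
    # u'see <a href="http://example.com" rel="nofollow" target="_blank">example.com</a>'
    # (attribute order depends on dict iteration order)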
diff --git a/src/bleach/encoding.py b/src/bleach/encoding.py
deleted file mode 100644
index c52079ecf8..0000000000
--- a/src/bleach/encoding.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import datetime
-from decimal import Decimal
-import types
-import sys
-from collections import namedtuple
-if sys.version_info[0] == 2:
- six = namedtuple('Six', 'integer_types string_types text_type PY3')(
- (int, long), (basestring,), unicode, False) # noqa
-else:
- six = namedtuple('Six', 'integer_types string_types text_type PY3')(
- (int,), (str,), str, True)
-
-
-def is_protected_type(obj):
- """Determine if the object instance is of a protected type.
-
- Objects of protected types are preserved as-is when passed to
- force_unicode(strings_only=True).
- """
- return isinstance(obj, (
- six.integer_types +
- (types.NoneType,
- datetime.datetime, datetime.date, datetime.time,
- float, Decimal))
- )
-
-
-def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
- """
- Similar to smart_text, except that lazy instances are resolved to
- strings, rather than kept as lazy objects.
-
- If strings_only is True, don't convert (some) non-string-like objects.
- """
- # Handle the common case first, saves 30-40% when s is an instance of
- # six.text_type. This function gets called often in that setting.
- if isinstance(s, six.text_type):
- return s
- if strings_only and is_protected_type(s):
- return s
- try:
- if not isinstance(s, six.string_types):
- if hasattr(s, '__unicode__'):
- s = s.__unicode__()
- else:
- if six.PY3:
- if isinstance(s, bytes):
- s = six.text_type(s, encoding, errors)
- else:
- s = six.text_type(s)
- else:
- s = six.text_type(bytes(s), encoding, errors)
- else:
- # Note: We use .decode() here, instead of six.text_type(s,
- # encoding, errors), so that if s is a SafeBytes, it ends up being
- # a SafeText at the end.
- s = s.decode(encoding, errors)
- except UnicodeDecodeError as e:
- if not isinstance(s, Exception):
- raise UnicodeDecodeError(*e.args)
- else:
- # If we get to here, the caller has passed in an Exception
- # subclass populated with non-ASCII bytestring data without a
- # working unicode method. Try to handle this without raising a
- # further exception by individually forcing the exception args
- # to unicode.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
- return s
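
force_unicode() is a trimmed-down copy of Django's smart_text: text passes through unchanged, bytes are decoded with the given encoding, other objects are coerced to text, and "protected" scalar types are returned untouched when strings_only=True. A quick sketch of the expected behavior:

    from bleach.encoding import force_unicode

    force_unicode(b'caf\xc3\xa9')         # u'café' -- UTF-8 decode
    force_unicode(42)                     # u'42'   -- coerced to text
    force_unicode(42, strings_only=True)  # 42      -- protected type, unchanged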
diff --git a/src/bleach/sanitizer.py b/src/bleach/sanitizer.py
deleted file mode 100644
index eec6659b3c..0000000000
--- a/src/bleach/sanitizer.py
+++ /dev/null
@@ -1,147 +0,0 @@
-from __future__ import unicode_literals
-import re
-from xml.sax.saxutils import escape, unescape
-
-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
-
-
-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
-
-
-class BleachSanitizerMixin(HTMLSanitizerMixin):
- """Mixin to replace sanitize_token() and sanitize_css()."""
-
- allowed_svg_properties = []
-
- def sanitize_token(self, token):
- """Sanitize a token either by HTML-encoding or dropping.
-
- Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
- a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
-
- Here callable is a function with two arguments of attribute name
-        and value. It should return true or false.
-
- Also gives the option to strip tags instead of encoding.
-
- """
- if (getattr(self, 'wildcard_attributes', None) is None and
- isinstance(self.allowed_attributes, dict)):
- self.wildcard_attributes = self.allowed_attributes.get('*', [])
-
- if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
- tokenTypes['EmptyTag']):
- if token['name'] in self.allowed_elements:
- if 'data' in token:
- if isinstance(self.allowed_attributes, dict):
- allowed_attributes = self.allowed_attributes.get(
- token['name'], [])
- if not callable(allowed_attributes):
- allowed_attributes += self.wildcard_attributes
- else:
- allowed_attributes = self.allowed_attributes
- attrs = dict([(name, val) for name, val in
- token['data'][::-1]
- if (allowed_attributes(name, val)
- if callable(allowed_attributes)
- else name in allowed_attributes)])
- for attr in self.attr_val_is_uri:
- if attr not in attrs:
- continue
- val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
- unescape(attrs[attr])).lower()
- # Remove replacement characters from unescaped
- # characters.
- val_unescaped = val_unescaped.replace("\ufffd", "")
- if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
- and (val_unescaped.split(':')[0] not in
- self.allowed_protocols)):
- del attrs[attr]
- for attr in self.svg_attr_val_allows_ref:
- if attr in attrs:
- attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(attrs[attr]))
- if (token['name'] in self.svg_allow_local_href and
- 'xlink:href' in attrs and
- re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
- del attrs['xlink:href']
- if 'style' in attrs:
- attrs['style'] = self.sanitize_css(attrs['style'])
- token['data'] = [(name, val) for name, val in
- attrs.items()]
- return token
- elif self.strip_disallowed_elements:
- pass
- else:
- if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
- elif token['data']:
- attr = ' {0!s}="{1!s}"'
- attrs = ''.join([attr.format(k, escape(v)) for k, v in
- token['data']])
- token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
- else:
- token['data'] = '<{0!s}>'.format(token['name'])
- if token['selfClosing']:
- token['data'] = token['data'][:-1] + '/>'
- token['type'] = tokenTypes['Characters']
- del token["name"]
- return token
- elif token['type'] == tokenTypes['Comment']:
- if not self.strip_html_comments:
- return token
- else:
- return token
-
- def sanitize_css(self, style):
- """HTMLSanitizerMixin.sanitize_css replacement.
-
- HTMLSanitizerMixin.sanitize_css always whitelists background-*,
- border-*, margin-*, and padding-*. We only whitelist what's in
- the whitelist.
-
- """
- # disallow urls
- style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
- # gauntlet
- # TODO: Make sure this does what it's meant to - I *think* it wants to
- # validate style attribute contents.
- parts = style.split(';')
- gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
- """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
- for part in parts:
- if not gauntlet.match(part):
- return ''
-
- if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
- return ''
-
- clean = []
- for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
- if not value:
- continue
- if prop.lower() in self.allowed_css_properties:
- clean.append(prop + ': ' + value + ';')
- elif prop.lower() in self.allowed_svg_properties:
- clean.append(prop + ': ' + value + ';')
-
- return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
- def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
- lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
- HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
- lowercaseElementName, lowercaseAttrName,
- **kwargs)
-
- def __iter__(self):
- for token in HTMLTokenizer.__iter__(self):
- token = self.sanitize_token(token)
- if token:
- yield token
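
Because BleachSanitizer subclasses HTMLTokenizer, it is installed as the parser's tokenizer, so every token is run through sanitize_token() before the tree builder ever sees it. A sketch of the wiring, mirroring what clean() in __init__.py did (default whitelists assumed, since no class attributes are overridden here):

    import html5lib
    from bleach.sanitizer import BleachSanitizer

    # Disallowed markup is escaped (or dropped, with strip_disallowed_elements)
    # at tokenization time, before any tree is built.
    parser = html5lib.HTMLParser(tokenizer=BleachSanitizer)
    fragment = parser.parseFragment(u'<script>evil()</script><b>ok</b>')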
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 83becd8be9..7edfef2d75 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -133,14 +133,26 @@ def comments_to_html(comments):
def merge_comments(one, two):
return comments_to_html(one) + '\n\n' + comments_to_html(two)
+def sanitize_html(html):
+ if isinstance(html, bytes):
+ html = html.decode('utf-8', 'replace')
+ import html5lib
+ from html5lib.sanitizer import HTMLSanitizer
+ from html5lib.serializer.htmlserializer import HTMLSerializer
+ from html5lib.treebuilders.etree_lxml import TreeBuilder
+ from html5lib.treewalkers.lxmletree import TreeWalker
+ parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
+ tree = parser.parseFragment(html)
+ serializer = HTMLSerializer(quote_attr_values=True, alphabetical_attributes=False, omit_optional_tags=False)
+ stream = TreeWalker(tree)
+ return serializer.render(stream)
+
def sanitize_comments_html(html):
from calibre.ebooks.markdown import Markdown
- import bleach
text = html2text(html)
md = Markdown()
html = md.convert(text)
- cleansed = re.sub(u'\n+', u'', bleach.clean(html))
- return cleansed
+ return sanitize_html(html)
def test():
for pat, val in [
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 65bf8b6a52..16ad1b46b1 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -236,8 +236,8 @@ def test_terminal():
def test_markdown():
from calibre.ebooks.markdown import Markdown
Markdown(extensions=['extra'])
- import bleach
- bleach.clean(u'xxx')
+ from calibre.library.comments import sanitize_html
+    sanitize_html(b'''<script>moo</script>xxx<img src="http://moo.cow" />''')
print('Markdown OK!')
def test():
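
The replacement pipeline reaches the same end with stock html5lib parts: HTMLSanitizer filters tokens during an lxml-backed parse, and HTMLSerializer re-renders the cleaned tree. One behavioral difference worth noting: the old path also collapsed newlines (re.sub(u'\n+', u'', ...)), which the new sanitize_comments_html no longer does. A hedged usage sketch of the new helper (expected output shape assumed from the html5lib 0.999-era sanitizer, which escapes disallowed tags rather than dropping them):

    from calibre.library.comments import sanitize_html

    sanitize_html(b'<script>evil()</script><b>ok</b>')
    # -> u'&lt;script&gt;evil()&lt;/script&gt;<b>ok</b>'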