From 7459e67a2a6fca4b6ca8ba62f9bfcc41a51d71b0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 4 Nov 2015 20:30:35 +0530
Subject: [PATCH] Get rid of bleach

html5lib's sanitizer actually turns out to be good enough for our use
case.
---
 src/bleach/__init__.py          | 383 --------------------------------
 src/bleach/callbacks.py         |  20 --
 src/bleach/encoding.py          |  69 ------
 src/bleach/sanitizer.py         | 147 ------------
 src/calibre/library/comments.py |  18 +-
 src/calibre/test_build.py       |   4 +-
 6 files changed, 17 insertions(+), 624 deletions(-)
 delete mode 100644 src/bleach/__init__.py
 delete mode 100644 src/bleach/callbacks.py
 delete mode 100644 src/bleach/encoding.py
 delete mode 100644 src/bleach/sanitizer.py

diff --git a/src/bleach/__init__.py b/src/bleach/__init__.py
deleted file mode 100644
index 1d8caa2e94..0000000000
--- a/src/bleach/__init__.py
+++ /dev/null
@@ -1,383 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-import logging
-import re
-
-import html5lib
-from html5lib.sanitizer import HTMLSanitizer
-from html5lib.serializer.htmlserializer import HTMLSerializer
-
-from . import callbacks as linkify_callbacks
-from .encoding import force_unicode
-from .sanitizer import BleachSanitizer
-
-
-VERSION = (1, 4, 2)
-__version__ = '.'.join([str(n) for n in VERSION])
-
-__all__ = ['clean', 'linkify']
-
-log = logging.getLogger('bleach')
-
-ALLOWED_TAGS = [
-    'a',
-    'abbr',
-    'acronym',
-    'b',
-    'blockquote',
-    'code',
-    'em',
-    'i',
-    'li',
-    'ol',
-    'strong',
-    'ul',
-]
-
-ALLOWED_ATTRIBUTES = {
-    'a': ['href', 'title'],
-    'abbr': ['title'],
-    'acronym': ['title'],
-}
-
-ALLOWED_STYLES = []
-
-TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
-       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
-       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
-       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
-       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
-       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
-       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
-       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
-       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
-       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
-       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
-       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
-       xn xxx ye yt yu za zm zw""".split()
-
-# Make sure that .com doesn't get matched by .co first
-TLDS.reverse()
-
-PROTOCOLS = HTMLSanitizer.acceptable_protocols
-
-url_re = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
-    ([\w-]+\.)+(?:{1})(?:\:\d+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
-        # /path/zz (excluding "unsafe" chars from RFC 1738,
-        # except for # and ~, which happen in practice)
-    """.format('|'.join(PROTOCOLS), '|'.join(TLDS)),
-    re.IGNORECASE | re.VERBOSE | re.UNICODE)
-
-proto_re = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
-
-punct_re = re.compile(r'([\.,]+)$')
-
-email_re = re.compile(
-    r"""(?<!//)
-    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
-        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
-    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\011\013\014\016-\177])*"
-        # quoted-string
-    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:{0}))  # domain
-    """.format('|'.join(TLDS)),
-    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
[...]
-                if attrs is None:
-                    # <a> tag replaced by the text within it
-                    adj = replace_nodes(tree, _text, node,
-                                        current_child)
-                    current_child -= 1
-                    # pull back current_child by 1 to scan the
-                    # new nodes again.
-                else:
-                    text = force_unicode(attrs.pop('_text'))
-                    for attr_key, attr_val in attrs.items():
-                        node.set(attr_key, attr_val)
-
-                    for n in reversed(list(node)):
-                        node.remove(n)
-                    text = parser.parseFragment(text)
-                    node.text = text.text
-                    for n in text:
-                        node.append(n)
-                    _seen.add(node)
-
-            elif current_child >= 0:
-                if node.tag == ETREE_TAG('pre') and skip_pre:
-                    linkify_nodes(node, False)
-                elif not (node in _seen):
-                    linkify_nodes(node, True)
-
-            current_child += 1
-
-    def email_repl(match):
-        addr = match.group(0).replace('"', '&quot;')
-        link = {
-            '_text': addr,
-            'href': 'mailto:{0!s}'.format(addr),
-        }
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return addr
-
-        _href = link.pop('href')
-        _text = link.pop('_text')
-
-        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-        return repl.format(_href, attribs, _text)
-
-    def link_repl(match):
-        url = match.group(0)
-        open_brackets = close_brackets = 0
-        if url.startswith('('):
-            _wrapping = strip_wrapping_parentheses(url)
-            url, open_brackets, close_brackets = _wrapping
-        end = ''
-        m = re.search(punct_re, url)
-        if m:
-            end = m.group(0)
-            url = url[0:m.start()]
-        if re.search(proto_re, url):
-            href = url
-        else:
-            href = ''.join(['http://', url])
-
-        link = {
-            '_text': url,
-            'href': href,
-        }
-
-        link = apply_callbacks(link, True)
-
-        if link is None:
-            return '(' * open_brackets + url + ')' * close_brackets
-
-        _text = link.pop('_text')
-        _href = link.pop('href')
-
-        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
-        attr = '{0!s}="{1!s}"'
-        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
-
-        return repl.format('(' * open_brackets,
-                           _href, attribs, _text, end,
-                           ')' * close_brackets)
-
-    try:
-        linkify_nodes(forest)
-    except RuntimeError as e:
-        # If we hit the max recursion depth, just return what we've got.
-        log.exception('Probable recursion error: {0!r}'.format(e))
-
-    return _render(forest)
-
-
-def _render(tree):
-    """Try rendering as HTML, then XML, then give up."""
-    return force_unicode(_serialize(tree))
-
-
-def _serialize(domtree):
-    walker = html5lib.treewalkers.getTreeWalker('etree')
-    stream = walker(domtree)
-    serializer = HTMLSerializer(quote_attr_values=True,
-                                alphabetical_attributes=True,
-                                omit_optional_tags=False)
-    return serializer.render(stream)
diff --git a/src/bleach/callbacks.py b/src/bleach/callbacks.py
deleted file mode 100644
index 3cb82c255a..0000000000
--- a/src/bleach/callbacks.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
-
-
-def nofollow(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
-        return attrs
-    rel = [x for x in attrs.get('rel', '').split(' ') if x]
-    if 'nofollow' not in [x.lower() for x in rel]:
-        rel.append('nofollow')
-    attrs['rel'] = ' '.join(rel)
-
-    return attrs
-
-
-def target_blank(attrs, new=False):
-    if attrs['href'].startswith('mailto:'):
-        return attrs
-    attrs['target'] = '_blank'
-    return attrs
diff --git a/src/bleach/encoding.py b/src/bleach/encoding.py
deleted file mode 100644
index c52079ecf8..0000000000
--- a/src/bleach/encoding.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import datetime
-from decimal import Decimal
-import types
-import sys
-from collections import namedtuple
-if sys.version_info[0] == 2:
-    six = namedtuple('Six', 'integer_types string_types text_type PY3')(
-        (int, long), (basestring,), unicode, False)  # noqa
-else:
-    six = namedtuple('Six', 'integer_types string_types text_type PY3')(
-        (int,), (str,), str, True)
-
-
-def is_protected_type(obj):
-    """Determine if the object instance is of a protected type.
-
-    Objects of protected types are preserved as-is when passed to
-    force_unicode(strings_only=True).
-    """
-    return isinstance(obj, (
-        six.integer_types +
-        (types.NoneType,
-         datetime.datetime, datetime.date, datetime.time,
-         float, Decimal))
-    )
-
-
-def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
-    """
-    Similar to smart_text, except that lazy instances are resolved to
-    strings, rather than kept as lazy objects.
-
-    If strings_only is True, don't convert (some) non-string-like objects.
-    """
-    # Handle the common case first, saves 30-40% when s is an instance of
-    # six.text_type. This function gets called often in that setting.
-    if isinstance(s, six.text_type):
-        return s
-    if strings_only and is_protected_type(s):
-        return s
-    try:
-        if not isinstance(s, six.string_types):
-            if hasattr(s, '__unicode__'):
-                s = s.__unicode__()
-            else:
-                if six.PY3:
-                    if isinstance(s, bytes):
-                        s = six.text_type(s, encoding, errors)
-                    else:
-                        s = six.text_type(s)
-                else:
-                    s = six.text_type(bytes(s), encoding, errors)
-        else:
-            # Note: We use .decode() here, instead of six.text_type(s,
-            # encoding, errors), so that if s is a SafeBytes, it ends up being
-            # a SafeText at the end.
-            s = s.decode(encoding, errors)
-    except UnicodeDecodeError as e:
-        if not isinstance(s, Exception):
-            raise UnicodeDecodeError(*e.args)
-        else:
-            # If we get to here, the caller has passed in an Exception
-            # subclass populated with non-ASCII bytestring data without a
-            # working unicode method. Try to handle this without raising a
-            # further exception by individually forcing the exception args
-            # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                          errors) for arg in s])
-    return s
diff --git a/src/bleach/sanitizer.py b/src/bleach/sanitizer.py
deleted file mode 100644
index eec6659b3c..0000000000
--- a/src/bleach/sanitizer.py
+++ /dev/null
@@ -1,147 +0,0 @@
-from __future__ import unicode_literals
-import re
-from xml.sax.saxutils import escape, unescape
-
-from html5lib.constants import tokenTypes
-from html5lib.sanitizer import HTMLSanitizerMixin
-from html5lib.tokenizer import HTMLTokenizer
-
-
-PROTOS = HTMLSanitizerMixin.acceptable_protocols
-PROTOS.remove('feed')
-
-
-class BleachSanitizerMixin(HTMLSanitizerMixin):
-    """Mixin to replace sanitize_token() and sanitize_css()."""
-
-    allowed_svg_properties = []
-
-    def sanitize_token(self, token):
-        """Sanitize a token either by HTML-encoding or dropping.
-
-        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
-        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
-
-        Here callable is a function with two arguments of attribute name
-        and value. It should return true of false.
-
-        Also gives the option to strip tags instead of encoding.
-
-        """
-        if (getattr(self, 'wildcard_attributes', None) is None and
-                isinstance(self.allowed_attributes, dict)):
-            self.wildcard_attributes = self.allowed_attributes.get('*', [])
-
-        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
-                             tokenTypes['EmptyTag']):
-            if token['name'] in self.allowed_elements:
-                if 'data' in token:
-                    if isinstance(self.allowed_attributes, dict):
-                        allowed_attributes = self.allowed_attributes.get(
-                            token['name'], [])
-                        if not callable(allowed_attributes):
-                            allowed_attributes += self.wildcard_attributes
-                    else:
-                        allowed_attributes = self.allowed_attributes
-                    attrs = dict([(name, val) for name, val in
-                                  token['data'][::-1]
-                                  if (allowed_attributes(name, val)
-                                      if callable(allowed_attributes)
-                                      else name in allowed_attributes)])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        # Remove replacement characters from unescaped
-                        # characters.
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
-                            and (val_unescaped.split(':')[0] not in
-                                 self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token['name'] in self.svg_allow_local_href and
-                            'xlink:href' in attrs and
-                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token['data'] = [(name, val) for name, val in
-                                     attrs.items()]
-                return token
-            elif self.strip_disallowed_elements:
-                pass
-            else:
-                if token['type'] == tokenTypes['EndTag']:
-                    token['data'] = '</{0!s}>'.format(token['name'])
-                elif token['data']:
-                    attr = ' {0!s}="{1!s}"'
-                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
-                                     token['data']])
-                    token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs)
-                else:
-                    token['data'] = '<{0!s}>'.format(token['name'])
-                if token['selfClosing']:
-                    token['data'] = token['data'][:-1] + '/>'
-                token['type'] = tokenTypes['Characters']
-                del token["name"]
-                return token
-        elif token['type'] == tokenTypes['Comment']:
-            if not self.strip_html_comments:
-                return token
-        else:
-            return token
-
-    def sanitize_css(self, style):
-        """HTMLSanitizerMixin.sanitize_css replacement.
-
-        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
-        border-*, margin-*, and padding-*. We only whitelist what's in
-        the whitelist.
-
-        """
-        # disallow urls
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
-        # gauntlet
-        # TODO: Make sure this does what it's meant to - I *think* it wants to
-        # validate style attribute contents.
-        parts = style.split(';')
-        gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
-                              """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
-        for part in parts:
-            if not gauntlet.match(part):
-                return ''
-
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
-            return ''
-
-        clean = []
-        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
-            if not value:
-                continue
-            if prop.lower() in self.allowed_css_properties:
-                clean.append(prop + ': ' + value + ';')
-            elif prop.lower() in self.allowed_svg_properties:
-                clean.append(prop + ': ' + value + ';')
-
-        return ' '.join(clean)
-
-
-class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
-                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
-                               lowercaseElementName, lowercaseAttrName,
-                               **kwargs)
-
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            token = self.sanitize_token(token)
-            if token:
-                yield token
diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 83becd8be9..7edfef2d75 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -133,14 +133,26 @@ def comments_to_html(comments):
 def merge_comments(one, two):
     return comments_to_html(one) + '\n\n' + comments_to_html(two)
 
+def sanitize_html(html):
+    if isinstance(html, bytes):
+        html = html.decode('utf-8', 'replace')
+    import html5lib
+    from html5lib.sanitizer import HTMLSanitizer
+    from html5lib.serializer.htmlserializer import HTMLSerializer
+    from html5lib.treebuilders.etree_lxml import TreeBuilder
+    from html5lib.treewalkers.lxmletree import TreeWalker
+    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
+    tree = parser.parseFragment(html)
+    serializer = HTMLSerializer(quote_attr_values=True, alphabetical_attributes=False, omit_optional_tags=False)
+    stream = TreeWalker(tree)
+    return serializer.render(stream)
+
 def sanitize_comments_html(html):
     from calibre.ebooks.markdown import Markdown
-    import bleach
     text = html2text(html)
     md = Markdown()
     html = md.convert(text)
-    cleansed = re.sub(u'\n+', u'', bleach.clean(html))
-    return cleansed
+    return sanitize_html(html)
 
 def test():
     for pat, val in [
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 65bf8b6a52..16ad1b46b1 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -236,8 +236,8 @@ def test_terminal():
 def test_markdown():
     from calibre.ebooks.markdown import Markdown
     Markdown(extensions=['extra'])
-    import bleach
-    bleach.clean(u'xxx')
+    from calibre.library.comments import sanitize_html
+    sanitize_html(b'''xxx''')
     print('Markdown OK!')
 
 def test():
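
Note: the sanitizing approach this patch switches to can be exercised on its
own. Below is a minimal standalone sketch of the same technique used by the
new sanitize_html() above, written against the pre-1.0 html5lib API that this
patch targets; the function name and sample input here are illustrative, not
part of the patch:

    # Parse with html5lib's sanitizing tokenizer, then re-serialize the tree.
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    def sanitize_fragment(html):
        if isinstance(html, bytes):
            html = html.decode('utf-8', 'replace')
        # HTMLSanitizer drops disallowed attributes (e.g. onclick) and
        # escapes disallowed tags to plain text as it tokenizes.
        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer, tree=TreeBuilder)
        tree = parser.parseFragment(html)
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        return serializer.render(TreeWalker(tree))

    print(sanitize_fragment('<p onclick="evil()">ok <script>alert(1)</script></p>'))
    # The onclick attribute is removed and the <script> markup comes back
    # escaped as text rather than as an executable element.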