diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 8c750e43c1..6276b4e3f3 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -685,21 +685,20 @@ class BasicNewsRecipe(Recipe):
                 _raw = self.encoding(_raw)
             else:
                 _raw = _raw.decode(self.encoding, 'replace')
+        from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
+        from calibre.utils.cleantext import clean_xml_chars
+        if isinstance(_raw, unicode):
+            _raw = strip_encoding_declarations(_raw)
+        else:
+            _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
+        _raw = clean_xml_chars(_raw)
         if as_tree:
             from html5parser import parse
-            from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
-            from calibre.utils.cleantext import clean_xml_chars
-            if isinstance(_raw, unicode):
-                _raw = strip_encoding_declarations(_raw)
-            else:
-                _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
-            return parse(clean_xml_chars(_raw))
-
-        massage = list(BeautifulSoup.MARKUP_MASSAGE)
-        enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
-        massage.append((re.compile(r'&(\S+?);'), lambda match:
-            entity_to_unicode(match, encoding=enc)))
-        return BeautifulSoup(_raw, markupMassage=massage)
+            return parse(_raw)
+        else:
+            from html5_parser.soup import set_soup_module, parse
+            set_soup_module(sys.modules[BeautifulSoup.__module__])
+            return parse(_raw, return_root=False)
 
     def extract_readable_article(self, html, url):
         '''
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index c3d3559e2e..822882acd4 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -7,11 +7,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
-import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
+import sys, socket, os, urlparse, re, time, urllib2, threading, traceback
 from urllib import url2pathname, quote
 from httplib import responses
 from base64 import b64decode
 
+from html5_parser.soup import set_soup_module, parse
+
 from calibre import browser, relpath, unicode_path
 from calibre.constants import filesystem_encoding, iswindows
 from calibre.utils.filenames import ascii_filename
@@ -167,20 +169,24 @@ class RecursiveFetcher(object):
         self.job_info = job_info
 
     def get_soup(self, src, url=None):
-        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+        nmassage = []
         nmassage.extend(self.preprocess_regexps)
-        # Some websites have buggy doctype declarations that mess up beautifulsoup
-        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
         usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
         usrc = self.preprocess_raw_html(usrc, url)
-        soup = BeautifulSoup(usrc, markupMassage=nmassage)
+        for pat, repl in nmassage:
+            usrc = pat.sub(repl, usrc)
+        set_soup_module(sys.modules[BeautifulSoup.__module__])
+        soup = parse(usrc, return_root=False)
 
         replace = self.prepreprocess_html_ext(soup)
         if replace is not None:
-            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
+            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
+            for pat, repl in nmassage:
+                replace = pat.sub(repl, replace)
+            soup = parse(replace, return_root=False)
 
         if self.keep_only_tags:
             body = Tag(soup, 'body')