diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index 8c750e43c1..6276b4e3f3 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -685,21 +685,20 @@ class BasicNewsRecipe(Recipe): _raw = self.encoding(_raw) else: _raw = _raw.decode(self.encoding, 'replace') + from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode + from calibre.utils.cleantext import clean_xml_chars + if isinstance(_raw, unicode): + _raw = strip_encoding_declarations(_raw) + else: + _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0] + _raw = clean_xml_chars(_raw) if as_tree: from html5parser import parse - from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode - from calibre.utils.cleantext import clean_xml_chars - if isinstance(_raw, unicode): - _raw = strip_encoding_declarations(_raw) - else: - _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0] - return parse(clean_xml_chars(_raw)) - - massage = list(BeautifulSoup.MARKUP_MASSAGE) - enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding - massage.append((re.compile(r'&(\S+?);'), lambda match: - entity_to_unicode(match, encoding=enc))) - return BeautifulSoup(_raw, markupMassage=massage) + return parse(_raw) + else: + from html5_parser.soup import set_soup_module, parse + set_soup_module(sys.modules[BeautifulSoup.__module__]) + return parse(_raw, return_root=False) def extract_readable_article(self, html, url): ''' diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index c3d3559e2e..822882acd4 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -7,11 +7,13 @@ __copyright__ = '2008, Kovid Goyal ' Fetch a webpage and its links recursively. The webpages are saved to disk in UTF-8 encoding with any charset declarations removed. ''' -import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback +import sys, socket, os, urlparse, re, time, urllib2, threading, traceback from urllib import url2pathname, quote from httplib import responses from base64 import b64decode +from html5_parser.soup import set_soup_module, parse + from calibre import browser, relpath, unicode_path from calibre.constants import filesystem_encoding, iswindows from calibre.utils.filenames import ascii_filename @@ -167,20 +169,24 @@ class RecursiveFetcher(object): self.job_info = job_info def get_soup(self, src, url=None): - nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) + nmassage = [] nmassage.extend(self.preprocess_regexps) - # Some websites have buggy doctype declarations that mess up beautifulsoup - nmassage += [(re.compile(r'', re.DOTALL|re.IGNORECASE), lambda m: '')] # Remove comments as they can leave detritus when extracting tags leaves # multiple nested comments nmassage.append((re.compile(r'', re.DOTALL), lambda m: '')) usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0] usrc = self.preprocess_raw_html(usrc, url) - soup = BeautifulSoup(usrc, markupMassage=nmassage) + for pat, repl in nmassage: + usrc = pat.sub(repl, usrc) + set_soup_module(sys.modules[BeautifulSoup.__module__]) + soup = parse(usrc, return_root=False) replace = self.prepreprocess_html_ext(soup) if replace is not None: - soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage) + replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0] + for pat, repl in nmassage: + replace = pat.sub(repl, replace) + soup = parse(replace, return_root=False) if self.keep_only_tags: body = Tag(soup, 'body')