Use html5-parser to construct the soup in the news download system

This commit is contained in:
Kovid Goyal 2017-07-09 21:43:15 +05:30
parent 636cb5e654
commit 7aa05e70d3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 24 additions and 19 deletions

View File

@ -685,21 +685,20 @@ class BasicNewsRecipe(Recipe):
_raw = self.encoding(_raw)
else:
_raw = _raw.decode(self.encoding, 'replace')
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(_raw, unicode):
_raw = strip_encoding_declarations(_raw)
else:
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
_raw = clean_xml_chars(_raw)
if as_tree:
from html5parser import parse
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(_raw, unicode):
_raw = strip_encoding_declarations(_raw)
else:
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
return parse(clean_xml_chars(_raw))
massage = list(BeautifulSoup.MARKUP_MASSAGE)
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
massage.append((re.compile(r'&(\S+?);'), lambda match:
entity_to_unicode(match, encoding=enc)))
return BeautifulSoup(_raw, markupMassage=massage)
return parse(_raw)
else:
from html5_parser.soup import set_soup_module, parse
set_soup_module(sys.modules[BeautifulSoup.__module__])
return parse(_raw, return_root=False)
def extract_readable_article(self, html, url):
'''

View File

@ -7,11 +7,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
import sys, socket, os, urlparse, re, time, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from base64 import b64decode
from html5_parser.soup import set_soup_module, parse
from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows
from calibre.utils.filenames import ascii_filename
@ -167,20 +169,24 @@ class RecursiveFetcher(object):
self.job_info = job_info
def get_soup(self, src, url=None):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage = []
nmassage.extend(self.preprocess_regexps)
# Some websites have buggy doctype declarations that mess up beautifulsoup
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
# Remove comments as they can leave detritus when extracting tags leaves
# multiple nested comments
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
usrc = self.preprocess_raw_html(usrc, url)
soup = BeautifulSoup(usrc, markupMassage=nmassage)
for pat, repl in nmassage:
usrc = pat.sub(repl, usrc)
set_soup_module(sys.modules[BeautifulSoup.__module__])
soup = parse(usrc, return_root=False)
replace = self.prepreprocess_html_ext(soup)
if replace is not None:
soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
for pat, repl in nmassage:
replace = pat.sub(repl, replace)
soup = parse(replace, return_root=False)
if self.keep_only_tags:
body = Tag(soup, 'body')