mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Use html5-parser to construct the soup in the news download system
This commit is contained in:
parent
636cb5e654
commit
7aa05e70d3
@ -685,21 +685,20 @@ class BasicNewsRecipe(Recipe):
|
||||
_raw = self.encoding(_raw)
|
||||
else:
|
||||
_raw = _raw.decode(self.encoding, 'replace')
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
if isinstance(_raw, unicode):
|
||||
_raw = strip_encoding_declarations(_raw)
|
||||
else:
|
||||
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
_raw = clean_xml_chars(_raw)
|
||||
if as_tree:
|
||||
from html5parser import parse
|
||||
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
if isinstance(_raw, unicode):
|
||||
_raw = strip_encoding_declarations(_raw)
|
||||
else:
|
||||
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
return parse(clean_xml_chars(_raw))
|
||||
|
||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
||||
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
|
||||
massage.append((re.compile(r'&(\S+?);'), lambda match:
|
||||
entity_to_unicode(match, encoding=enc)))
|
||||
return BeautifulSoup(_raw, markupMassage=massage)
|
||||
return parse(_raw)
|
||||
else:
|
||||
from html5_parser.soup import set_soup_module, parse
|
||||
set_soup_module(sys.modules[BeautifulSoup.__module__])
|
||||
return parse(_raw, return_root=False)
|
||||
|
||||
def extract_readable_article(self, html, url):
|
||||
'''
|
||||
|
@ -7,11 +7,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
Fetch a webpage and its links recursively. The webpages are saved to disk in
|
||||
UTF-8 encoding with any charset declarations removed.
|
||||
'''
|
||||
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
|
||||
import sys, socket, os, urlparse, re, time, urllib2, threading, traceback
|
||||
from urllib import url2pathname, quote
|
||||
from httplib import responses
|
||||
from base64 import b64decode
|
||||
|
||||
from html5_parser.soup import set_soup_module, parse
|
||||
|
||||
from calibre import browser, relpath, unicode_path
|
||||
from calibre.constants import filesystem_encoding, iswindows
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
@ -167,20 +169,24 @@ class RecursiveFetcher(object):
|
||||
self.job_info = job_info
|
||||
|
||||
def get_soup(self, src, url=None):
|
||||
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
||||
nmassage = []
|
||||
nmassage.extend(self.preprocess_regexps)
|
||||
# Some websites have buggy doctype declarations that mess up BeautifulSoup
|
||||
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
|
||||
# Remove comments up front: when extracting tags leaves multiple nested
|
||||
# comments behind, they can leave detritus
|
||||
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
||||
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
|
||||
usrc = self.preprocess_raw_html(usrc, url)
|
||||
soup = BeautifulSoup(usrc, markupMassage=nmassage)
|
||||
for pat, repl in nmassage:
|
||||
usrc = pat.sub(repl, usrc)
|
||||
set_soup_module(sys.modules[BeautifulSoup.__module__])
|
||||
soup = parse(usrc, return_root=False)
|
||||
|
||||
replace = self.prepreprocess_html_ext(soup)
|
||||
if replace is not None:
|
||||
soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
|
||||
replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
|
||||
for pat, repl in nmassage:
|
||||
replace = pat.sub(repl, replace)
|
||||
soup = parse(replace, return_root=False)
|
||||
|
||||
if self.keep_only_tags:
|
||||
body = Tag(soup, 'body')
|
||||
|
Loading…
x
Reference in New Issue
Block a user