mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
Use html5-parser to construct the soup in the news download system
This commit is contained in:
parent
636cb5e654
commit
7aa05e70d3
@ -685,21 +685,20 @@ class BasicNewsRecipe(Recipe):
|
|||||||
_raw = self.encoding(_raw)
|
_raw = self.encoding(_raw)
|
||||||
else:
|
else:
|
||||||
_raw = _raw.decode(self.encoding, 'replace')
|
_raw = _raw.decode(self.encoding, 'replace')
|
||||||
|
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
||||||
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
|
if isinstance(_raw, unicode):
|
||||||
|
_raw = strip_encoding_declarations(_raw)
|
||||||
|
else:
|
||||||
|
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
||||||
|
_raw = clean_xml_chars(_raw)
|
||||||
if as_tree:
|
if as_tree:
|
||||||
from html5parser import parse
|
from html5parser import parse
|
||||||
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
|
return parse(_raw)
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
else:
|
||||||
if isinstance(_raw, unicode):
|
from html5_parser.soup import set_soup_module, parse
|
||||||
_raw = strip_encoding_declarations(_raw)
|
set_soup_module(sys.modules[BeautifulSoup.__module__])
|
||||||
else:
|
return parse(_raw, return_root=False)
|
||||||
_raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
|
|
||||||
return parse(clean_xml_chars(_raw))
|
|
||||||
|
|
||||||
massage = list(BeautifulSoup.MARKUP_MASSAGE)
|
|
||||||
enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
|
|
||||||
massage.append((re.compile(r'&(\S+?);'), lambda match:
|
|
||||||
entity_to_unicode(match, encoding=enc)))
|
|
||||||
return BeautifulSoup(_raw, markupMassage=massage)
|
|
||||||
|
|
||||||
def extract_readable_article(self, html, url):
|
def extract_readable_article(self, html, url):
|
||||||
'''
|
'''
|
||||||
|
@ -7,11 +7,13 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
Fetch a webpage and its links recursively. The webpages are saved to disk in
|
Fetch a webpage and its links recursively. The webpages are saved to disk in
|
||||||
UTF-8 encoding with any charset declarations removed.
|
UTF-8 encoding with any charset declarations removed.
|
||||||
'''
|
'''
|
||||||
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
|
import sys, socket, os, urlparse, re, time, urllib2, threading, traceback
|
||||||
from urllib import url2pathname, quote
|
from urllib import url2pathname, quote
|
||||||
from httplib import responses
|
from httplib import responses
|
||||||
from base64 import b64decode
|
from base64 import b64decode
|
||||||
|
|
||||||
|
from html5_parser.soup import set_soup_module, parse
|
||||||
|
|
||||||
from calibre import browser, relpath, unicode_path
|
from calibre import browser, relpath, unicode_path
|
||||||
from calibre.constants import filesystem_encoding, iswindows
|
from calibre.constants import filesystem_encoding, iswindows
|
||||||
from calibre.utils.filenames import ascii_filename
|
from calibre.utils.filenames import ascii_filename
|
||||||
@ -167,20 +169,24 @@ class RecursiveFetcher(object):
|
|||||||
self.job_info = job_info
|
self.job_info = job_info
|
||||||
|
|
||||||
def get_soup(self, src, url=None):
|
def get_soup(self, src, url=None):
|
||||||
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
|
nmassage = []
|
||||||
nmassage.extend(self.preprocess_regexps)
|
nmassage.extend(self.preprocess_regexps)
|
||||||
# Some websites have buggy doctype declarations that mess up beautifulsoup
|
|
||||||
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
|
|
||||||
# Remove comments as they can leave detritus when extracting tags leaves
|
# Remove comments as they can leave detritus when extracting tags leaves
|
||||||
# multiple nested comments
|
# multiple nested comments
|
||||||
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
|
||||||
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
|
usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
|
||||||
usrc = self.preprocess_raw_html(usrc, url)
|
usrc = self.preprocess_raw_html(usrc, url)
|
||||||
soup = BeautifulSoup(usrc, markupMassage=nmassage)
|
for pat, repl in nmassage:
|
||||||
|
usrc = pat.sub(repl, usrc)
|
||||||
|
set_soup_module(sys.modules[BeautifulSoup.__module__])
|
||||||
|
soup = parse(usrc, return_root=False)
|
||||||
|
|
||||||
replace = self.prepreprocess_html_ext(soup)
|
replace = self.prepreprocess_html_ext(soup)
|
||||||
if replace is not None:
|
if replace is not None:
|
||||||
soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
|
replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
|
||||||
|
for pat, repl in nmassage:
|
||||||
|
replace = pat.sub(repl, replace)
|
||||||
|
soup = parse(replace, return_root=False)
|
||||||
|
|
||||||
if self.keep_only_tags:
|
if self.keep_only_tags:
|
||||||
body = Tag(soup, 'body')
|
body = Tag(soup, 'body')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user