mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #936728 (Christian Science Monitor no longer working)
This commit is contained in:
parent
ac2cc2834c
commit
3392ddc51f
@ -33,6 +33,32 @@ class ChristianScienceMonitor(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
requires_version = (0, 8, 39)
|
||||||
|
|
||||||
|
def preprocess_raw_html(self, raw, url):
    """Re-parse ``raw`` with html5lib and return cleaned, serialized markup.

    The document is parsed into an lxml tree, every ``script``, ``style``,
    ``noscript``, ``meta``, ``link`` and ``object`` element and every
    comment is removed, and the tree is serialized back to a unicode
    string. Anything that precedes the ``<html`` tag in the serialized
    output is dropped.

    :param raw: the page markup to clean.
    :param url: not used by this implementation.
    :return: the cleaned markup as a unicode string starting at ``<html``.
    :raises Exception: any failure is printed via ``traceback.print_exc()``
        and then re-raised unchanged.
    """
    try:
        from html5lib import parse
        from lxml import etree

        def discard(node):
            # lxml's remove() drops node.tail together with the node, so
            # text that directly follows an inline <script> or a comment
            # would vanish. Re-attach that text to the preceding sibling
            # (or to the parent when there is none) before removing.
            parent = node.getparent()
            if node.tail:
                previous = node.getprevious()
                if previous is None:
                    parent.text = (parent.text or '') + node.tail
                else:
                    previous.tail = (previous.tail or '') + node.tail
            parent.remove(node)

        root = parse(raw, namespaceHTMLElements=False,
                     treebuilder='lxml').getroot()
        for tag in root.xpath(
                '//script|//style|//noscript|//meta|//link|//object'):
            discard(tag)
        # Materialize the iterator first: the tree is mutated in the loop.
        for elem in list(root.iterdescendants(tag=etree.Comment)):
            discard(elem)
        ans = etree.tostring(root, encoding=unicode)
        # Greedy match: keep only what starts at the last '<html'.
        ans = re.sub('.*<html', '<html', ans, flags=re.DOTALL)
        return ans
    except Exception:
        # Print the traceback before propagating the error unchanged.
        import traceback
        traceback.print_exc()
        raise
|
||||||
|
|
||||||
|
def index_to_soup(self, url):
    """Fetch ``url``, clean the markup, and hand it to the base class.

    The page is requested through ``BasicNewsRecipe.index_to_soup`` with
    ``raw=True``, decoded as UTF-8, passed through
    ``self.preprocess_raw_html`` and the cleaned text is given back to
    ``BasicNewsRecipe.index_to_soup``, whose result is returned
    (presumably the parsed soup -- confirm against the base class).
    """
    base_index_to_soup = BasicNewsRecipe.index_to_soup
    page_bytes = base_index_to_soup(self, url, raw=True)
    page_text = page_bytes.decode('utf-8')
    cleaned = self.preprocess_raw_html(page_text, url)
    return base_index_to_soup(self, cleaned)
|
||||||
|
|
||||||
def append_page(self, soup, appendtag, position):
|
def append_page(self, soup, appendtag, position):
|
||||||
nav = soup.find('div',attrs={'class':'navigation'})
|
nav = soup.find('div',attrs={'class':'navigation'})
|
||||||
@ -78,14 +104,6 @@ class ChristianScienceMonitor(BasicNewsRecipe):
|
|||||||
print_soup = soup
|
print_soup = soup
|
||||||
return print_soup
|
return print_soup
|
||||||
|
|
||||||
# (compiled pattern, replacement callable) pairs, listed in this order:
# strip comments, drop everything between <body> and the story div,
# drop the pubdate div, and cut the "Full HTML version" trailer.
# Every pattern is case-insensitive and lets '.' match newlines.
preprocess_regexps = [
    (re.compile(r'<!--.*?-->', re.IGNORECASE | re.DOTALL),
     lambda found: ''),
    (re.compile(r'<body.*?<div id="story"', re.IGNORECASE | re.DOTALL),
     lambda found: '<body><div id="story"'),
    (re.compile(r'<div class="pubdate">.*?</div>',
                re.IGNORECASE | re.DOTALL),
     lambda found: ''),
    (re.compile(r'Full HTML version of this story which may include photos, graphics, and related links.*</body>',
                re.IGNORECASE | re.DOTALL),
     lambda found: '</body>'),
]
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
|
h1{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: large}
|
||||||
.sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}
|
.sub{ color:#000000;font-family: Georgia,Times,"Times New Roman",serif; font-size: small;}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user