mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Chronicle of Higher Education
This commit is contained in:
parent
952f5709b0
commit
984d2b8b76
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
@ -14,7 +15,8 @@ class Chronicle(BasicNewsRecipe):
|
|||||||
dict(name='div', attrs={'class':'article'}),
|
dict(name='div', attrs={'class':'article'}),
|
||||||
]
|
]
|
||||||
remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}),
|
remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}),
|
||||||
dict(name='div', attrs={'id':['section-nav','icon-row']})]
|
dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup']}),
|
||||||
|
dict(name='a', attrs={'class':'show-enlarge enlarge'})]
|
||||||
no_javascript = True
|
no_javascript = True
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
|
||||||
@ -31,7 +33,6 @@ class Chronicle(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
|
|
||||||
#Go to the issue
|
#Go to the issue
|
||||||
soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/')
|
soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/')
|
||||||
issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li
|
issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li
|
||||||
@ -42,9 +43,12 @@ class Chronicle(BasicNewsRecipe):
|
|||||||
self.timefmt = u' [%s]'%dates
|
self.timefmt = u' [%s]'%dates
|
||||||
|
|
||||||
#Find cover
|
#Find cover
|
||||||
cover=soup0.find('div',attrs={'class':'promo'}).findNext('div')
|
cover=soup0.find('div',attrs={'class':'side-content'}).find(attrs={'src':re.compile("photos/biz/Current")})
|
||||||
self.cover_url="http://chronicle.com"+cover.find('img')['src']
|
if cover is not None:
|
||||||
|
if "chronicle.com" in cover['src']:
|
||||||
|
self.cover_url=cover['src']
|
||||||
|
else:
|
||||||
|
self.cover_url="http://chronicle.com" + cover['src']
|
||||||
#Go to the main body
|
#Go to the main body
|
||||||
soup = self.index_to_soup(issueurl)
|
soup = self.index_to_soup(issueurl)
|
||||||
div = soup.find ('div', attrs={'id':'article-body'})
|
div = soup.find ('div', attrs={'id':'article-body'})
|
||||||
@ -74,8 +78,10 @@ class Chronicle(BasicNewsRecipe):
|
|||||||
def preprocess_html(self,soup):
|
def preprocess_html(self,soup):
|
||||||
#process all the images
|
#process all the images
|
||||||
for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}):
|
for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}):
|
||||||
|
|
||||||
noscripts=div.find('noscript').a
|
noscripts=div.find('noscript').a
|
||||||
div.replaceWith(noscripts)
|
div.replaceWith(noscripts)
|
||||||
for div0 in soup.findAll('div',text='Powered by Tableau'):
|
for div0 in soup.findAll('div',text='Powered by Tableau'):
|
||||||
div0.extract()
|
div0.extract()
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user