Update Chronicle of Higher Education recipe

This commit is contained in:
Kovid Goyal 2012-09-25 11:08:02 +05:30
parent 952f5709b0
commit 984d2b8b76

View File

@@ -1,3 +1,4 @@
import re
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict from collections import OrderedDict
@@ -14,7 +15,8 @@ class Chronicle(BasicNewsRecipe):
dict(name='div', attrs={'class':'article'}), dict(name='div', attrs={'class':'article'}),
] ]
remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}), remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}),
dict(name='div', attrs={'id':['section-nav','icon-row']})] dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup']}),
dict(name='a', attrs={'class':'show-enlarge enlarge'})]
no_javascript = True no_javascript = True
no_stylesheets = True no_stylesheets = True
@@ -31,7 +33,6 @@ class Chronicle(BasicNewsRecipe):
return br return br
def parse_index(self): def parse_index(self):
#Go to the issue #Go to the issue
soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/') soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/')
issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li
@@ -42,9 +43,12 @@ class Chronicle(BasicNewsRecipe):
self.timefmt = u' [%s]'%dates self.timefmt = u' [%s]'%dates
#Find cover #Find cover
cover=soup0.find('div',attrs={'class':'promo'}).findNext('div') cover=soup0.find('div',attrs={'class':'side-content'}).find(attrs={'src':re.compile("photos/biz/Current")})
self.cover_url="http://chronicle.com"+cover.find('img')['src'] if cover is not None:
if "chronicle.com" in cover['src']:
self.cover_url=cover['src']
else:
self.cover_url="http://chronicle.com" + cover['src']
#Go to the main body #Go to the main body
soup = self.index_to_soup(issueurl) soup = self.index_to_soup(issueurl)
div = soup.find ('div', attrs={'id':'article-body'}) div = soup.find ('div', attrs={'id':'article-body'})
@@ -74,8 +78,10 @@ class Chronicle(BasicNewsRecipe):
def preprocess_html(self,soup): def preprocess_html(self,soup):
#process all the images #process all the images
for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}): for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}):
noscripts=div.find('noscript').a noscripts=div.find('noscript').a
div.replaceWith(noscripts) div.replaceWith(noscripts)
for div0 in soup.findAll('div',text='Powered by Tableau'): for div0 in soup.findAll('div',text='Powered by Tableau'):
div0.extract() div0.extract()
return soup return soup