diff --git a/recipes/chronicle_higher_ed.recipe b/recipes/chronicle_higher_ed.recipe index f0188d4d77..15b284cd7a 100644 --- a/recipes/chronicle_higher_ed.recipe +++ b/recipes/chronicle_higher_ed.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.recipes import BasicNewsRecipe from collections import OrderedDict @@ -14,7 +15,8 @@ class Chronicle(BasicNewsRecipe): dict(name='div', attrs={'class':'article'}), ] remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}), - dict(name='div', attrs={'id':['section-nav','icon-row']})] + dict(name='div', attrs={'id':['section-nav','icon-row', 'enlarge-popup']}), + dict(name='a', attrs={'class':'show-enlarge enlarge'})] no_javascript = True no_stylesheets = True @@ -31,7 +33,6 @@ class Chronicle(BasicNewsRecipe): return br def parse_index(self): - #Go to the issue soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/') issue = soup0.find('ul',attrs={'class':'feature-promo-list'}).li @@ -42,9 +43,12 @@ class Chronicle(BasicNewsRecipe): self.timefmt = u' [%s]'%dates #Find cover - cover=soup0.find('div',attrs={'class':'promo'}).findNext('div') - self.cover_url="http://chronicle.com"+cover.find('img')['src'] - + cover=soup0.find('div',attrs={'class':'side-content'}).find(attrs={'src':re.compile("photos/biz/Current")}) + if cover is not None: + if "chronicle.com" in cover['src']: + self.cover_url=cover['src'] + else: + self.cover_url="http://chronicle.com" + cover['src'] #Go to the main body soup = self.index_to_soup(issueurl) div = soup.find ('div', attrs={'id':'article-body'}) @@ -74,8 +78,10 @@ class Chronicle(BasicNewsRecipe): def preprocess_html(self,soup): #process all the images for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}): + noscripts=div.find('noscript').a div.replaceWith(noscripts) for div0 in soup.findAll('div',text='Powered by Tableau'): div0.extract() return soup +