import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict


class Chronicle(BasicNewsRecipe):
    """Calibre recipe for The Chronicle of Higher Education.

    Logs in when credentials are supplied, locates the first issue listed
    on the archive page, and builds one feed per section heading found in
    that issue's table of contents.
    """

    title = 'The Chronicle of Higher Education'
    __author__ = 'Rick Shang'
    description = 'Weekly news and job-information source for college and university faculty members, administrators, and students.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Only these containers are kept from each downloaded article page.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['article', 'blog-mod']}),
    ]
    # Elements stripped from the kept content.
    remove_tags = [
        dict(name='div', attrs={'class': ['related module1', 'maintitle', 'entry-utility', 'object-meta']}),
        dict(name='div', attrs={'id': ['section-nav', 'icon-row', 'enlarge-popup', 'confirm-popup']}),
        dict(name='a', attrs={'class': 'show-enlarge enlarge'}),
    ]
    no_javascript = True
    no_stylesheets = True
    needs_subscription = True

    def get_browser(self):
        """Return a browser, logged in when a username and password are set.

        The login form is selected by position (``nr=1``) on the account
        login page and submitted with the recipe's credentials.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://chronicle.com/myaccount/login')
            br.select_form(nr=1)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of feeds for the first issue on the archive page.

        Side effects: sets ``self.timefmt`` from the issue link text and,
        when a matching cover image is found, ``self.cover_url``.

        Returns:
            A list of ``(section_title, articles)`` tuples in the order the
            sections are first encountered. Each article is a dict with the
            keys ``title``, ``url``, ``description`` and ``date``.
        """
        # Go to the issue: the first <li> of the promo list is used.
        soup0 = self.index_to_soup('http://chronicle.com/section/Archives/39/')
        issue = soup0.find('ul', attrs={'class': 'feature-promo-list'}).li
        issueurl = "http://chronicle.com" + issue.a['href']

        # Find date: the text after the last ': ' in the issue link.
        dates = self.tag_to_string(issue.a).split(': ')[-1]
        self.timefmt = u' [%s]' % dates

        # Find cover. The cover is optional, so a missing sidebar must not
        # abort the whole download.
        side = soup0.find('div', attrs={'class': 'side-content'})
        cover = None
        if side is not None:
            cover = side.find(attrs={'src': re.compile("photos/biz/Current")})
        if cover is not None:
            if "chronicle.com" in cover['src']:
                self.cover_url = cover['src']
            else:
                self.cover_url = "http://chronicle.com" + cover['src']

        # Go to the main body
        soup = self.index_to_soup(issueurl)
        div = soup.find('div', attrs={'id': 'article-body'})

        feeds = OrderedDict()
        for post in div.findAll('li'):
            a = post.find('a', href=True)
            if a is None:
                # List items without a link carry no article.
                continue
            title = self.tag_to_string(a)
            url = "http://chronicle.com" + a['href'].strip()
            # The section is the nearest preceding <h3>, else the nearest <h4>.
            sectiontitle = post.findPrevious('h3')
            if sectiontitle is None:
                sectiontitle = post.findPrevious('h4')
            section_title = self.tag_to_string(sectiontitle)
            desc = self.tag_to_string(post.find('p'))
            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': desc, 'date': ''}
            )

        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        return list(feeds.items())

    def preprocess_html(self, soup):
        """Swap Tableau placeholders for their <noscript> link and drop the credit.

        Each ``div.tableauPlaceholder`` is replaced by the ``<a>`` inside its
        ``<noscript>`` element. Placeholders without such a link are left
        untouched. Matches for 'Powered by Tableau' are removed.
        """
        for div in soup.findAll('div', attrs={'class': 'tableauPlaceholder'}):
            noscript = div.find('noscript')
            fallback = noscript.a if noscript is not None else None
            if fallback is None:
                # Nothing to substitute; keep the placeholder rather than crash.
                continue
            div.replaceWith(fallback)
        for div0 in soup.findAll('div', text='Powered by Tableau'):
            div0.extract()
        return soup