From 9360c5833a740336e58991c3d25b71d195e67335 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 2 Sep 2012 09:35:50 +0530
Subject: [PATCH] Fix Chronicle of Higher Education

---
 recipes/chronicle_higher_ed.recipe | 32 +++++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/recipes/chronicle_higher_ed.recipe b/recipes/chronicle_higher_ed.recipe
index 7ed834a4e5..f0188d4d77 100644
--- a/recipes/chronicle_higher_ed.recipe
+++ b/recipes/chronicle_higher_ed.recipe
@@ -13,13 +13,13 @@ class Chronicle(BasicNewsRecipe):
         keep_only_tags = [
             dict(name='div', attrs={'class':'article'}),
         ]
-        remove_tags = [dict(name='div',attrs={'class':'related module1'})]
+        remove_tags = [dict(name='div',attrs={'class':['related module1','maintitle']}),
+            dict(name='div', attrs={'id':['section-nav','icon-row']})]
         no_javascript = True
         no_stylesheets = True
 
         needs_subscription = True
 
-
         def get_browser(self):
             br = BasicNewsRecipe.get_browser()
             if self.username is not None and self.password is not None:
@@ -27,7 +27,7 @@ class Chronicle(BasicNewsRecipe):
                 br.select_form(nr=1)
                 br['username'] = self.username
                 br['password'] = self.password
-                br.submit()
+                br.submit()
             return br
 
         def parse_index(self):
@@ -47,33 +47,35 @@ class Chronicle(BasicNewsRecipe):
 
             #Go to the main body
             soup = self.index_to_soup(issueurl)
-            div0 = soup.find ('div', attrs={'id':'article-body'})
+            div = soup.find ('div', attrs={'id':'article-body'})
 
             feeds = OrderedDict()
-            for div in div0.findAll('div',attrs={'class':'module1'}):
-                section_title = self.tag_to_string(div.find('h3'))
-                for post in div.findAll('li',attrs={'class':'sub-promo'}):
-                    articles = []
-                    a=post.find('a', href=True)
+            section_title = ''
+            for post in div.findAll('li'):
+                articles = []
+                a=post.find('a', href=True)
+                if a is not None:
                     title=self.tag_to_string(a)
                     url="http://chronicle.com"+a['href'].strip()
+                    sectiontitle=post.findPrevious('h3')
+                    if sectiontitle is None:
+                        sectiontitle=post.findPrevious('h4')
+                    section_title=self.tag_to_string(sectiontitle)
                     desc=self.tag_to_string(post.find('p'))
                     articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
 
-                    if articles:
-                        if section_title not in feeds:
-                            feeds[section_title] = []
-                        feeds[section_title] += articles
+                if articles:
+                    if section_title not in feeds:
+                        feeds[section_title] = []
+                    feeds[section_title] += articles
             ans = [(key, val) for key, val in feeds.iteritems()]
             return ans
 
         def preprocess_html(self,soup):
             #process all the images
             for div in soup.findAll('div', attrs={'class':'tableauPlaceholder'}):
-
                 noscripts=div.find('noscript').a
                 div.replaceWith(noscripts)
             for div0 in soup.findAll('div',text='Powered by Tableau'):
                 div0.extract()
             return soup
-