From 6c6e8e97a1a37b40b2b3040455d4ce4b18dfa2fe Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 15 Jul 2014 23:47:34 +0530
Subject: [PATCH] Update The Hindu

---
 recipes/hindu.recipe | 73 ++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe
index 82cef40e1f..71c057eff2 100644
--- a/recipes/hindu.recipe
+++ b/recipes/hindu.recipe
@@ -3,52 +3,73 @@
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal '
 
 from calibre.web.feeds.news import BasicNewsRecipe
+import string
 
 class TheHindu(BasicNewsRecipe):
     title = u'The Hindu'
     language = 'en_IN'
 
-    oldest_article = 7
+    oldest_article = 1
     __author__ = 'Kovid Goyal'
     max_articles_per_feed = 100
     no_stylesheets = True
     auto_cleanup = True
-    extra_css = '.photo-caption { font-size: smaller }'
 
     def parse_index(self):
         soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
-        div = soup.find('div', attrs={'id':'left-column'})
-        soup.find(id='subnav-tpbar').extract()
+        nav_div = soup.find(id='tpnav-bar')
+        section_list = []
+        # Finding all the section titles that are acceptable
+        for x in nav_div.findAll(['a']):
+            if self.is_accepted_entry(x):
+                section_list.append((string.capwords(self.tag_to_string(x)), x['href']))
 
-
-        current_section = None
-        current_articles = []
+        # For each section title, fetch the article urls
         feeds = []
-        for x in div.findAll(['a', 'span']):
-            if x.name == 'span' and x['class'] == 's-link':
-                # Section heading found
-                if current_articles and current_section:
-                    feeds.append((current_section, current_articles))
-                current_section = self.tag_to_string(x)
-                current_articles = []
-                self.log('\tFound section:', current_section)
-            elif x.name == 'a':
+        for section in section_list:
+            section_title = section[0]
+            section_url = section[1]
+            soup = self.index_to_soup(section_url)
+            current_articles = []
 
-                title = self.tag_to_string(x)
-                url = x.get('href', False)
-                if not url or not title:
-                    continue
-                self.log('\t\tFound article:', title)
-                self.log('\t\t\t', url)
-                current_articles.append({'title': title, 'url':url,
-                    'description':'', 'date':''})
+            div = soup.find('div', attrs={'id':'left-column'})
+            soup.find('div', attrs={'class':'newsection-title'}).extract()
+            soup.find('div', attrs={'id':'tpnav-bar'}).extract()
 
-        if current_articles and current_section:
-            feeds.append((current_section, current_articles))
+            for x in div.findAll(['a']):
+                title = self.tag_to_string(x)
+                url = x.get('href', False)
+                if not url or not title:
+                    continue
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+            feeds.append((section_title, current_articles))
 
         return feeds
+    def is_accepted_entry(self, entry):
+        # Those sections in the top nav bar that we will omit
+        omit_list = ['tp-tamilnadu',
+                     'tp-karnataka',
+                     'tp-kerala',
+                     'tp-andhrapradesh',
+                     'tp-newdelhi',
+                     'tp-otherstates',
+                     'tp-miscellaneous',
+                     'tp-in-school',
+                     'tp-metroplus',
+                     'tp-bookreview']
+
+        is_accepted = True
+        for omit_entry in omit_list:
+            if entry['href'][0:-1].endswith(omit_entry):
+                is_accepted = False
+                break
+        return is_accepted
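
A quick way to sanity-check the new is_accepted_entry() filter outside of calibre is the minimal sketch below. It only reimplements the endswith() match against omit_list; the omit_list values come from the patch, while the example hrefs are hypothetical illustrations, not URLs taken from the patch.

    # Minimal sketch of the nav-bar filter added in this patch.
    # omit_list is copied from the patch; the example hrefs are made up.
    omit_list = ['tp-tamilnadu', 'tp-karnataka', 'tp-kerala', 'tp-andhrapradesh',
                 'tp-newdelhi', 'tp-otherstates', 'tp-miscellaneous',
                 'tp-in-school', 'tp-metroplus', 'tp-bookreview']

    def is_accepted(href):
        # Mirror entry['href'][0:-1].endswith(...): drop the trailing slash,
        # then reject the link if it ends with one of the omitted section slugs.
        return not any(href[0:-1].endswith(slug) for slug in omit_list)

    if __name__ == '__main__':
        for href in ('http://example.com/todays-paper/tp-national/',
                     'http://example.com/todays-paper/tp-national/tp-kerala/'):
            print(href, '->', 'keep' if is_accepted(href) else 'skip')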