diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index 74c0d6539a..82cef40e1f 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -2,7 +2,6 @@ from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' -import time from calibre.web.feeds.news import BasicNewsRecipe class TheHindu(BasicNewsRecipe): @@ -14,44 +13,42 @@ class TheHindu(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True - keep_only_tags = [dict(id='content')] - remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}), - dict(id=['email-section', 'right-column', 'printfooter', 'topover', - 'slidebox', 'th_footer'])] + auto_cleanup = True + extra_css = '.photo-caption { font-size: smaller }' - def preprocess_raw_html(self, raw, url): - return raw.replace('

', '

').replace('

', '

') - - def postprocess_html(self, soup, first_fetch): - for t in soup.findAll(['table', 'tr', 'td','center']): - t.name = 'div' - return soup - def parse_index(self): - today = time.strftime('%Y-%m-%d') - soup = self.index_to_soup( - 'http://www.thehindu.com/todays-paper/tp-index/?date=' + today) - div = soup.find(id='left-column') - feeds = [] + soup = self.index_to_soup('http://www.thehindu.com/todays-paper/') + div = soup.find('div', attrs={'id':'left-column'}) + soup.find(id='subnav-tpbar').extract() + + + current_section = None current_articles = [] - for x in div.findAll(['h3', 'div']): - if current_section and x.get('class', '') == 'tpaper': - a = x.find('a', href=True) - if a is not None: - title = self.tag_to_string(a) - self.log('\tFound article:', title) - current_articles.append({'url':a['href']+'?css=print', - 'title':title, 'date': '', - 'description':''}) - if x.name == 'h3': - if current_section and current_articles: + feeds = [] + for x in div.findAll(['a', 'span']): + if x.name == 'span' and x['class'] == 's-link': + # Section heading found + if current_articles and current_section: feeds.append((current_section, current_articles)) current_section = self.tag_to_string(x) - self.log('Found section:', current_section) current_articles = [] + self.log('\tFound section:', current_section) + elif x.name == 'a': + + title = self.tag_to_string(x) + url = x.get('href', False) + if not url or not title: + continue + self.log('\t\tFound article:', title) + self.log('\t\t\t', url) + current_articles.append({'title': title, 'url':url, + 'description':'', 'date':''}) + + if current_articles and current_section: + feeds.append((current_section, current_articles)) + return feeds -