# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html

# Section headings from the headlines page whose articles are kept;
# parse_index() drops every section whose heading is not listed here.
allowed_sections = {
    'Top Headlines', 'Opinion', 'Science', 'Education', 'US', 'Pakistan',
    'India Business', 'Tech News', 'Cricket', 'Bollywood',
}


class TimesOfIndia(BasicNewsRecipe):
    """Recipe that builds its article list from the Times of India
    headlines page, keeping only the sections in ``allowed_sections``."""

    title = u'Times of India Headlines'
    language = 'en'
    description = 'Headline news from the Indian daily Times of India'
    __author__ = 'Kovid Goyal'

    # The settings below are consumed by BasicNewsRecipe; their exact
    # semantics are defined by calibre, not by this file.
    no_stylesheets = True
    no_javascript = True
    keep_only_tags = [
        dict(name='h1'),
        dict(id=['storydiv', 'contentarea']),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'video_list', 'rightpart', 'clearfix mTop15', 'footer_slider',
            'read_more', 'flR', 'hide_new']}),
        dict(name='div', attrs={'id': [
            'most_pop', 'relartstory', 'slidebox', 'tmpFbokk',
            'twittersource', 'reportAbuseDiv', 'result', 'yahoobuzzsyn',
            'fb-root']}),
        dict(style='float:right;margin-left:5px;'),
    ]

    def parse_index(self):
        """Scrape the headlines page into a list of feeds.

        Walks the page's table of contents in document order: every
        ``<h3>`` starts a new section and every ``<ul class="content">``
        contributes its linked ``<li>`` entries to the section currently
        open.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
            Only non-empty sections named in ``allowed_sections`` are
            included.

        Raises:
            IndexError: if the expected table of contents is not found in
                the page (for example after a site layout change).
        """
        index = 'http://timesofindia.indiatimes.com/home/headlines'
        # presumably raw=True yields the unparsed page markup, which is then
        # parsed with lxml below -- confirm against the calibre API.
        raw = self.index_to_soup(index, raw=True)
        root = html.fromstring(raw)

        def text_of(node):
            # 'unicode' (the string) asks lxml for a text result on both
            # Python 2 and 3; the bare builtin `unicode` used previously
            # does not exist on Python 3 and raised NameError.
            return html.tostring(
                node, method='text', encoding='unicode').strip()

        feeds = []
        current_section = None
        current_articles = []

        toc = root.xpath(
            '//div[@align="center"]/descendant::table[@class="cnt"]')[0]
        for x in toc.xpath(
                'descendant::*[name()="h3" or '
                '(name()="ul" and @class="content")]'):
            if x.tag == 'h3':
                # A new heading closes the previous section; keep it only
                # if it has articles and is one of the allowed sections.
                if current_articles and current_section in allowed_sections:
                    feeds.append((current_section, current_articles))
                current_section = text_of(x)
                current_articles = []
                self.log(current_section)
            else:
                for a in x.xpath('descendant::li/descendant::a[@href]'):
                    title = text_of(a)
                    url = a.get('href')
                    # Site-relative links are made absolute.
                    if url.startswith('/'):
                        url = 'http://timesofindia.indiatimes.com' + url
                    self.log(' ', title)
                    current_articles.append({'title': title, 'url': url})
                self.log('')

        # The last section has no following <h3> to close it.
        if current_articles and current_section in allowed_sections:
            feeds.append((current_section, current_articles))

        return feeds