from __future__ import with_statement

__license__ = 'GPL 3'
__copyright__ = '2013, dhiru '

import time

from calibre.web.feeds.news import BasicNewsRecipe


class TheHindu(BasicNewsRecipe):
    """Calibre news recipe for The Business Line "today's paper" index."""

    title = u'The Business Line'
    language = 'en_IN'

    oldest_article = 7
    __author__ = 'Dhiru'
    max_articles_per_feed = 100
    no_stylesheets = True

    keep_only_tags = [dict(id='content')]
    remove_tags = [
        dict(attrs={'class': ['article-links', 'breadcr']}),
        dict(id=['email-section', 'right-column', 'printfooter', 'topover',
                 'slidebox', 'th_footer']),
    ]

    extra_css = '.photo-caption { font-size: smaller }'

    def preprocess_raw_html(self, raw, url):
        """Return *raw* after two chained literal string replacements.

        *url* is accepted but not used.
        """
        # NOTE(review): in this copy of the file both the search and the
        # replacement literals consist only of two newlines, which makes the
        # replaces no-ops. The original literals were presumably HTML
        # fragments that were lost -- restore them from the upstream recipe.
        return raw.replace('\n\n', '\n\n').replace('\n\n', '\n\n')

    def postprocess_html(self, soup, first_fetch):
        """Rename every table/tr/td/center tag in *soup* to div.

        Returns the same (mutated) soup object. *first_fetch* is not used.
        """
        for t in soup.findAll(['table', 'tr', 'td', 'center']):
            t.name = 'div'
        return soup

    @staticmethod
    def _has_css_class(tag, name):
        """Return True if *tag* carries the CSS class *name*.

        The ``class`` attribute may be a whitespace-separated string or a
        list of class names depending on the parser in use, so both forms
        are accepted.
        """
        value = tag.get('class') or ''
        if not isinstance(value, (list, tuple)):
            value = value.split()
        return name in value

    def parse_index(self):
        """Build the feed list from today's index page.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``url``, ``title``,
        ``date`` and ``description``. Sections without articles are omitted.
        """
        today = time.strftime('%Y-%m-%d')
        soup = self.index_to_soup(
            'http://www.thehindubusinessline.com/todays-paper/tp-index/?date=' + today)
        div = soup.find(id='left-column')
        feeds = []
        if div is None:
            # The expected container is missing; report no feeds instead of
            # failing with an AttributeError on None.
            return feeds

        current_section = None
        current_articles = []
        for x in div.findAll(['h3', 'div']):
            if current_section and self._has_css_class(x, 'tpaper'):
                a = x.find('a', href=True)
                if a is not None:
                    current_articles.append({
                        'url': a['href'] + '?css=print',
                        'title': self.tag_to_string(a),
                        'date': '',
                        'description': '',
                    })
            if x.name == 'h3':
                # A new heading closes the section collected so far.
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                current_articles = []

        # The last section has no following heading to trigger the flush
        # above, so it must be emitted here or its articles are lost.
        if current_section and current_articles:
            feeds.append((current_section, current_articles))
        return feeds