#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


class scroll(BasicNewsRecipe):
    """Recipe that builds its feeds from links on the Scroll.in front page.

    The front page is fetched once; every anchor whose ``href`` starts with
    ``https://scroll.in/<section>/`` is collected into the feed for that
    section (see ``parse_index``).
    """

    title = 'Scroll.in'
    __author__ = 'unkn0wn'
    description = (
        'The leading destination for original reporting on news, politics, and culture in India. '
        'Our award-winning team of journalists brings readers insightful analysis and opinion on the day’s '
        'headlines alongside a fresh mix of features on music, books, and cinema.'
    )
    language = 'en_IN'
    masthead_url = 'https://scroll.in/static/assets/scroll-logo.0f68c78dd023e2598248ea107feba562.003.svg'

    no_stylesheets = True
    remove_javascript = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_attributes = ['style', 'height', 'width']

    extra_css = '''
        .orange-tag, .article-meta-container { font-size:small; }
        .featured-image, .cms-block-image { text-align:center; font-size:small; }
    '''

    keep_only_tags = [
        dict(name='header'),
        classes('featured-image article-body')
    ]

    remove_tags = [classes('comments-entry-point-meta')]

    def parse_index(self):
        """Collect article links from the front page, grouped by section.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``'title'`` and ``'url'`` keys. Sections
            for which no link was found are omitted.
        """
        index = 'https://scroll.in/'
        sections = [
            'article', 'magazine'
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            # Only links that live under this section's path are of interest.
            prefix = index + sec + '/'
            articles = []
            for a in soup.findAll('a', attrs={'href': lambda x: x and x.startswith(prefix)}):
                # Drop the query string so the same page is not listed under
                # several tracking variants of its URL.
                url = a['href'].split('?')[0]
                # A link to the section landing page itself is not an article.
                if url in {prefix, index + sec}:
                    continue
                title = self.tag_to_string(a)
                # An anchor with no text (e.g. one wrapping only an image)
                # would otherwise produce an untitled entry.
                if not title or not title.strip():
                    continue
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def populate_article_metadata(self, article, soup, first):
        """Use the text of the page's ``<h2>``, if any, as the article summary.

        Args:
            article: Article object whose ``summary`` and ``text_summary``
                attributes are set when an ``<h2>`` is found.
            soup: Parsed HTML of the article page.
            first: Unused here.
        """
        subhead = soup.find('h2')
        if subhead:
            article.summary = article.text_summary = self.tag_to_string(subhead)