#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


class inc42(BasicNewsRecipe):
    """Calibre recipe that builds its feeds from links on the Inc42 home page."""

    title = 'Inc42'
    __author__ = 'unkn0wn'
    description = 'Inc42 is India’s largest tech media platform working with the mission to accelerate the GDP of India’s tech & startup economy.'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en_IN'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://omcdn.inc42.com/users/d0ffd8ffa0d2/images/4477fc48bee71659696918-color-black-1-e1576150264134.png?width=224'

    keep_only_tags = [
        classes('entry-header entry-content'),
    ]

    remove_tags = [
        dict(name='button'),
        classes('also-read slick-list slides-three common-card'),
    ]

    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True

    def parse_index(self):
        """Collect article links for each section from the home page.

        The home page is fetched once; for every section name, anchors whose
        ``href`` starts with ``https://inc42.com/<section>/`` are gathered.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of dicts with ``title`` and ``url`` keys. Sections that
            yield no articles are left out.
        """
        index = 'https://inc42.com/'
        sections = ['features', 'buzz', 'startups', 'resources']
        soup = self.index_to_soup(index)

        feeds = []
        for sec in sections:
            section = sec.capitalize()
            self.log(section)

            # Loop-invariant for this section: compute the URL prefix once
            # instead of once per anchor inside the filter.
            prefix = index + sec + '/'
            anchors = soup.findAll(
                'a',
                attrs={'href': lambda x, prefix=prefix: x and x.startswith(prefix)},
            )

            articles = []
            for a in anchors:
                url = a['href']
                # A link to the section landing page itself is not an article.
                if url == prefix:
                    continue
                title = self.tag_to_string(a).strip()
                # Anchors without visible text (e.g. ones wrapping only an
                # image) would otherwise become blank-titled entries.
                # NOTE(review): with URL de-duplication enabled such an entry
                # may also displace the titled link to the same article -
                # assumes calibre keeps the first occurrence; confirm.
                if not title:
                    continue
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})

            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_html(self, soup):
        """Copy each image's ``data-src`` value into ``src`` and return the soup.

        Only ``img`` tags that carry a ``data-src`` attribute are touched.
        """
        # Presumably the site lazy-loads images and keeps the real URL in
        # data-src - verify against the live markup.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup