from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe, classes


class inc42(BasicNewsRecipe):
    """Recipe that collects Inc42 articles through Google News RSS searches.

    The feeds point at ``news.google.com`` search results restricted to
    ``inc42.com`` URLs, so every feed entry has to be resolved to the real
    article page first; ``get_obfuscated_article`` does that.
    """

    title = 'Inc42'
    __author__ = 'unkn0wn'
    description = 'Inc42 is India’s largest tech media platform working with the mission to accelerate the GDP of India’s tech & startup economy.'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en_IN'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://omcdn.inc42.com/users/d0ffd8ffa0d2/images/4477fc48bee71659696918-color-black-1-e1576150264134.png?width=224'

    keep_only_tags = [
        classes('entry-header entry-content'),
    ]

    remove_tags = [
        dict(name='button'),
        classes('also-read slick-list slides-three common-card'),
    ]

    ignore_duplicate_articles = {'title'}
    remove_empty_feeds = True

    # Feed entries are Google News links rather than direct article URLs, so
    # each one is routed through get_obfuscated_article().
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        """Resolve a feed entry to the real article and save its HTML locally.

        The feed URL is opened first. If that raises, the ``location`` header
        carried by the exception is used as the page to inspect instead. The
        first anchor with an ``href`` on that page is taken as the article
        link; links into the skipped sections are aborted, everything else is
        downloaded into a persistent temporary ``.html`` file.

        :param url: URL of the feed entry.
        :return: Path of the temporary file holding the downloaded HTML.
        :raises Exception: Re-raises the original open error when it carries
            no ``location`` header to follow.
        """
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # Only errors that expose a redirect target are recoverable.
            # Anything else (timeouts, DNS failures, errors without a
            # ``location`` header) is re-raised unchanged instead of being
            # masked by an AttributeError or a ``None`` URL further down.
            headers = getattr(e, 'hdrs', None)
            location = headers.get('location') if headers is not None else None
            if not location:
                raise
            url = location

        soup = self.index_to_soup(url)
        link = soup.find('a', href=True)
        if link is None:
            # Without an anchor there is nothing to download; abort this
            # article with a clear reason rather than failing on ``None``.
            self.log('Aborting Article ', url)
            self.abort_article('no article link found')
        target = link['href']

        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/'
        ]
        if any(x in target for x in skip_sections):
            self.log('Aborting Article ', target)
            self.abort_article('skipping video links')

        self.log('Downloading ', target)
        html = br.open(target).read()

        pt = PersistentTemporaryFile('.html')
        try:
            pt.write(html)
        finally:
            # Close the handle even if the write fails so it is not leaked.
            pt.close()
        return pt.name

    feeds = []

    sections = [
        'features', 'buzz', 'startups', 'resources'
    ]

    # Bound before the loop so the catch-all feed below does not depend on
    # the loop body having run at least once.
    feed_url_template = 'https://news.google.com/rss/search?q=when:27h+allinurl:inc42.com{}&hl=en-IN&gl=IN&ceid=IN:en'

    for sec in sections:
        # '%2F' is a URL-encoded '/', so the query matches '/<section>/'.
        feeds.append((sec.capitalize(), feed_url_template.format('%2F' + sec + '%2F')))
    # Catch-all feed with no section filter appended to the query.
    feeds.append(('Others', feed_url_template.format('')))

    def preprocess_html(self, soup):
        """Copy every image's ``data-src`` attribute into ``src``.

        :param soup: Parsed article HTML.
        :return: The same soup object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup