calibre/recipes/inc42.recipe
un-pogaz 8d28380515 add/remove blank-line (extra-edit)
ruff 'E302,E303,E304,E305,W391'
2025-01-24 11:14:21 +01:00

54 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes
class inc42(BasicNewsRecipe):
    """Calibre news recipe for Inc42 (https://inc42.com/).

    The feed list is built by ``parse_index``, which scans the site's home
    page for links into a fixed set of sections instead of reading RSS feeds.
    """

    title = 'Inc42'
    __author__ = 'unkn0wn'
    description = (
        "Inc42 is India's largest tech media platform working with the mission "
        "to accelerate the GDP of India's tech & startup economy."
    )
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en_IN'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://omcdn.inc42.com/users/d0ffd8ffa0d2/images/4477fc48bee71659696918-color-black-1-e1576150264134.png?width=224'

    # Tag filters applied to downloaded article pages; the selectors are CSS
    # class names taken from the site's markup.
    keep_only_tags = [
        classes('entry-header entry-content'),
    ]
    remove_tags = [
        dict(name='button'),
        classes('also-read slick-list slides-three common-card'),
    ]

    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True

    def parse_index(self):
        """Build the feed list from section links found on the home page.

        For every section slug, each ``<a>`` on the home page whose ``href``
        starts with ``https://inc42.com/<slug>/`` is treated as an article
        link; the link to the section landing page itself is skipped.

        The same article is commonly linked more than once (for example from
        a thumbnail and from a headline), so anchors are collapsed per URL:
        the first-seen order is kept and the first non-empty link text is
        used as the title.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts. Sections with
            no matching links are omitted.
        """
        index = 'https://inc42.com/'
        sections = [
            'features', 'buzz', 'startups', 'resources',
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)

            prefix = index + sec + '/'
            # url -> title; dicts keep insertion order, so articles stay in
            # the order they first appear on the page.
            titles_by_url = {}
            # ``prefix`` is bound as a default so the filter does not depend
            # on the loop variable at call time.
            for a in soup.findAll('a', attrs={'href': lambda href, prefix=prefix: href and href.startswith(prefix)}):
                url = a['href']
                if url == prefix:
                    # Link to the section landing page, not an article.
                    continue
                title = self.tag_to_string(a).strip()
                # Record a new URL, or upgrade an entry whose earlier anchor
                # carried no text (e.g. an image-only link).
                if not titles_by_url.get(url):
                    titles_by_url[url] = title

            articles = []
            for url, title in titles_by_url.items():
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_html(self, soup):
        """Copy every image's ``data-src`` attribute into ``src``.

        Only ``<img>`` tags that carry a ``data-src`` attribute are touched.
        Presumably the site lazy-loads images and keeps the real URL there.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup