calibre/recipes/inc42.recipe
2023-07-23 13:09:38 +05:30

66 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe, classes
class inc42(BasicNewsRecipe):
    '''
    Calibre recipe for Inc42 (inc42.com), an Indian tech/startup news site.

    Articles are discovered through Google News RSS searches (one feed per
    section), which return interstitial redirect pages — hence
    ``articles_are_obfuscated`` and the custom ``get_obfuscated_article``.
    '''
    title = 'Inc42'
    __author__ = 'unkn0wn'
    description = 'Inc42 is Indias largest tech media platform working with the mission to accelerate the GDP of Indias tech & startup economy.'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en_IN'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://omcdn.inc42.com/users/d0ffd8ffa0d2/images/4477fc48bee71659696918-color-black-1-e1576150264134.png?width=224'

    keep_only_tags = [
        classes('entry-header entry-content'),
    ]

    remove_tags = [
        dict(name='button'),
        classes('also-read slick-list slides-three common-card'),
    ]

    ignore_duplicate_articles = {'title'}
    remove_empty_feeds = True
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        '''
        Resolve a Google News entry to the real Inc42 article.

        Follows the redirect (the target URL may only be present in the
        error response's Location header), grabs the first link on the
        interstitial page, skips video/media sections, and returns the
        article HTML saved to a temporary file.
        '''
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # Google News sometimes answers with a redirect error whose
            # Location header carries the actual article URL.
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        link = soup.find('a', href=True)
        if link is None:
            # Interstitial page had no link at all — abort cleanly instead
            # of crashing on link['href'] below.
            self.abort_article('no article link found')
        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.log('Aborting Article ', link['href'])
            self.abort_article('skipping video links')
        self.log('Downloading ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name

    # Build one Google News RSS feed per section (articles from the last
    # 27 hours, restricted to inc42.com URLs), plus a catch-all feed.
    # The URL template is loop-invariant, so it is defined once.
    _gnews_fmt = 'https://news.google.com/rss/search?q=when:27h+allinurl:inc42.com{}&hl=en-IN&gl=IN&ceid=IN:en'
    sections = [
        'features', 'buzz', 'startups', 'resources'
    ]
    feeds = []
    for sec in sections:
        feeds.append((sec.capitalize(), _gnews_fmt.format('%2F' + sec + '%2F')))
    feeds.append(('Others', _gnews_fmt.format('')))

    def preprocess_html(self, soup):
        # Images are lazy-loaded via data-src; copy it into src so calibre
        # actually downloads them.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup