calibre/recipes/india_today.recipe
2022-05-16 06:23:24 +05:30

69 lines
2.7 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from calibre.web.feeds.news import BasicNewsRecipe, classes
class IndiaToday(BasicNewsRecipe):
    """Calibre recipe that collects the articles of the India Today magazine.

    The article list is read from https://www.indiatoday.in/magazine and the
    cover image address is looked up on the magazine's Magzter page.
    """

    title = u'India Today Magazine'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    extra_css = '[itemprop^="description"] {font-size: small; font-style: italic;}'
    description = (
        "India's Most Reputed, Credible and Popular news magazine."
        ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.'
    )
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

    # Elements selected from each article page: the headline, the
    # kicker/right-hand story blocks and the article body.
    keep_only_tags = [
        dict(name='h1'),
        classes('story-kicker story-right'),
        dict(itemProp='articleBody'),
    ]

    def get_cover_url(self):
        """Return the cover image URL, or None when no cover is found.

        The cover is taken from the first ``<meta>`` tag on the Magzter page
        whose ``content`` attribute ends with ``view/3.jpg``.
        """
        soup = self.index_to_soup('https://www.magzter.com/IN/India-Today-Group/India-Today/News/')
        cover_meta = soup.find('meta', content=lambda s: s and s.endswith('view/3.jpg'))
        if cover_meta is None:
            return None
        return cover_meta['content']

    def parse_index(self):
        """Fetch the magazine landing page and return its parsed feed list."""
        soup = self.index_to_soup('https://www.indiatoday.in/magazine')
        return self.it_parse_index(soup)

    def it_parse_index(self, soup):
        """Build the list of ``(section name, articles)`` pairs from *soup*.

        Each ``div`` with class ``magazin-top-left`` or ``section-ordering`` is
        treated as one section, named after the text of its first ``span``.
        Every article is a dict with ``title`` and ``url`` keys; site-relative
        links are made absolute. Sections without any article are left out.
        """
        site_root = 'https://www.indiatoday.in'
        link_prefixes = ('/magazine/cover-story/story/', 'https://www.indiatoday.in/magazine/')
        feeds = []
        for section in soup.findAll('div', attrs={'class': ['magazin-top-left', 'section-ordering']}):
            secname = self.tag_to_string(section.find('span'))
            self.log(secname)
            articles = []
            for a in section.findAll('a', href=lambda x: x and x.startswith(link_prefixes)):
                title = self.tag_to_string(a)
                if not (title or '').strip():
                    # Anchors without visible text would only yield untitled
                    # entries, so they are skipped. The previous identity
                    # comparison (``title is " "``) never filtered them out.
                    continue
                url = a['href']
                if not url.startswith('https'):
                    url = site_root + url
                self.log('\t', title)
                self.log('\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((secname, articles))
        return feeds

    def preprocess_raw_html(self, raw_html, url):
        """Clean the downloaded article HTML and return it as a string.

        All ``<script>`` and ``<style>`` elements are removed, and every
        ``<img>`` carrying a ``data-src`` attribute gets that value copied
        into ``src``. The *url* argument is not used.
        """
        from calibre.ebooks.BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(raw_html)
        for unwanted in soup.findAll(['script', 'style']):
            unwanted.extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return str(soup)