# calibre/recipes/india_today.recipe
# Repository viewer metadata: 2024-07-22 11:08:00 +05:30 - 135 lines - 5.0 KiB - Python
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional, the warning can safely be ignored.

from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a tag-search spec that matches elements carrying any given class.

    Args:
        classes: Whitespace-separated class names to look for.

    Returns:
        A dict of the form ``{'attrs': {'class': predicate}}``. The predicate
        receives an element's ``class`` attribute value as a string (or a
        falsy value when it is absent). It returns that falsy value unchanged,
        or otherwise the frozenset of wanted class names found in it, which is
        empty (falsy) when none match.
    """
    # Split on any whitespace, exactly like the candidate value below, so the
    # wanted set never contains empty strings from doubled separators.
    q = frozenset(classes.split())
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
def new_tag(soup, name, attrs=()):
    """Create a new tag called ``name`` that belongs to ``soup``.

    Args:
        soup: The parsed document. If it exposes a ``new_tag`` method, that
            method is used to build the tag.
        name: Tag name, e.g. ``'body'``.
        attrs: Attributes as a mapping or an iterable of ``(key, value)``
            pairs. Defaults to no attributes.

    Returns:
        The newly created tag object.
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    # Fallback for soup objects without ``new_tag``: construct the Tag
    # directly, passing None instead of an empty attribute collection.
    return Tag(soup, name, attrs=attrs or None)
class IndiaToday(BasicNewsRecipe):
    """Calibre news recipe for the India Today magazine.

    The issue index is read from ``https://www.indiatoday.in/magazine``
    (optionally suffixed with an edition date) and article URLs are rewritten
    to their ``/amp/`` form by :meth:`print_version`.
    """

    title = u'India Today Magazine'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    description = (
        'Indias Most Reputed, Credible and Popular news magazine.'
        ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.'
    )
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'
    extra_css = '''
        #sub-d {font-style:italic; color:#202020;}
        .story__byline {font-size:small; text-align:left;}
        .body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;}
        blockquote{color:#404040;}
    '''

    # Elements stripped from every article: sharing/checkout widgets, ads,
    # push-notification widgets and the block marked amp-access="NOT granted".
    remove_tags = [
        classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'),
        dict(name=('amp-web-push-widget', 'amp-ad')),
        dict(attrs={'id':'tab-link-wrapper-plugin'}),
        dict(name='div', attrs={'amp-access':'NOT granted'})
    ]

    def preprocess_raw_html(self, raw_html, url):
        """Return ``raw_html`` with every em dash replaced by ``--``."""
        return raw_html.replace('—', '--')

    # Declares the user-facing 'date' option. The methods below only honour
    # the value when ``recipe_specific_options.get('date')`` is a non-empty
    # str -- presumably calibre substitutes the user's input at run time
    # (TODO confirm against the BasicNewsRecipe documentation).
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (DD-MM-YYYY format)',
            'long': 'For example, 22-07-2024'
        }
    }

    def get_cover_url(self):
        """Return the cover image URL of the current issue, or None.

        No lookup is attempted when a specific edition date was requested;
        None is also returned when the listing page has no matching
        ``<meta>`` tag.
        """
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            return None
        soup = self.index_to_soup(
            'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
        )
        for citem in soup.findAll(
            'meta', content=lambda s: s and s.endswith('/magazine/300/new')
        ):
            # Only the first match is used.
            # NOTE(review): this swaps every '300' in the URL for '600', not
            # just the trailing size segment -- confirm that is intended.
            return citem['content'].replace('300', '600')
        return None

    def parse_index(self):
        """Build the list of sections and articles for the issue.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each article
            is a dict with ``title``, ``url`` and ``description`` keys. Known
            section titles are ordered first (see ``sort_key``); the rest keep
            their page order after them.
        """
        issue = 'https://www.indiatoday.in/magazine'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            issue = issue + '/' + d
        soup = self.index_to_soup(issue)

        sections = {}
        for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}):
            sec = tag.find('div', attrs={'class': lambda x: x and 'NoCard_header__nav__' in x})
            section = self.tag_to_string(sec).strip()
            self.log(section)
            sections[section] = []
            for art in tag.findAll('article'):
                title = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_articletitle__' in x})).strip()
                link = art.find('a', href=True, title=True)
                if link is None:
                    # Without a link there is nothing to download; skip the
                    # card instead of failing the whole issue with TypeError.
                    self.log('\tSkipping article without a link:', title)
                    continue
                url = link['href']
                if url.startswith('/'):
                    url = 'https://www.indiatoday.in' + url
                desc = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_story__shortcont__' in x})).strip()
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                sections[section].append({'title': title, 'url': url, 'description': desc})

        def sort_key(x):
            # x is a (section_title, articles) pair; rank by the title's
            # position in the preferred order, unknown titles go last.
            section = x[0]
            try:
                return (
                    'Editor\'s Note', 'Cover Story', 'The Big Story', 'Upfront',
                    'NATION', 'INTERVIEW'
                ).index(section)
            except ValueError:
                return 99999999

        return sorted(sections.items(), key=sort_key)

    def preprocess_html(self, soup):
        """Reduce the article to its title, kicker, byline, lead media and text.

        The body is rebuilt from the selected elements only. ``amp-img`` tags
        without a nested ``img`` are renamed to ``img``, the first ``h2``
        becomes the ``#sub-d`` paragraph, and elements with class ``quotes``
        become blockquotes. Returns the modified ``soup``.
        """
        # Use the amp-access="granted" container for the article text when the
        # page has one; otherwise fall back to the 'description' div.
        if soup.find('div', attrs={'amp-access':'granted'}) is not None:
            content_spec = dict(name='div', attrs={'amp-access':'granted'})
        else:
            content_spec = dict(name='div', attrs={'class':'description'})
        keep_only_tags = [
            classes('strytitle strykicker story__byline srtymos'),
            content_spec,
        ]

        old_body = soup.find('body')
        body = new_tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in old_body.findAll(**spec):
                # Append at the end, moving the tag into the new body.
                body.insert(len(body.contents), tag)
        old_body.replaceWith(body)

        for img in soup.findAll('amp-img'):
            if not img.find('img'):
                img.name = 'img'
        h2 = soup.find('h2')
        if h2:
            # Styled through the '#sub-d' rule in extra_css.
            h2.name = 'p'
            h2['id'] = 'sub-d'
        for quo in soup.findAll(attrs={'class':'quotes'}):
            quo.name = 'blockquote'
        return soup

    def print_version(self, url):
        """Return ``url`` with each ``.in/`` replaced by ``.in/amp/``."""
        return url.replace('.in/','.in/amp/')