mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
135 lines
5.0 KiB
Python
135 lines
5.0 KiB
Python
|
||
from calibre.ebooks.BeautifulSoup import Tag
|
||
from calibre.web.feeds.news import BasicNewsRecipe
|
||
|
||
|
||
def classes(classes):
    """Build a keyword spec that matches tags carrying any of the given classes.

    *classes* is a string of class names separated by single spaces. The
    result is a dict of the form ``{'attrs': {'class': predicate}}``; the
    predicate receives a tag's ``class`` attribute value as a string and
    returns a truthy value when at least one of its whitespace-separated
    names is among the requested ones.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # A missing/empty attribute is handed back unchanged (falsy), which
        # mirrors the short-circuit behaviour of ``value and ...``.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
|
||
|
||
|
||
def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    When *soup* exposes a ``new_tag`` method it is used, receiving *attrs*
    converted to a dict. Otherwise the tag is built directly with the ``Tag``
    constructor, passing ``None`` when *attrs* is empty.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory method on the soup object: construct the Tag by hand.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
|
||
|
||
|
||
class IndiaToday(BasicNewsRecipe):
    """Recipe that builds an e-book from the India Today magazine website.

    The issue index is read from https://www.indiatoday.in/magazine (or a
    dated sub-path when the ``date`` option is supplied) and each article is
    fetched through its ``/amp/`` variant.
    """

    title = u'India Today Magazine'
    language = 'en_IN'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    description = (
        'India’s Most Reputed, Credible and Popular news magazine.'
        ' Read the most preferred magazine of 9.5 million Indians to access highly researched and unbiased content.'
    )
    masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'

    extra_css = '''
        #sub-d {font-style:italic; color:#202020;}
        .story__byline {font-size:small; text-align:left;}
        .body_caption, .mos__alt .caption, .caption-drupal-entity {font-size:small; text-align:center;}
        blockquote{color:#404040;}
    '''

    remove_tags = [
        classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad readmore__box'),
        dict(name=(('amp-web-push-widget', 'amp-ad'))),
        dict(attrs={'id':'tab-link-wrapper-plugin'}),
        dict(name='div', attrs={'amp-access':'NOT granted'})
    ]

    def preprocess_raw_html(self, raw_html, url):
        """Return *raw_html* with every em dash replaced by two hyphens."""
        return raw_html.replace('—', '--')

    # Declares the user-facing 'date' option with its short and long help.
    # NOTE(review): presumably the framework swaps this attribute for the
    # user-supplied values on the instance, so that .get('date') yields a
    # string at runtime -- confirm against BasicNewsRecipe.
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (DD-MM-YYYY format)',
            'long': 'For example, 22-07-2024'
        }
    }

    def get_cover_url(self):
        """Return the cover image URL of the current issue, or ``None``.

        The cover is only looked up when no ``date`` string option is set;
        it is scraped from the magazine's readwhere.com page. ``None`` is
        returned when a date was requested or no matching ``<meta>`` tag is
        found.
        """
        d = self.recipe_specific_options.get('date')
        if not (d and isinstance(d, str)):
            soup = self.index_to_soup(
                'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
            )
            for citem in soup.findAll(
                'meta', content=lambda s: s and s.endswith('/magazine/300/new')
            ):
                # Swap the '300' in the thumbnail URL for '600'.
                return citem['content'].replace('300', '600')

    def parse_index(self):
        """Return the issue's articles as a sorted list of (section, articles).

        Each article is a dict with ``title``, ``url`` and ``description``
        keys. Sections named in the preferred order come first; all others
        follow, keeping their relative page order (``sorted`` is stable).
        """
        issue = 'https://www.indiatoday.in/magazine'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            issue = issue + '/' + d
        soup = self.index_to_soup(issue)

        sections = {}

        for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}):
            sec = tag.find('div', attrs={'class': lambda x: x and 'NoCard_header__nav__' in x})
            section = self.tag_to_string(sec).strip()
            self.log(section)
            sections[section] = []

            for art in tag.findAll('article'):
                title = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_articletitle__' in x})).strip()
                link = art.find('a', href=True, title=True)
                if link is None:
                    # An article card without a usable link cannot be
                    # downloaded; skip it instead of aborting the whole index.
                    continue
                url = link['href']
                if url.startswith('/'):
                    url = 'https://www.indiatoday.in' + url
                desc = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_story__shortcont__' in x})).strip()
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                sections[section].append({'title': title, 'url': url, 'description': desc})

        section_order = (
            'Editor\'s Note', 'Cover Story', 'The Big Story', 'Upfront',
            'NATION', 'INTERVIEW'
        )

        def sort_key(x):
            # x is a (section name, articles) pair; unknown sections sort last.
            try:
                return section_order.index(x[0])
            except ValueError:
                return 99999999

        return sorted(sections.items(), key=sort_key)

    def preprocess_html(self, soup):
        """Reduce the article page to its story content and normalise tags.

        The ``<body>`` is rebuilt from the headline/kicker/byline/lead-image
        elements plus the story container: the ``amp-access="granted"`` div
        when present, otherwise the div with class ``description``.
        """
        if soup.find('div', attrs={'amp-access':'granted'}) is not None:
            content_spec = dict(name='div', attrs={'amp-access':'granted'})
        else:
            content_spec = dict(name='div', attrs={'class':'description'})
        keep_only_tags = [
            classes('strytitle strykicker story__byline srtymos'),
            content_spec,
        ]

        body = new_tag(soup, 'body')
        old_body = soup.find('body')
        for spec in keep_only_tags:
            for tag in old_body.findAll(**spec):
                # Appending moves the tag out of the old body into the new one.
                body.insert(len(body.contents), tag)
        old_body.replaceWith(body)

        for img in soup.findAll('amp-img'):
            # Rename <amp-img> to <img> unless it already wraps a real <img>.
            if not img.find('img'):
                img.name = 'img'
        h2 = soup.find('h2')
        if h2:
            # The first <h2> becomes a paragraph styled by the #sub-d CSS rule.
            h2.name = 'p'
            h2['id'] = 'sub-d'
        for quo in soup.findAll(attrs={'class':'quotes'}):
            quo.name = 'blockquote'
        return soup

    def print_version(self, url):
        """Return the AMP variant of *url* by inserting ``amp/`` after ``.in/``."""
        return url.replace('.in/','.in/amp/')
|