Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00.
Commit 2bb6bb47e0 (parent f4d63adc07): "Update India Today Magazine".
This commit is contained in:
@ -1,4 +1,19 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import Tag
|
||||||
|
|
||||||
|
|
||||||
|
def classes(classes):
    """Return a BeautifulSoup ``attrs`` matcher for tags whose ``class``
    attribute contains any of the given space-separated CSS class names.

    Usable directly as a spec for ``findAll(**spec)`` / ``remove_tags``.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # value is the tag's class string (or None); truthy result means
        # at least one wanted class is present.
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matches})
def new_tag(soup, name, attrs=()):
    """Create a new tag for *soup*, working with both BeautifulSoup
    generations: prefer the soup's own ``new_tag`` factory (bs4) and fall
    back to constructing a ``Tag`` directly (bs3).
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # bs3 path: Tag() treats an empty attrs sequence as None.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
class IndiaToday(BasicNewsRecipe):
|
class IndiaToday(BasicNewsRecipe):
|
||||||
@ -16,11 +31,19 @@ class IndiaToday(BasicNewsRecipe):
|
|||||||
masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'
|
masthead_url = 'https://akm-img-a-in.tosshub.com/sites/all/themes/itg/logo.png'
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.body_caption{font-size:small;}
|
#sub-d {font-style:italic; color:#202020;}
|
||||||
.image-alt{font-size:small;}
|
.story__byline {font-size:small; text-align:left;}
|
||||||
[itemprop^="description"] {font-size: small; font-style: italic;}
|
.body_caption, .mos__alt {font-size:small; text-align:center;}
|
||||||
|
blockquote{color:#404040;}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
remove_tags = [
    # Social/sharing widgets, ads and subscription prompts.
    classes('checkout__section sharing align-center-button amp-izooto-sub ads__container inline-story-add amp-ad'),
    # AMP-specific elements with no article content.
    # (Fixed: the original wrapped this tuple in a redundant extra pair of
    # parentheses — dict(name=((...))) — which is the same value, just noisy.)
    dict(name=('amp-web-push-widget', 'amp-ad')),
    dict(attrs={'id': 'tab-link-wrapper-plugin'}),
    # Paywalled placeholder body shown when access is not granted.
    dict(name='div', attrs={'amp-access': 'NOT granted'}),
]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
|
'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
|
||||||
@ -30,58 +53,40 @@ class IndiaToday(BasicNewsRecipe):
|
|||||||
):
|
):
|
||||||
return citem['content'].replace('300', '600')
|
return citem['content'].replace('300', '600')
|
||||||
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(name='h1'),
|
|
||||||
classes('story-kicker story-right'),
|
|
||||||
dict(itemProp='articleBody'),
|
|
||||||
]
|
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('https://www.indiatoday.in/magazine')
|
soup = self.index_to_soup('https://www.indiatoday.in/magazine')
|
||||||
|
|
||||||
section = None
|
section = None
|
||||||
sections = {}
|
sections = {}
|
||||||
|
|
||||||
for tag in soup.findAll(
|
date = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_edition__date')})
|
||||||
'div', attrs={'class': ['magazin-top-left', 'section-ordering']}
|
edition = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_magazineprime')})
|
||||||
):
|
self.timefmt =' (' + self.tag_to_string(edition) + ') [' + self.tag_to_string(date).strip() + ']'
|
||||||
sec = tag.find('span')
|
p = edition.findNext('p')
|
||||||
section = self.tag_to_string(sec)
|
if p:
|
||||||
|
self.description = self.tag_to_string(p).strip()
|
||||||
|
self.log('Downloading Issue: ', self.timefmt)
|
||||||
|
|
||||||
|
for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}):
|
||||||
|
sec = tag.find('div', attrs={'class': lambda x: x and 'NoCard_header__nav__' in x})
|
||||||
|
section = self.tag_to_string(sec).strip()
|
||||||
self.log(section)
|
self.log(section)
|
||||||
sections[section] = []
|
sections[section] = []
|
||||||
|
|
||||||
for a in tag.findAll(
|
for art in tag.findAll('article'):
|
||||||
'a',
|
title = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_articletitle__' in x})).strip()
|
||||||
href=lambda x: x and x.startswith((
|
url = art.find('a', href=True, title=True)['href']
|
||||||
"/magazine/cover-story/story/",
|
if url.startswith('/'):
|
||||||
"https://www.indiatoday.in/magazine/"
|
|
||||||
))
|
|
||||||
):
|
|
||||||
url = a['href']
|
|
||||||
if url.startswith('https'):
|
|
||||||
url = url
|
|
||||||
else:
|
|
||||||
url = 'https://www.indiatoday.in' + url
|
url = 'https://www.indiatoday.in' + url
|
||||||
title = self.tag_to_string(a).strip()
|
desc = self.tag_to_string(art.find(attrs={'class':lambda x: x and 'NoCard_story__shortcont__' in x})).strip()
|
||||||
try:
|
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
|
||||||
desc = self.tag_to_string(a.findParent(
|
|
||||||
'span', attrs={'class':'field-content'}).findNext(
|
|
||||||
'div', attrs={'class':'views-field'})).strip()
|
|
||||||
except Exception:
|
|
||||||
desc = self.tag_to_string(a.findParent(
|
|
||||||
('h3','p')).findNext('span', attrs={'class':'kicket-text'})).strip()
|
|
||||||
if not url or not title:
|
|
||||||
continue
|
|
||||||
self.log('\t', title)
|
|
||||||
self.log('\t', desc)
|
|
||||||
self.log('\t\t', url)
|
|
||||||
sections[section].append({'title': title, 'url': url, 'description': desc})
|
sections[section].append({'title': title, 'url': url, 'description': desc})
|
||||||
|
|
||||||
def sort_key(x):
|
def sort_key(x):
|
||||||
section = x[0]
|
section = x[0]
|
||||||
try:
|
try:
|
||||||
return (
|
return (
|
||||||
'EDITOR\'S NOTE', 'Cover Story', 'The Big Story', 'Upfront',
|
'Editor\'s Note', 'Cover Story', 'The Big Story', 'Upfront',
|
||||||
'NATION', 'INTERVIEW'
|
'NATION', 'INTERVIEW'
|
||||||
).index(section)
|
).index(section)
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -89,24 +94,39 @@ class IndiaToday(BasicNewsRecipe):
|
|||||||
|
|
||||||
return sorted(sections.items(), key=sort_key)
|
return sorted(sections.items(), key=sort_key)
|
||||||
|
|
||||||
def preprocess_html(self, soup):
    """Reduce the fetched AMP page to the article content and normalise
    AMP markup to plain HTML.

    Keeps title/kicker/byline/media plus either the full body (when the
    page grants access) or the teaser description (paywalled), then fixes
    up images, the sub-heading and pull-quotes.
    """
    # Title/kicker/byline/media block is wanted in both cases.
    # (Fixed: the original duplicated this spec in both branches.)
    common = classes('strytitle strykicker story__byline srtymos')
    if soup.find('div', attrs={'amp-access': 'granted'}) is not None:
        # Full article body is available.
        keep_only_tags = [
            common,
            dict(name='div', attrs={'amp-access': 'granted'}),
        ]
    else:
        # Paywalled page: only the short description is present.
        keep_only_tags = [
            common,
            dict(name='div', attrs={'class': 'description'}),
        ]
    # Rebuild <body> containing only the matched tags, preserving order.
    body = new_tag(soup, 'body')
    for spec in keep_only_tags:
        for tag in soup.find('body').findAll(**spec):
            body.insert(len(body.contents), tag)
    soup.find('body').replaceWith(body)

    # Turn <amp-img> wrappers without a rendered inner <img> into real images.
    for img in soup.findAll('amp-img'):
        if not img.find('img'):
            img.name = 'img'
    # First <h2> is the article sub-heading; restyle it via the #sub-d CSS rule.
    h2 = soup.find('h2')
    if h2:
        h2.name = 'p'
        h2['id'] = 'sub-d'
    # Render pull-quotes as blockquotes.
    for quo in soup.findAll(attrs={'class': 'quotes'}):
        quo.name = 'blockquote'
    return soup
|
||||||
|
|
||||||
|
def populate_article_metadata(self, article, soup, first):
    """Use the lead AMP image of the article's first page as its TOC
    thumbnail, when thumbnail support is available."""
    if not first or not hasattr(self, 'add_toc_thumbnail'):
        return
    image = soup.find('img', src=True, attrs={'class': 'i-amphtml-fill-content'})
    if image is not None:
        self.add_toc_thumbnail(article, image['src'])
|
||||||
|
|
||||||
|
def print_version(self, url):
    """Rewrite an article URL to its lightweight AMP variant, which is
    easier to clean up than the full page."""
    amp_url = url.replace('.in/', '.in/amp/')
    return amp_url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user