Update India Today, Outlook Magazine and Live Mint

This commit is contained in:
Kovid Goyal 2023-01-08 13:38:33 +05:30
parent 8114376b2f
commit 0ebd840d6a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 27 additions and 21 deletions

View File

@ -63,11 +63,10 @@ class IndiaToday(BasicNewsRecipe):
sections = {}
date = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_edition__date')})
edition = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_magazineprime')})
self.timefmt =' (' + self.tag_to_string(edition) + ') [' + self.tag_to_string(date).strip() + ']'
p = edition.findNext('p')
if p:
self.description = self.tag_to_string(p).strip()
edition = soup.find(attrs={'class':'prime__magazine'})
self.timefmt = '(' + self.tag_to_string(edition).strip() +') [' + self.tag_to_string(date).strip() + ']'
if p := edition.findNext('p'):
self.description = self.tag_to_string(p)
self.log('Downloading Issue: ', self.timefmt)
for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}):
@ -125,11 +124,5 @@ class IndiaToday(BasicNewsRecipe):
quo.name = 'blockquote'
return soup
def populate_article_metadata(self, article, soup, first):
    """Register a table-of-contents thumbnail for ``article``.

    Nothing happens unless ``first`` is truthy and this object exposes an
    ``add_toc_thumbnail`` attribute. In that case the first ``<img>`` that
    has a ``src`` attribute and the class ``i-amphtml-fill-content`` is
    looked up in ``soup`` and, when found, its ``src`` is handed to
    ``add_toc_thumbnail`` together with ``article``.

    Always returns ``None``.
    """
    # Guard clause: only the first page is inspected, and only when the
    # thumbnail hook is available on this object.
    if not (first and hasattr(self, 'add_toc_thumbnail')):
        return
    image = soup.find('img', src=True, attrs={'class': 'i-amphtml-fill-content'})
    if image is not None:
        self.add_toc_thumbnail(article, image['src'])
def print_version(self, url):
    """Return ``url`` with ``amp/`` inserted after its first ``.in/``.

    A URL that contains no ``.in/`` is returned unchanged.
    """
    # Rewrite only the first '.in/' so that a later '.in/' inside the path
    # or query string is left untouched.
    return url.replace('.in/', '.in/amp/', 1)

View File

@ -24,9 +24,14 @@ class LiveMint(BasicNewsRecipe):
remove_empty_feeds = True
if is_saturday:
def get_cover_url(self):
    """Return the cover image URL found on the Magzter listing page.

    Fetches the Mint (Mumbai) newspaper page through ``index_to_soup`` and
    returns the ``content`` attribute of the first ``<meta>`` tag whose
    ``content`` ends with ``view/3.jpg``. Returns ``None`` when no such
    tag is present.
    """
    def is_cover_image(value):
        # Tags without a usable ``content`` value must not match.
        return bool(value) and value.endswith('view/3.jpg')

    soup = self.index_to_soup(
        'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/'
    )
    # Only the first matching tag matters; later matches are ignored.
    first_match = next(iter(soup.findAll('meta', content=is_cover_image)), None)
    if first_match is None:
        return None
    return first_match['content']
cover_url = 'https://epsfs.hindustantimes.com/MINT/2022/04/16/Delhi/Delhi/5_01/bf867ea1_01_mr.jpg'
if is_saturday:
keep_only_tags = [
dict(name='h1'),
@ -54,14 +59,13 @@ class LiveMint(BasicNewsRecipe):
img['src'] = img['data-img']
return soup
else:
# some wsj articles won't load
extra_css = '''
#img-cap {font-size:small; text-align:center;}
#auth-info {font-size:small; text-align:center;}
.highlights {font-style:italic;}
.summary{font-style:italic; color:#404040;}
'''
cover_url = 'https://epsfs.hindustantimes.com/MINT/2022/04/05/Delhi/Delhi/5_01/1ec7ad14_01_mr.jpg'
keep_only_tags = [
dict(name='h1'),

View File

@ -18,13 +18,17 @@ class outlook(BasicNewsRecipe):
remove_attributes = ['height', 'width', 'style']
ignore_duplicate_articles = {'url'}
resolve_internal_links = True
masthead_url = 'https://www.outlookindia.com/images/home_new_v4/logo_outlook.svg'
extra_css = '''
.story-summary{font-style:italic; color:#202020;}
.author_wrapper, .relatedCategory{font-size:small; color:#404040;}
#figcap{font-size:small; text-align:center;}
'''
keep_only_tags = [classes('__story_detail')]
remove_tags = [
classes(
'social_sharing_article left_trending left-sticky __tag_links'
' next_prev_stories downarrow uparrow more_from_author_links next prev __related_stories_thumbs'
'social_sharing_article left_trending left-sticky __tag_links next_prev_stories '
'downarrow uparrow more_from_author_links next prev __related_stories_thumbs'
)
]
@ -33,8 +37,8 @@ class outlook(BasicNewsRecipe):
div = soup.find('div', attrs={'class':'wrapper'})
a = div.find('a', href=lambda x: x and x.startswith('/magazine/issue/'))
url = a['href']
self.log('Downloading issue:', url)
self.timefmt = ' [' + self.tag_to_string(a) + ']'
self.timefmt = ' [' + self.tag_to_string(a.find('p')).strip() + ']'
self.log('Downloading issue:', url, self.timefmt)
soup = self.index_to_soup('https://www.outlookindia.com' + url)
cover = soup.find(**classes('listingPage_lead_story'))
self.cover_url = cover.find('img', attrs={'src': True})['src']
@ -42,7 +46,7 @@ class outlook(BasicNewsRecipe):
for h3 in soup.findAll(['h3', 'h4'],
attrs={'class': 'tk-kepler-std-condensed-subhead'}):
a = h3.find('a', href=lambda x: x)
a = h3.find('a', href=True)
url = a['href']
title = self.tag_to_string(a)
desc = ''
@ -55,6 +59,11 @@ class outlook(BasicNewsRecipe):
ans.append({'title': title, 'url': url, 'description': desc})
return [('Articles', ans)]
def preprocess_html(self, soup):
    """Set ``id="figcap"`` on every ``<figure>`` in ``soup`` and return it.

    The tree is modified in place; the returned object is the same
    ``soup`` that was passed in.
    """
    for figure in soup.findAll('figure'):
        # Any existing id on the figure is overwritten.
        figure['id'] = 'figcap'
    return soup
def preprocess_raw_html(self, raw, *a):
return raw
m = re.search('<!-- NewsArticle Schema -->.*?script.*?>', raw, flags=re.DOTALL)