From 0ebd840d6abdc8fc38398c275321fa140c37b774 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Jan 2023 13:38:33 +0530 Subject: [PATCH] Update India Today Outlook Magazine and Live Mint --- recipes/india_today.recipe | 15 ++++----------- recipes/livemint.recipe | 12 ++++++++---- recipes/outlook_india.recipe | 21 +++++++++++++++------ 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/recipes/india_today.recipe b/recipes/india_today.recipe index 33dfa40716..957187c015 100644 --- a/recipes/india_today.recipe +++ b/recipes/india_today.recipe @@ -63,11 +63,10 @@ class IndiaToday(BasicNewsRecipe): sections = {} date = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_edition__date')}) - edition = soup.find(attrs={'class':lambda x: x and x.startswith('MagazineEdition_magazineprime')}) - self.timefmt =' (' + self.tag_to_string(edition) + ') [' + self.tag_to_string(date).strip() + ']' - p = edition.findNext('p') - if p: - self.description = self.tag_to_string(p).strip() + edition = soup.find(attrs={'class':'prime__magazine'}) + self.timefmt = '(' + self.tag_to_string(edition).strip() +') [' + self.tag_to_string(date).strip() + ']' + if p := edition.findNext('p'): + self.description = self.tag_to_string(p) self.log('Downloading Issue: ', self.timefmt) for tag in soup.findAll('div', attrs={'class': lambda x: x and 'NoCard_story__grid__' in x}): @@ -125,11 +124,5 @@ class IndiaToday(BasicNewsRecipe): quo.name = 'blockquote' return soup - def populate_article_metadata(self, article, soup, first): - if first and hasattr(self, 'add_toc_thumbnail'): - image = soup.find('img', src=True, attrs={'class':'i-amphtml-fill-content'}) - if image is not None: - self.add_toc_thumbnail(article, image['src']) - def print_version(self, url): return url.replace('.in/','.in/amp/') diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index a112968816..520d9bd54b 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -24,9 +24,14 @@ class LiveMint(BasicNewsRecipe): remove_empty_feeds = True - if is_saturday: + def get_cover_url(self): + soup = self.index_to_soup( + 'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/' + ) + for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + return citem['content'] - cover_url = 'https://epsfs.hindustantimes.com/MINT/2022/04/16/Delhi/Delhi/5_01/bf867ea1_01_mr.jpg' + if is_saturday: keep_only_tags = [ dict(name='h1'), @@ -54,14 +59,13 @@ class LiveMint(BasicNewsRecipe): img['src'] = img['data-img'] return soup else: - # some wsj articles wont load + extra_css = ''' #img-cap {font-size:small; text-align:center;} #auth-info {font-size:small; text-align:center;} .highlights {font-style:italic;} .summary{font-style:italic; color:#404040;} ''' - cover_url = 'https://epsfs.hindustantimes.com/MINT/2022/04/05/Delhi/Delhi/5_01/1ec7ad14_01_mr.jpg' keep_only_tags = [ dict(name='h1'), diff --git a/recipes/outlook_india.recipe b/recipes/outlook_india.recipe index 8c4f5bb7e1..f88714f386 100644 --- a/recipes/outlook_india.recipe +++ b/recipes/outlook_india.recipe @@ -18,13 +18,17 @@ class outlook(BasicNewsRecipe): remove_attributes = ['height', 'width', 'style'] ignore_duplicate_articles = {'url'} resolve_internal_links = True - masthead_url = 'https://www.outlookindia.com/images/home_new_v4/logo_outlook.svg' + extra_css = ''' + .story-summary{font-style:italic; color:#202020;} + .author_wrapper, .relatedCategory{font-size:small; color:#404040;} + #figcap{font-size:small; text-align:center;} + ''' keep_only_tags = [classes('__story_detail')] remove_tags = [ classes( - 'social_sharing_article left_trending left-sticky __tag_links' - ' next_prev_stories downarrow uparrow more_from_author_links next prev __related_stories_thumbs' + 'social_sharing_article left_trending left-sticky __tag_links next_prev_stories ' + 'downarrow uparrow more_from_author_links next prev __related_stories_thumbs' ) ] @@ -33,8 +37,8 @@ class outlook(BasicNewsRecipe): div = soup.find('div', attrs={'class':'wrapper'}) a = div.find('a', href=lambda x: x and x.startswith('/magazine/issue/')) url = a['href'] - self.log('Downloading issue:', url) - self.timefmt = ' [' + self.tag_to_string(a) + ']' + self.timefmt = ' [' + self.tag_to_string(a.find('p')).strip() + ']' + self.log('Downloading issue:', url, self.timefmt) soup = self.index_to_soup('https://www.outlookindia.com' + url) cover = soup.find(**classes('listingPage_lead_story')) self.cover_url = cover.find('img', attrs={'src': True})['src'] @@ -42,7 +46,7 @@ class outlook(BasicNewsRecipe): for h3 in soup.findAll(['h3', 'h4'], attrs={'class': 'tk-kepler-std-condensed-subhead'}): - a = h3.find('a', href=lambda x: x) + a = h3.find('a', href=True) url = a['href'] title = self.tag_to_string(a) desc = '' @@ -55,6 +59,11 @@ class outlook(BasicNewsRecipe): ans.append({'title': title, 'url': url, 'description': desc}) return [('Articles', ans)] + def preprocess_html(self,soup): + for fig in soup.findAll('figure'): + fig['id'] = 'figcap' + return soup + def preprocess_raw_html(self, raw, *a): return raw m = re.search('.*?script.*?>', raw, flags=re.DOTALL)