From 02e3ba7588a09da436a908322e01d0c6f738523c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:12:11 +0530 Subject: [PATCH 1/5] Update hindu.recipe --- recipes/hindu.recipe | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/recipes/hindu.recipe b/recipes/hindu.recipe index 442d6dba2b..c5e0c07801 100644 --- a/recipes/hindu.recipe +++ b/recipes/hindu.recipe @@ -1,7 +1,7 @@ import json import re from collections import defaultdict -from datetime import datetime +from datetime import date from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -26,9 +26,10 @@ class TheHindu(BasicNewsRecipe): extra_css = ''' .caption {font-size:small; text-align:center;} - .author {font-size:small; font-weight:bold;} + .author, .dateLine {font-size:small; font-weight:bold;} .subhead, .subhead_lead {font-weight:bold;} img {display:block; margin:0 auto;} + .italic {font-style:italic; color:#202020;} ''' ignore_duplicate_articles = {'url'} @@ -52,20 +53,22 @@ class TheHindu(BasicNewsRecipe): BasicNewsRecipe.__init__(self, *args, **kwargs) if self.output_profile.short_name.startswith('kindle'): if not past_edition: - self.title = 'The Hindu ' + datetime.today().strftime('%b %d, %Y') + self.title = 'The Hindu ' + date.today().strftime('%b %d, %Y') def parse_index(self): + global local_edition if local_edition or past_edition: if local_edition is None: local_edition = 'th_chennai' - today = datetime.today().strftime('%Y-%m-%d') + today = date.today().strftime('%Y-%m-%d') if past_edition: today = past_edition self.log('Downloading past edition of', local_edition + ' from ' + today) url = absurl('/todays-paper/' + today + '/' + local_edition + '/') else: url = 'https://www.thehindu.com/todays-paper/' + raw = self.index_to_soup(url, raw=True) soup = self.index_to_soup(raw) ans = self.hindu_parse_index(soup) @@ -83,8 +86,8 @@ class TheHindu(BasicNewsRecipe): if not 
self.tag_to_string(script).strip().startswith('let grouped_articles = {}'): continue if script is not None: - art = re.search(r'grouped_articles = ({\"[^<]+?]})', self.tag_to_string(script)) - data = json.loads(art.group(1)) + art = re.search(r'grouped_articles = ({\".*)', self.tag_to_string(script)) + data = json.JSONDecoder().raw_decode(art.group(1))[0] feeds_dict = defaultdict(list) From bdc37ef07830d98216771a056da94dc556667c99 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:27:30 +0530 Subject: [PATCH 2/5] Update psych.recipe --- recipes/psych.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/psych.recipe b/recipes/psych.recipe index 12eeacd4a5..235489565c 100644 --- a/recipes/psych.recipe +++ b/recipes/psych.recipe @@ -42,7 +42,7 @@ class PsychologyToday(BasicNewsRecipe): self.cover_url = absurl(a.img['src']) soup = self.index_to_soup(absurl(a['href'])) articles = [] - for article in soup.find('div', role='article').findAll('article'): + for article in soup.findAll('div', attrs={'class':'article-text'}): title = self.tag_to_string(article.find(['h2','h3'])).strip() url = absurl(article.find(['h2','h3']).a['href']) self.log('\n', title, 'at', url) From 19a332f74d962bd21baae87a264586e21c5cd878 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:33:04 +0530 Subject: [PATCH 3/5] Update livemint.recipe --- recipes/livemint.recipe | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index e63471a28f..4b990afd37 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -21,16 +21,17 @@ class LiveMint(BasicNewsRecipe): remove_empty_feeds = True resolve_internal_links = True - def get_cover_url(self): - soup = self.index_to_soup( - 'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/' - ) - for 
citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): - return citem['content'] - if is_saturday: + + def get_cover_url(self): + soup = self.index_to_soup('https://lifestyle.livemint.com/') + self.title = 'Mint Lounge' + if citem := soup.find('div', attrs={'class':'headLatestIss_cover'}): + return citem.img['src'].replace('_tn.jpg', '_mr.jpg') + + masthead_url = 'https://lifestyle.livemint.com/mintlounge/static-images/lounge-logo.svg' - oldest_article = 6 # days + oldest_article = 6.5 # days extra_css = ''' #story-summary-0 {font-style:italic; color:#202020;} @@ -63,6 +64,13 @@ class LiveMint(BasicNewsRecipe): img['src'] = img['data-img'] return soup else: + + def get_cover_url(self): + soup = self.index_to_soup( + 'https://www.magzter.com/IN/HT-Digital-Streams-Ltd./Mint-Mumbai/Newspaper/' + ) + for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): + return citem['content'] extra_css = ''' img {display:block; margin:0 auto;} From c547b188d7c0a146d35a6ef5d38eb4e66df9aa94 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:50:51 +0530 Subject: [PATCH 4/5] Update irish_times.recipe --- recipes/irish_times.recipe | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/irish_times.recipe b/recipes/irish_times.recipe index 31732eec2e..3c3c97d0c1 100644 --- a/recipes/irish_times.recipe +++ b/recipes/irish_times.recipe @@ -33,11 +33,11 @@ class IrishTimes(BasicNewsRecipe): temp_files = [] keep_only_tags = [ dict(name=['h1', 'h2']), - classes('lead-art-wrapper article-body-wrapper'), + classes('lead-art-wrapper article-body-wrapper byline-text'), ] remove_tags = [ dict(name='button'), - classes('sm-promo-headline'), + classes('sm-promo-headline top-table-list-container'), ] remove_attributes = ['width', 'height'] From 8d66c6eab3f96ab7acc31e281a1ef052810f8630 Mon Sep 17 00:00:00 2001 From: unkn0w7n 
<51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:57:44 +0530 Subject: [PATCH 5/5] Update irish_independent.recipe --- recipes/irish_independent.recipe | 33 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/recipes/irish_independent.recipe b/recipes/irish_independent.recipe index 562016afee..a7425e6a29 100644 --- a/recipes/irish_independent.recipe +++ b/recipes/irish_independent.recipe @@ -12,35 +12,26 @@ class IrishIndependent(BasicNewsRecipe): description = 'Irish and World news from Irelands Bestselling Daily Broadsheet' __author__ = 'Neil Grogan' language = 'en_IE' - oldest_article = 7 + oldest_article = 2 max_articles_per_feed = 100 - remove_tags_before = dict(id='article') - remove_tags_after = [dict(name='div', attrs={'class': 'toolsBottom'})] no_stylesheets = True + keep_only_tags = [ - classes('n-content1 n-content2 n-content3'), + dict(name='div', attrs={'class':lambda x: x and '_contentwrapper' in x}) ] - remove_tags_after = classes('quick-subscribe') + remove_tags = [ - classes('icon1 icon-close c-lightbox1-side c-socials1 social-embed-consent-wall n-split1-side c-footer1'), - dict(attrs={'data-ad-slot': True}), - dict(attrs={'data-lightbox': True}), - dict(name='form'), - dict(attrs={'data-urn': lambda x: x and ':video:' in x}), + dict(name='div', attrs={'data-testid':['article-share', 'embed-video']}) ] feeds = [ - (u'Frontpage News', u'http://www.independent.ie/rss'), - (u'World News', u'http://www.independent.ie/world-news/rss'), - (u'Technology', u'http://www.independent.ie/business/technology/rss'), - (u'Sport', u'http://www.independent.ie/sport/rss'), - (u'Entertainment', u'http://www.independent.ie/entertainment/rss'), - (u'Independent Woman', u'http://www.independent.ie/lifestyle/independent-woman/rss'), - (u'Education', u'http://www.independent.ie/education/rss'), - (u'Lifestyle', u'http://www.independent.ie/lifestyle/rss'), - (u'Travel', 
u'http://www.independent.ie/travel/rss'), - (u'Letters', u'http://www.independent.ie/opinion/letters/rss'), - (u'Weather', u'http://www.independent.ie/weather/rss') + ('News', 'http://www.independent.ie/rss'), + ('Opinion', 'http://www.independent.ie/opinion/rss'), + ('Business', 'http://www.independent.ie/business/rss'), + ('Sport', 'http://www.independent.ie/sport/rss'), + ('Life', 'http://www.independent.ie/life/rss'), + ('Style', 'http://www.independent.ie/style/rss'), + ('Entertainment', 'http://www.independent.ie/entertainment/rss'), ] def preprocess_html(self, soup):