From da640add79230559e13c8997aa77fa0a39764102 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 2 Aug 2025 19:08:13 +0530 Subject: [PATCH 1/4] Update indian_express.recipe Switched to RSS feeds. --- recipes/indian_express.recipe | 199 ++++++++++++++++++++-------------- 1 file changed, 118 insertions(+), 81 deletions(-) diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe index 6a185992d4..1ef612b11b 100644 --- a/recipes/indian_express.recipe +++ b/recipes/indian_express.recipe @@ -11,7 +11,6 @@ class IndianExpress(BasicNewsRecipe): language = 'en_IN' __author__ = 'unkn0wn' oldest_article = 1.15 # days - max_articles_per_feed = 25 encoding = 'utf-8' masthead_url = 'https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg' no_stylesheets = True @@ -49,84 +48,118 @@ class IndianExpress(BasicNewsRecipe): ) ] - def parse_index(self): + recipe_specific_options = { + 'days': { + 'short': 'Oldest article to download from this news source. In days ', + 'long': 'For example, 0.5, gives you articles from the past 12 hours', + 'default': str(oldest_article), + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 400, 800, 1200, 1600', + 'long': 'This is useful for non e-ink devices.', + 'default': '600', + }, + } - section_list = [ - ('Daily Briefing', 'https://indianexpress.com/section/live-news/'), - ('Front Page', 'https://indianexpress.com/print/front-page/'), - ('India', 'https://indianexpress.com/section/india/'), - # ('Express Network', 'https://indianexpress.com/print/express-network/'), - ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'), - ('Editorials', 'https://indianexpress.com/section/opinion/editorials/'), - ('Columns', 'https://indianexpress.com/section/opinion/columns/'), - ('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'), - ('Explained', 'https://indianexpress.com/section/explained/'), - ('Business', 'https://indianexpress.com/section/business/'), - # ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'), - ('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'), - ('World', 'https://indianexpress.com/section/world/'), - # ('Education', 'https://indianexpress.com/section/education/'), - # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'), - ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'), - # ('Techhook', 'https://indianexpress.com/section/technology/techook/'), - # ('Laptops', 'https://indianexpress.com/section/technology/laptops/'), - # ('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'), - ('Science', 'https://indianexpress.com/section/technology/science/'), - ('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'), - ] + def __init__(self, *args, **kwargs): + BasicNewsRecipe.__init__(self, *args, **kwargs) + d = self.recipe_specific_options.get('days') + if d and isinstance(d, str): + self.oldest_article = float(d) - feeds = [] + feeds = [ + 'https://indianexpress.com/section/opinion/feed', + 'https://indianexpress.com/section/delhi-confidential/feed', + 'https://indianexpress.com/section/india/feed', + 'https://indianexpress.com/section/political-pulse/feed', + 'https://indianexpress.com/section/explained/feed', + 'https://indianexpress.com/section/business/feed/', + 'https://indianexpress.com/section/upsc-current-affairs/feed', + 'https://indianexpress.com/section/express-sunday-eye/feed', + 'http://indianexpress.com/section/world/feed', + 'https://indianexpress.com/section/technology/feed', + 'https://indianexpress.com/section/entertainment/feed', + 'https://indianexpress.com/feed', + ] - # For each section title, fetch the article urls - for section in section_list: - section_title = section[0] - section_url = section[1] - self.log(section_title, section_url) - soup = self.index_to_soup(section_url) - if '/world/' in section_url or '/explained/' in section_url: - articles = self.articles_from_page(soup) - else: - articles = self.articles_from_soup(soup) - if articles: - feeds.append((section_title, articles)) - return feeds + # def parse_index(self): - def articles_from_page(self, soup): - ans = [] - for div in soup.findAll(attrs={'class': ['northeast-topbox', 'explained-section-grid']}): - for a in div.findAll('a', href=True): - if not a.find('img') and '/section/' not in a['href']: - url = a['href'] - title = self.tag_to_string(a) - self.log('\t', title, '\n\t\t', url) - ans.append({'title': title, 'url': url, 'description': ''}) - return ans + # section_list = [ + # ('Daily Briefing', 'https://indianexpress.com/section/live-news/'), + # ('Front Page', 'https://indianexpress.com/print/front-page/'), + # ('India', 'https://indianexpress.com/section/india/'), + # # ('Express Network', 'https://indianexpress.com/print/express-network/'), + # ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'), + # ('Editorials', 'https://indianexpress.com/section/opinion/editorials/'), + # ('Columns', 'https://indianexpress.com/section/opinion/columns/'), + # ('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'), + # ('Explained', 'https://indianexpress.com/section/explained/'), + # ('Business', 'https://indianexpress.com/section/business/'), + # # ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'), + # ('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'), + # ('World', 'https://indianexpress.com/section/world/'), + # # ('Education', 'https://indianexpress.com/section/education/'), + # # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'), + # ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'), + # # ('Techhook', 'https://indianexpress.com/section/technology/techook/'), + # # ('Laptops', 'https://indianexpress.com/section/technology/laptops/'), + # # ('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'), + # ('Science', 'https://indianexpress.com/section/technology/science/'), + # ('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'), + # ] - def articles_from_soup(self, soup): - ans = [] - div = soup.find('div', attrs={'class': ['nation', 'o-opin', 'myie-nation', 'opinion-more-wrapper']}) - for art in div.findAll( - attrs={'class': ['articles', 'o-opin-article', 'myie-articles']} - ): - for a in art.findAll('a', href=True): - if not a.find('img') and not any( - x in a['href'] for x in ['/profile/', '/agency/', '/section/'] - ): - url = a['href'] - title = self.tag_to_string(a) - desc = '' - if p := (art.find('p') or art.find(attrs={'class': 'opinion-news-para'})): - desc = self.tag_to_string(p) - if da := art.find( - attrs={'class': ['date', 'o-opin-date', 'opinion-date', 'my-time']} - ): - date = parse_date(self.tag_to_string(da)).replace(tzinfo=None) - today = datetime.now() - if (today - date) > timedelta(self.oldest_article): - continue - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - ans.append({'title': title, 'url': url, 'description': desc}) - return ans + # feeds = [] + + # # For each section title, fetch the article urls + # for section in section_list: + # section_title = section[0] + # section_url = section[1] + # self.log(section_title, section_url) + # soup = self.index_to_soup(section_url) + # if '/world/' in section_url or '/explained/' in section_url: + # articles = self.articles_from_page(soup) + # else: + # articles = self.articles_from_soup(soup) + # if articles: + # feeds.append((section_title, articles)) + # return feeds + + # def articles_from_page(self, soup): + # ans = [] + # for div in soup.findAll(attrs={'class': ['northeast-topbox', 'explained-section-grid']}): + # for a in div.findAll('a', href=True): + # if not a.find('img') and '/section/' not in a['href']: + # url = a['href'] + # title = self.tag_to_string(a) + # self.log('\t', title, '\n\t\t', url) + # ans.append({'title': title, 'url': url, 'description': ''}) + # return ans + + # def articles_from_soup(self, soup): + # ans = [] + # div = soup.find('div', attrs={'class': ['nation', 'o-opin', 'myie-nation', 'opinion-more-wrapper']}) + # for art in div.findAll( + # attrs={'class': ['articles', 'o-opin-article', 'myie-articles']} + # ): + # for a in art.findAll('a', href=True): + # if not a.find('img') and not any( + # x in a['href'] for x in ['/profile/', '/agency/', '/section/'] + # ): + # url = a['href'] + # title = self.tag_to_string(a) + # desc = '' + # if p := (art.find('p') or art.find(attrs={'class': 'opinion-news-para'})): + # desc = self.tag_to_string(p) + # if da := art.find( + # attrs={'class': ['date', 'o-opin-date', 'opinion-date', 'my-time']} + # ): + # date = parse_date(self.tag_to_string(da)).replace(tzinfo=None) + # today = datetime.now() + # if (today - date) > timedelta(self.oldest_article): + # continue + # self.log('\t', title, '\n\t', desc, '\n\t\t', url) + # ans.append({'title': title, 'url': url, 'description': desc}) + # return ans def get_cover_url(self): soup = self.index_to_soup( @@ -136,6 +169,10 @@ class IndianExpress(BasicNewsRecipe): return citem['content'].replace('300', '600') def preprocess_html(self, soup): + width = '600' + w = self.recipe_specific_options.get('res') + if w and isinstance(w, str): + width = w if h2 := (soup.find(attrs={'itemprop': 'description'}) or soup.find(**classes('synopsis'))): h2.name = 'p' h2['id'] = 'sub-d' @@ -144,12 +181,12 @@ class IndianExpress(BasicNewsRecipe): ): span['id'] = 'img-cap' for img in soup.findAll('img', attrs={'data-src': True}): - img['src'] = img['data-src'] - if span := soup.find('span', content=True, attrs={'itemprop': 'dateModified'}): - date = parse_date(span['content']).replace(tzinfo=None) - today = datetime.now() - if (today - date) > timedelta(self.oldest_article): - self.abort_article('Skipping old article') + img['src'] = img['data-src'].split('?')[0] + '?w=' + width + # if span := soup.find('span', content=True, attrs={'itemprop': 'dateModified'}): + # date = parse_date(span['content']).replace(tzinfo=None) + # today = datetime.now() + # if (today - date) > timedelta(self.oldest_article): + # self.abort_article('Skipping old article') for img in soup.findAll('img', attrs={'src': True}): - img['src'] = img['src'].split('?')[0] + '?w=600' + img['src'] = img['src'].split('?')[0] + '?w=' + width return soup From 9305014fd284fe98342c4baf3e021644f5d0503f Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 2 Aug 2025 19:09:28 +0530 Subject: [PATCH 2/4] Update press_information_bureau.recipe --- recipes/press_information_bureau.recipe | 27 +++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/recipes/press_information_bureau.recipe b/recipes/press_information_bureau.recipe index 1558fb82e1..c5667e7e5c 100644 --- a/recipes/press_information_bureau.recipe +++ b/recipes/press_information_bureau.recipe @@ -1,30 +1,31 @@ +#!/usr/bin/env python from calibre.web.feeds.news import BasicNewsRecipe, classes class PIB(BasicNewsRecipe): - title = u'Press Information Bureau' + title = 'Press Information Bureau' language = 'en_IN' __author__ = 'unkn0wn' no_stylesheets = True use_embedded_content = False - remove_attributes = ['style','height','width'] + remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url'} - description = ('The Press Information Bureau (PIB) is the nodal agency of the Government of India' - ' to disseminate information to the print and electronic media on government policies,' - ' programmes, initiatives and achievements. Best downloaded at the end of the day!') + description = ( + 'The Press Information Bureau (PIB) is the nodal agency of the Government of India' + ' to disseminate information to the print and electronic media on government policies,' + ' programmes, initiatives and achievements. Best downloaded at the end of the day!' + ) extra_css = ''' #ltrSubtitle{color:#404040;} blockquote{color:#404040;} .ReleaseDateSubHeaddateTime{font-style:italic; font-size:small;} ''' - masthead_url = 'https://tse3.mm.bing.net/th?id=OIP.4QE8KPl1dZ3_BoR3X92aqgHaIH' - keep_only_tags = [ - classes('innner-page-main-about-us-content-right-part') - ] - remove_tags = [ - classes('ReleaseLang log_oo') - ] + masthead_url = 'https://tse3.mm.bing.net/th?id=OIP.4QE8KPl1dZ3_BoR3X92aqgHaIH' + cover_url = 'https://static.pib.gov.in/WriteReadData/specificdocs/photo/2024/jun/ph2024624343601.jpg' + + keep_only_tags = [classes('innner-page-main-about-us-content-right-part')] + remove_tags = [classes('ReleaseLang log_oo')] def parse_index(self): soup = self.index_to_soup('https://pib.gov.in/Allrel.aspx') @@ -37,7 +38,7 @@ class PIB(BasicNewsRecipe): for a in div.findAll('a', href=True): url = a['href'] if url.startswith('/'): - url = 'https://pib.gov.in' + url + url = 'https://pib.gov.in' + url title = self.tag_to_string(a) self.log('\t', title, '\n\t\t', url) articles.append({'title': title, 'url': url}) From 5809958135a57a1cf7188ed18309092cc8ba60d3 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 2 Aug 2025 19:11:57 +0530 Subject: [PATCH 3/4] ... --- recipes/indian_express.recipe | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe index 1ef612b11b..cea9d8a5e5 100644 --- a/recipes/indian_express.recipe +++ b/recipes/indian_express.recipe @@ -1,8 +1,8 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -from datetime import datetime, timedelta +# from datetime import datetime, timedelta -from calibre.utils.date import parse_date +# from calibre.utils.date import parse_date from calibre.web.feeds.news import BasicNewsRecipe, classes From f4ed1869fadd12c1f3dbba388337574e1d1ad148 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 2 Aug 2025 19:46:14 +0530 Subject: [PATCH 4/4] ... --- recipes/indian_express.recipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe index cea9d8a5e5..85697b32cf 100644 --- a/recipes/indian_express.recipe +++ b/recipes/indian_express.recipe @@ -173,7 +173,7 @@ class IndianExpress(BasicNewsRecipe): w = self.recipe_specific_options.get('res') if w and isinstance(w, str): width = w - if h2 := (soup.find(attrs={'itemprop': 'description'}) or soup.find(**classes('synopsis'))): + if h2 := (soup.find(attrs={'itemprop': 'description'}) or soup.find(**classes('synopsis top-description'))): h2.name = 'p' h2['id'] = 'sub-d' for span in soup.findAll(