From 91ce1e3cd1763a84bf08636fd00bea2f3b4053b6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 15 Dec 2022 12:31:44 +0530
Subject: [PATCH] Update Indian Express

---
 recipes/indian_express.recipe | 65 +++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/recipes/indian_express.recipe b/recipes/indian_express.recipe
index e9e3620c6b..60fc14c3f8 100644
--- a/recipes/indian_express.recipe
+++ b/recipes/indian_express.recipe
@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-
+from datetime import date, datetime, timedelta
+from calibre.utils.date import parse_date
 
 class IndianExpress(BasicNewsRecipe):
     title = u'Indian Express'
@@ -13,7 +14,7 @@ class IndianExpress(BasicNewsRecipe):
     use_embedded_content = False
     remove_attributes = ['style', 'height', 'width']
     ignore_duplicate_articles = {'url'}
-
+    
     extra_css = '''
         #storycenterbyline {font-size:small;}
         #img-cap {font-size:small;}
@@ -22,7 +23,7 @@ class IndianExpress(BasicNewsRecipe):
         #sub-d{color:#202020; font-style:italic;}
         .ie-authorbox{font-size:small;}
     '''
-
+    
     resolve_internal_links = True
     remove_empty_feeds = True
 
@@ -40,25 +41,31 @@ class IndianExpress(BasicNewsRecipe):
             ' custom-share o-story-paper-quite ie-network-commenting audio-player-tts-sec'
         )
     ]
-
+    
     def parse_index(self):
+
         section_list = [
             ('Front Page', 'https://indianexpress.com/print/front-page/'),
             ('India', 'https://indianexpress.com/section/india/'),
-            # ('Express Network', 'https://indianexpress.com/print/express-network/'),
+            # ('Express Network', 'https://indianexpress.com/print/express-network/'),
             ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'),
             ('Opinion', 'http://indianexpress.com/section/opinion/'),
             ('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'),
+            ('Explained', 'https://indianexpress.com/section/explained/'),
             ('Business', 'https://indianexpress.com/section/business/'),
-            ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
+            # ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
             ('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'),
-            # ('Education', 'https://indianexpress.com/section/education/'),
-            # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
-            # ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
+            ('World', 'https://indianexpress.com/section/world/'),
+            # ('Education', 'https://indianexpress.com/section/education/'),
+            # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
+            ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
+            # ('Techhook', 'https://indianexpress.com/section/technology/techook/'),
+            # ('Laptops', 'https://indianexpress.com/section/technology/laptops/'),
+            # ('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'),
             ('Science', 'https://indianexpress.com/section/technology/science/'),
             ('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'),
         ]
-
+
         feeds = []
 
         # For each section title, fetch the article urls
@@ -67,30 +74,40 @@ class IndianExpress(BasicNewsRecipe):
             section_url = section[1]
             self.log(section_title, section_url)
             soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
+            if '/world/' in section_url or '/explained/' in section_url:  # these index pages use other container classes
+                articles = self.articles_from_page(soup)
+            else:
+                articles = self.articles_from_soup(soup)
             if articles:
                 feeds.append((section_title, articles))
         return feeds
-
+    
+    def articles_from_page(self, soup):  # article list for the World/Explained section pages
+        ans = []
+        for div in soup.findAll(attrs={'class':['northeast-topbox', 'explained-section-grid']}):
+            for a in div.findAll('a', href=True):
+                if not a.find('img') and '/section/' not in a['href']:  # skip image links and section links
+                    url = a['href']
+                    title = self.tag_to_string(a)
+                    self.log('\t', title, '\n\t\t', url)
+                    ans.append({'title': title, 'url': url, 'description': ''})
+        return ans
+    
     def articles_from_soup(self, soup):
         ans = []
         div = soup.find('div', attrs={'class':['nation', 'o-opin']})
         for art in div.findAll(attrs={'class':['articles', 'o-opin-article']}):
             for a in art.findAll('a', href=True):
-                if not a.find('img'):
+                if not a.find('img') and '/profile/' not in a['href']:  # skip image links and author profile links
                     url = a['href']
                     title = self.tag_to_string(a)
                     desc = ''
                     if p:= art.find('p'):
                         desc = self.tag_to_string(p)
                     if da := art.find('div', attrs={'class':['date', 'o-opin-date']}):
-                        from datetime import datetime, timedelta
-                        from calibre.utils.date import parse_date
-                        d = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
+                        published = parse_date(self.tag_to_string(da)).replace(tzinfo=None)  # naive, to compare with now()
                         today = datetime.now()
-                        if (today - d) > timedelta(self.oldest_article):
-                            url = ''
-                        if not url or not title:
+                        if (today - published) > timedelta(self.oldest_article):  # older than oldest_article days
                             continue
                     self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                     ans.append({'title': title, 'url': url, 'description': desc})
@@ -104,8 +121,7 @@ class IndianExpress(BasicNewsRecipe):
             return citem['content']
 
     def preprocess_html(self, soup):
-        h2 = soup.find('h2')
-        if h2:
+        if h2 := soup.find('h2'):
             h2.name = 'p'
             h2['id'] = 'sub-d'
         for span in soup.findAll(
@@ -119,4 +135,9 @@ class IndianExpress(BasicNewsRecipe):
                 if lazy is not None:
                     lazy.extract()
                 noscript.name = 'div'
-        return soup
+        if span := soup.find('span', content=True, attrs={'itemprop':'dateModified'}):
+            modified = parse_date(span['content']).replace(tzinfo=None)  # naive, to compare with now()
+            today = datetime.now()
+            if (today - modified) > timedelta(self.oldest_article):
+                self.abort_article('Skipping old article')  # last modified more than oldest_article days ago
+        return soup
\ No newline at end of file