Update Indian Express

Kovid Goyal 2022-12-15 12:31:44 +05:30
parent c38a220aba
commit 91ce1e3cd1


@@ -1,5 +1,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe, classes
+from datetime import date, datetime, timedelta
+from calibre.utils.date import parse_date

 class IndianExpress(BasicNewsRecipe):
     title = u'Indian Express'
@@ -42,19 +43,25 @@ class IndianExpress(BasicNewsRecipe):
         ]

     def parse_index(self):
         section_list = [
             ('Front Page', 'https://indianexpress.com/print/front-page/'),
             ('India', 'https://indianexpress.com/section/india/'),
-            # ('Express Network', 'https://indianexpress.com/print/express-network/'),
+            #('Express Network', 'https://indianexpress.com/print/express-network/'),
             ('Delhi Confidential', 'https://indianexpress.com/section/delhi-confidential/'),
             ('Opinion', 'http://indianexpress.com/section/opinion/'),
             ('UPSC-CSE Key', 'https://indianexpress.com/section/upsc-current-affairs/'),
+            ('Explained', 'https://indianexpress.com/section/explained/'),
             ('Business', 'https://indianexpress.com/section/business/'),
-            ('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
+            #('Political Pulse', 'https://indianexpress.com/section/political-pulse/'),
             ('Sunday Eye', 'https://indianexpress.com/section/express-sunday-eye/'),
-            # ('Education', 'https://indianexpress.com/section/education/'),
-            # ('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
-            # ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
+            ('World', 'https://indianexpress.com/section/world/'),
+            #('Education', 'https://indianexpress.com/section/education/'),
+            #('Gadgets', 'https://indianexpress.com/section/technology/gadgets/'),
+            ('Tech Review', 'https://indianexpress.com/section/technology/tech-reviews/'),
+            #('Techhook', 'https://indianexpress.com/section/technology/techook/'),
+            #('Laptops', 'https://indianexpress.com/section/technology/laptops/'),
+            #('Mobiles & Tabs', 'https://indianexpress.com/section/technology/mobile-tabs/'),
             ('Science', 'https://indianexpress.com/section/technology/science/'),
             ('Movie Review', 'https://indianexpress.com/section/entertainment/movie-review/'),
         ]
@@ -67,30 +74,40 @@ class IndianExpress(BasicNewsRecipe):
             section_url = section[1]
             self.log(section_title, section_url)
             soup = self.index_to_soup(section_url)
-            articles = self.articles_from_soup(soup)
+            if '/world/' in section_url or '/explained/' in section_url:
+                articles = self.articles_from_page(soup)
+            else:
+                articles = self.articles_from_soup(soup)
             if articles:
                 feeds.append((section_title, articles))
         return feeds

+    def articles_from_page(self, soup):
+        ans = []
+        for div in soup.findAll(attrs={'class':['northeast-topbox', 'explained-section-grid']}):
+            for a in div.findAll('a', href=True):
+                if not a.find('img') and not '/section/' in a['href']:
+                    url = a['href']
+                    title = self.tag_to_string(a)
+                    self.log('\t', title, '\n\t\t', url)
+                    ans.append({'title': title, 'url': url, 'description': ''})
+        return ans
+
     def articles_from_soup(self, soup):
         ans = []
         div = soup.find('div', attrs={'class':['nation', 'o-opin']})
         for art in div.findAll(attrs={'class':['articles', 'o-opin-article']}):
             for a in art.findAll('a', href=True):
-                if not a.find('img'):
+                if not a.find('img') and not '/profile/' in a['href']:
                     url = a['href']
                     title = self.tag_to_string(a)
                     desc = ''
                     if p:= art.find('p'):
                         desc = self.tag_to_string(p)
                     if da := art.find('div', attrs={'class':['date', 'o-opin-date']}):
-                        from datetime import datetime, timedelta
-                        from calibre.utils.date import parse_date
-                        d = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
+                        date = parse_date(self.tag_to_string(da)).replace(tzinfo=None)
                         today = datetime.now()
-                        if (today - d) > timedelta(self.oldest_article):
-                            continue
+                        if (today - date) > timedelta(self.oldest_article):
+                            url = ''
+                    if not url or not title:
+                        continue
                     self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                     ans.append({'title': title, 'url': url, 'description': desc})
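The reworked filtering in articles_from_soup parses each listing's date, compares it against the recipe's oldest_article window, and blanks the URL so the entry is skipped instead of aborting the loop. Below is a minimal standalone sketch of that cutoff, assuming a hypothetical listing date format like "December 15, 2022 10:30 IST" and a stand-in value for oldest_article; the real recipe uses calibre.utils.date.parse_date, which accepts many more formats.

```python
from datetime import datetime, timedelta

OLDEST_ARTICLE_DAYS = 1.15  # assumed stand-in for self.oldest_article

def is_too_old(date_text, fmt='%B %d, %Y %H:%M'):
    # Drop the trailing timezone label and parse the remainder naively,
    # mirroring the .replace(tzinfo=None) in the recipe.
    cleaned = date_text.replace('IST', '').strip()
    when = datetime.strptime(cleaned, fmt)
    return datetime.now() - when > timedelta(OLDEST_ARTICLE_DAYS)

# A listing older than the window gets its URL blanked and is skipped.
print(is_too_old('December 15, 2022 10:30 IST'))
```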
@@ -104,8 +121,7 @@ class IndianExpress(BasicNewsRecipe):
             return citem['content']

     def preprocess_html(self, soup):
-        h2 = soup.find('h2')
-        if h2:
+        if h2 := soup.find('h2'):
             h2.name = 'p'
             h2['id'] = 'sub-d'
         for span in soup.findAll(
@@ -119,4 +135,9 @@ class IndianExpress(BasicNewsRecipe):
             if lazy is not None:
                 lazy.extract()
             noscript.name = 'div'
+        if span := soup.find('span', content=True, attrs={'itemprop':'dateModified'}):
+            date = parse_date(span['content']).replace(tzinfo=None)
+            today = datetime.now()
+            if (today - date) > timedelta(self.oldest_article):
+                self.abort_article('Skipping old article')
         return soup
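The last hunk adds a second, per-article guard: preprocess_html reads the article's itemprop="dateModified" span and aborts the download when it falls outside the oldest_article window. A rough standalone illustration of the same check, using BeautifulSoup and datetime.fromisoformat on an assumed HTML snippet; inside calibre the soup is supplied by the framework and the skip is done via self.abort_article():

```python
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

OLDEST_ARTICLE_DAYS = 1.15  # assumed stand-in for self.oldest_article

# Hypothetical fragment of an article page.
html = '<span itemprop="dateModified" content="2022-12-10T09:00:00+05:30"></span>'
soup = BeautifulSoup(html, 'html.parser')

span = soup.find('span', content=True, attrs={'itemprop': 'dateModified'})
if span is not None:
    # Parse the ISO timestamp and compare it naively, as the recipe does.
    modified = datetime.fromisoformat(span['content']).replace(tzinfo=None)
    if datetime.now() - modified > timedelta(OLDEST_ARTICLE_DAYS):
        print('Skipping old article')  # the recipe calls self.abort_article() here
```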