Update Frontline and Sportstar

Kovid Goyal 2022-11-27 10:14:25 +05:30
parent 5b5b7eeab1
commit 8c88a4b306
GPG Key ID: 06BC317B515ACE7C
2 changed files with 88 additions and 30 deletions


@@ -32,11 +32,23 @@ class Frontline(BasicNewsRecipe):
     def preprocess_html(self, soup):
         for img in soup.findAll('img', attrs={'data-original':True}):
-            img['src'] = img['data-original']
+            if img['data-original'].endswith('1x1_spacer.png'):
+                source = img.findPrevious('source', srcset=True)
+                img.extract()
+                if source:
+                    source['src'] = source['srcset']
+                    source.name = 'img'
+            else:
+                img['src'] = img['data-original']
         for cap in soup.findAll(**classes('caption-cont')):
             cap.name = 'figcaption'
         return soup
 
+    def postprocess_html(self, soup, first_fetch):
+        for src in soup.findAll('source'):
+            src.extract()
+        return soup
+
     def parse_index(self):
         soup = self.index_to_soup('https://frontline.thehindu.com/magazine/')
         issue = soup.find(**classes('sptar-archive-item')).find('a')['href']

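The new Frontline preprocess_html replaces lazy-loading placeholder images with the real URL carried by the preceding <source> tag, instead of blindly copying data-original. A minimal sketch of that transformation, run with BeautifulSoup on a made-up HTML fragment (the markup and URLs below are illustrative, not taken from the site):

from bs4 import BeautifulSoup

# Illustrative markup: a 1x1 spacer <img> whose real picture lives in the
# srcset of a sibling <source>, plus an ordinary lazy-loaded image.
html = '''
<picture>
  <source srcset="https://example.com/photo-large.jpg">
  <img data-original="https://example.com/1x1_spacer.png">
</picture>
<img data-original="https://example.com/inline-photo.jpg">
'''

soup = BeautifulSoup(html, 'html.parser')
for img in soup.findAll('img', attrs={'data-original': True}):
    if img['data-original'].endswith('1x1_spacer.png'):
        # Promote the <source> carrying the real image to an <img> tag
        source = img.findPrevious('source', srcset=True)
        img.extract()
        if source:
            source['src'] = source['srcset']
            source.name = 'img'
    else:
        # Ordinary lazy-loaded images just get their URL copied into src
        img['src'] = img['data-original']

print(soup)

Any <source> tags that are still left over after this step are what the newly added postprocess_html strips out.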

@@ -1,6 +1,5 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
 from calibre.web.feeds.news import BasicNewsRecipe, classes
+from collections import OrderedDict
 
 
 class Sportstar(BasicNewsRecipe):
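Both recipes lean on the classes() helper imported above to turn a space-separated list of CSS class names into an attrs matcher for findAll(). A rough stand-in that illustrates the behaviour the recipes rely on (this is not calibre's exact implementation):

from bs4 import BeautifulSoup

def classes_like(names):
    # Illustrative helper: match any tag whose class attribute shares at
    # least one name with the given space-separated list.
    wanted = frozenset(names.split())
    def match(value):
        if not value:
            return False
        parts = value.split() if isinstance(value, str) else value
        return bool(wanted.intersection(parts))
    return {'attrs': {'class': match}}

soup = BeautifulSoup(
    '<div class="auth-name">By A. Reporter</div><div class="ad-slot">ad</div>',
    'html.parser')
print(soup.findAll(**classes_like('auth-name datelinew')))  # only the byline div

This is why entries such as classes('lead-img-cont auth-name datelinew art-content') in keep_only_tags select an element as soon as any one of those classes is present on it.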
@@ -15,42 +14,89 @@ class Sportstar(BasicNewsRecipe):
     remove_javascript = True
     use_embedded_content = False
     encoding = 'utf-8'
-    oldest_article = 14
-    max_articles_per_feed = 50
     ignore_duplicate_articles = {'url'}
     resolve_internal_links = True
-    masthead_url = 'https://ss.thgim.com/static/theme/default/base/img/logoNew.png'
+    masthead_url = 'https://sportstar.thehindu.com/theme/images/ss-online/sslogo.png'
     remove_attributes = ['height', 'width']
-    extra_css = '#pic-img{font-size: small; font-style: italic;}'
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/ebook/')
-        tag = soup.find(attrs={'class': 'wrapImg'})
-        if tag:
-            self.cover_url = tag.find('img')['data-proxy-image'].replace(
-                "FREE_180", "FREE_1200"
-            )
-        return super().get_cover_url()
+    extra_css = '''
+        .text-secondary{font-style:italic; color:#404040;}
+        .lead-img-caption, .caption-cont{font-size:small; text-align:center;}
+        .auth-name, .datelinew {font-size:small;}
+    '''
 
     keep_only_tags = [
         dict(name='h1'),
-        dict(name='h2'),
-        classes('sport-icon byline home-content-date blog-img-grow home-content-p')
+        dict(name='h2', attrs={'class':'text-secondary'}),
+        classes('lead-img-cont auth-name datelinew art-content')
     ]
-    remove_tags = [classes('bylineimg')]
-    remove_tags_after = [
-        classes('home-content-p'),
-    ]
 
-    feeds = [
-        ('Columns', 'https://sportstar.thehindu.com/columns/feeder/default.rss'),
-        ('Magazine', 'https://sportstar.thehindu.com/magazine/feeder/default.rss'),
-        ('Statsman', 'https://sportstar.thehindu.com/statsman/feeder/default.rss'),
-        # More feeds : https://sportstar.thehindu.com/rssfeeds/
+    remove_tags = [
+        classes('mbot capsletter article-body1')
     ]
 
+    def parse_index(self):
+        soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/')
+        url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href']
+        self.log('Downloading Issue: ', url)
+        soup = self.index_to_soup(url)
+        feeds = OrderedDict()
+
+        info = soup.find('div', attrs={'class':'sptar-cover-item'})
+        self.cover_url = info.find('div', attrs={'class':'card'}
+                                   ).find('img')['data-original'].replace('FREE_320', 'FREE_1200')
+        data = info.find('div', attrs={'class':'cover-content'})
+        self.timefmt = ' (' + self.tag_to_string(data.h3).strip() + ') [' +\
+            self.tag_to_string(data.find('span', attrs={'class':'date'})) + ']'
+        self.description = self.tag_to_string(data.p).strip()
+
+        for content in soup.findAll('div', attrs={'class':'brief-cnt-wrap'}):
+            articles = []
+            h4 = content.find('h4', attrs={'class':'brief-title'})
+            a = h4.find('a', href=True)
+            url = a['href']
+            title = self.tag_to_string(a).strip()
+            desc = self.tag_to_string(content.find('div', attrs={'class':'artbody'})).strip()
+            section_title = self.tag_to_string(content.find('span', attrs={'class':'brief-place'})).strip()
+            self.log(section_title)
+            self.log('\t', title)
+            self.log('\t', desc)
+            self.log('\t\t', url)
+            articles.append({
+                'title': title,
+                'url': url,
+                'description': desc})
+            if articles:
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+        ans = [(key, val) for key, val in feeds.items()]
+        return ans
+
     def preprocess_html(self, soup):
-        for img in soup.findAll('img', attrs={'data-proxy-image': True}):
-            img['src'] = img['data-proxy-image'].replace("FREE_180", "FREE_1200")
+        h2 = soup.findAll(**classes('text-secondary'))
+        if h2[0]:
+            h2[0].name = 'p'
+        if h2[1]:
+            h2[1].extract()
+        for img in soup.findAll('img', attrs={'data-original':True}):
+            if img['data-original'].endswith('1x1_spacer.png'):
+                source = img.findPrevious('source', srcset=True)
+                img.extract()
+                if source:
+                    source['src'] = source['srcset']
+                    source.name = 'img'
+            else:
+                img['src'] = img['data-original']
+        for cap in soup.findAll('div', attrs={'class':'caption-cont'}):
+            h4 = cap.find('h4')
+            if h4:
+                h4.name = 'figcaption'
+        return soup
+
+    def postprocess_html(self, soup, first_fetch):
+        for src in soup.findAll('source'):
+            src.extract()
         return soup
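The rewritten Sportstar parse_index groups the issue's articles by section and returns them as a list of (section title, article list) tuples, which is the shape calibre's parse_index() contract expects, with OrderedDict preserving the order sections appear in on the page. A toy illustration of just that grouping step, using hypothetical scraped values:

from collections import OrderedDict

# Hypothetical scraped rows: (section, title, url, description)
scraped = [
    ('Cricket', 'First story', 'https://example.com/1', 'Lead feature'),
    ('Football', 'Second story', 'https://example.com/2', 'Match report'),
    ('Cricket', 'Third story', 'https://example.com/3', 'Column'),
]

feeds = OrderedDict()
for section, title, url, desc in scraped:
    article = {'title': title, 'url': url, 'description': desc}
    # Same pattern as the recipe: create the section's list on first sight,
    # then keep appending, so section order follows the page order.
    if section not in feeds:
        feeds[section] = []
    feeds[section] += [article]

ans = [(key, val) for key, val in feeds.items()]
for section, articles in ans:
    print(section, '->', [a['title'] for a in articles])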