calibre/recipes/sportstar.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
from collections import OrderedDict

from calibre.web.feeds.news import BasicNewsRecipe, classes


class Sportstar(BasicNewsRecipe):
    """Recipe that downloads one issue of the Sportstar magazine.

    The issue is either the one named through the ``issue`` recipe option or
    the first issue linked from the magazine landing page.
    """

    title = u'Sportstar'
    __author__ = 'unkn0wn'
    description = (
        'Sportstar began as a Print Only multi-sport weekly on July 15, 1978.'
        ' In 2018 the periodicity of Sportstar was made fortnightly with a lot of in-depth articles coming into the mix.'
        ' Our readers have been the fount of inspiration in our attempts at exploring new angles in sports journalism.')
    language = 'en_IN'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    masthead_url = 'https://assetsss.thehindu.com/theme/images/SSRX/sportstar-logo.svg'
    # Drop fixed image dimensions from the downloaded markup.
    remove_attributes = ['height', 'width']
    extra_css = '''
        .sub-title {font-style:italic; color:#202020;}
        .caption {font-size:small; text-align:center;}
        .author, .publish-time {font-size:small;}
    '''

    # Option definition read back in parse_index() under the key 'issue'.
    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download\n(Volume-Issue format)',
            'long': 'For example, 47-16'
        }
    }

    # Only the headline, sub-title, byline/time, lead picture and body are kept.
    keep_only_tags = [
        dict(name='h1', attrs={'class':'title'}),
        dict(name='h2', attrs={'class':'sub-title'}),
        classes('publish-time author top-pic articlebodycontent')
    ]

    # Elements removed from the kept content, selected by CSS class.
    remove_tags = [
        classes(
            'show-mobile inlineAds related-topics related-stories comments-shares'
            ' share-page title-patch pic-caption slide-mobile also-read'
        )
    ]

    def parse_index(self):
        """Build the section/article index for one magazine issue.

        The issue URL comes from the ``issue`` recipe option when that option
        holds a string; otherwise it is the first link on the magazine landing
        page that points below ``/magazine/issue/``.

        As a side effect, ``self.cover_url`` and ``self.description`` are set
        from the issue page when the corresponding elements are present.

        Returns:
            A list of ``(section_title, articles)`` tuples, in the order the
            sections first appear on the issue page. Each article is a dict
            with the keys ``title``, ``url`` and ``description``.
        """
        d = self.recipe_specific_options.get('issue')
        # The class-level entry for 'issue' is a dict of help texts, so only a
        # string is treated as an actual issue number.
        # NOTE(review): presumably calibre stores the user's value here as a str - confirm.
        if d and isinstance(d, str):
            issue_url = 'https://sportstar.thehindu.com/magazine/issue/vol' + d
        else:
            soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/')
            issue_url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href']
        self.log('Downloading Issue: ', issue_url)

        soup = self.index_to_soup(issue_url)

        feeds = OrderedDict()

        # Cover and issue description are best-effort: a missing element must
        # not prevent the articles from being downloaded.
        info = soup.find('div', attrs={'class':lambda x: x and 'left-sticky' in x.split()})
        if info is not None:
            cover_box = info.find('div', attrs={'class':'sptar-image'})
            cover_img = cover_box.find('img') if cover_box is not None else None
            cover = cover_img.get('data-original') if cover_img is not None else None
            if cover:
                # Swap the size token in the image URL for the larger variant.
                self.cover_url = cover.replace('FREE_320', 'FREE_1200')
            self.description = self.tag_to_string(info.find('div', attrs={'class':'sub-text'})).strip()

        for content in soup.findAll('div', attrs={'class':'content'}):
            h3 = content.find('h3', attrs={'class':'title'})
            link = h3.find('a', href=True) if h3 is not None else None
            if link is None:
                # Not an article teaser (no linked headline): skip it rather
                # than abort the whole issue.
                continue
            url = link['href']
            title = self.tag_to_string(h3).strip()
            desc = self.tag_to_string(content.find('div', attrs={'class':'sub-text'})).strip()
            section_title = self.tag_to_string(content.find('div', attrs={'class':'label'})).strip()
            self.log(section_title)
            self.log('\t', title)
            self.log('\t', desc)
            self.log('\t\t', url)
            # Group articles under their section label, keeping first-seen order.
            feeds.setdefault(section_title, []).append({
                'title': title,
                'url': url,
                'description': desc})
        return list(feeds.items())

    def preprocess_html(self, soup):
        """Normalise the sub-title and lazy-loaded images of an article page.

        The first ``h2`` is renamed to ``p``. Every ``img`` carrying a
        ``data-original`` attribute is handled in one of two ways:

        * if ``data-original`` ends with ``1x1_spacer.png`` the ``img`` is
          removed and the closest preceding ``source`` element that has a
          ``srcset`` is turned into an ``img`` whose ``src`` is that
          ``srcset`` with ``_320`` replaced by ``_1200``;
        * otherwise ``src`` is set to the ``data-original`` value.

        Returns the modified soup.
        """
        if h2 := soup.find('h2'):
            h2.name = 'p'
        for img in soup.findAll('img', attrs={'data-original':True}):
            if img['data-original'].endswith('1x1_spacer.png'):
                # Look up the preceding <source> before detaching the image.
                source = img.findPrevious('source', srcset=True)
                img.extract()
                if source:
                    source['src'] = source['srcset'].replace('_320','_1200')
                    source.name = 'img'
            else:
                img['src'] = img['data-original']
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Remove every remaining ``source`` element and return the soup."""
        for src in soup.findAll('source'):
            src.extract()
        return soup