calibre/recipes/scroll.recipe

#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


class scroll(BasicNewsRecipe):
    """Recipe for Scroll.in that builds its feeds from links on the front page."""

    title = 'Scroll.in'
    __author__ = 'unkn0wn'
    description = (
        'The leading destination for original reporting on news, politics, and culture in India. '
        'Our award-winning team of journalists brings readers insightful analysis and opinion on the day’s '
        'headlines alongside a fresh mix of features on music, books, and cinema.'
    )
    language = 'en_IN'
    masthead_url = 'https://scroll.in/static/assets/scroll-logo.0f68c78dd023e2598248ea107feba562.003.svg'

    # Drop the site's own stylesheets and scripts; extra_css below supplies the styling.
    no_stylesheets = True
    remove_javascript = True

    # An article counts as a duplicate when either its title or its URL repeats.
    ignore_duplicate_articles = {'title', 'url'}
    remove_attributes = ['style', 'height', 'width']

    extra_css = '''
        .orange-tag, .article-meta-container { font-size:small; }
        .featured-image, .cms-block-image { text-align:center; font-size:small; }
    '''

    # Keep <header> plus the elements selected by classes() for the
    # 'featured-image' and 'article-body' class names.
    keep_only_tags = [
        dict(name='header'),
        classes('featured-image article-body')
    ]

    remove_tags = [classes('comments-entry-point-meta')]

    def parse_index(self):
        """Collect article links from the front page, grouped by URL section.

        Every ``<a>`` whose ``href`` starts with ``https://scroll.in/<sec>/``
        is treated as an article of that section. Query strings are removed
        from the URL, and links pointing at the bare section page are skipped.

        The same story is frequently linked more than once (for instance by
        an image anchor and a headline anchor), so links are collapsed per
        URL and the first non-empty title wins. Without this, the entry that
        survives later URL de-duplication could be the one with no title.

        Returns:
            A list of ``(section title, articles)`` tuples where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts. Sections
            without any article link are left out.
        """
        index = 'https://scroll.in/'
        sections = [
            'article', 'magazine'
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            prefix = index + sec + '/'

            # url -> title, in first-seen order (dicts preserve insertion order).
            titles = {}
            for a in soup.findAll('a', attrs={'href': lambda x: x and x.startswith(prefix)}):
                url = a['href'].split('?')[0]
                if url == prefix:
                    # Link to the section landing page, not to an article.
                    continue
                title = self.tag_to_string(a).strip()
                # Record a new URL, or fill in a title that was empty so far;
                # an already known non-empty title is never overwritten.
                if not titles.get(url):
                    titles[url] = title

            articles = []
            for url, title in titles.items():
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def populate_article_metadata(self, article, soup, first):
        """Use the text of the first ``<h2>`` in the page, if any, as the summary.

        Both ``article.summary`` and ``article.text_summary`` are set to the
        same string; the article is left untouched when no ``<h2>`` is found.
        """
        heading = soup.find('h2')
        if heading:
            article.summary = article.text_summary = self.tag_to_string(heading)