calibre/recipes/the_week.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>

from calibre.web.feeds.news import BasicNewsRecipe


def fix_title(title):
    """Convert a hyphen-separated title into sentence case.

    Every ``-`` becomes a space, then ``str.capitalize`` upper-cases the
    first character and lower-cases all the others.
    """
    words_joined = ' '.join(title.split('-'))
    return words_joined.capitalize()


class TheWeek(BasicNewsRecipe):
    """Recipe that builds an issue of The Week from its RSS feeds."""

    title = u'The Week'
    language = 'en_IN'
    __author__ = 'Kovid Goyal'
    encoding = 'utf-8'
    oldest_article = 8  # days
    max_articles_per_feed = 25
    no_stylesheets = True
    use_embedded_content = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'align', 'border', 'hspace']

    # (section title, feed URL) pairs.
    feeds = [
        ('Cover Story', 'https://www.theweek.in/theweek/cover.rss'),
        ('Sports', 'https://www.theweek.in/theweek/sports.rss'),
        ('Current', 'https://www.theweek.in/theweek/current.rss'),
        ('Statescan', 'https://www.theweek.in/theweek/statescan.rss'),
        ('Leisure', 'https://www.theweek.in/theweek/leisure.rss'),
        ('Business', 'https://www.theweek.in/theweek/business.rss'),
        ('Specials', 'https://www.theweek.in/theweek/specials.rss'),
        ('More', 'https://www.theweek.in/theweek/more.rss'),
        ('Society', 'https://www.theweek.in/leisure/society.rss'),
    ]

    def get_cover_url(self):
        """Return the cover image URL scraped from the Magzter listing page.

        The URL is the ``content`` attribute of the first ``<meta>`` tag
        whose ``content`` ends with ``view/3.jpg``. Returns ``None`` when
        no such tag is present.
        """
        soup = self.index_to_soup(
            'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
        )
        cover_meta = soup.find(
            'meta', content=lambda s: s and s.endswith('view/3.jpg')
        )
        if cover_meta is None:
            return None
        return cover_meta['content']

    def preprocess_html(self, soup):
        """Tidy up an article's markup and return the modified soup.

        * The first ``<a>`` element is turned into a ``<div>``.
        * The text of the first ``<h2>`` is passed through ``fix_title``.
        * Paragraphs containing only a non-breaking space are removed.
        """
        a = soup.find('a')
        if a is not None:
            a.name = 'div'
        h2 = soup.find('h2')
        if h2 is not None:
            heading_text = h2.string
            # ``.string`` is None when the heading has several children
            # (e.g. nested markup); calling fix_title on None would raise,
            # so such headings are left untouched.
            if heading_text is not None:
                h2.string = fix_title(heading_text)
        for p in soup.findAll('p'):
            # '\xa0' is a lone non-breaking space, i.e. an empty spacer.
            if p.string == '\xa0':
                p.decompose()
        return soup

    def populate_article_metadata(self, article, soup, first):
        """Normalise the article's title with ``fix_title``.

        ``soup`` and ``first`` are accepted but not used here.
        """
        article.title = fix_title(article.title)