# calibre/recipes/the_athletic.recipe

from calibre.web.feeds.news import BasicNewsRecipe


class Athletic(BasicNewsRecipe):
    """calibre news recipe for The Athletic.

    Articles are discovered through the site's RSS feeds and downloaded from
    their ``?amp=1`` variant (see ``print_version``); the tag selectors below
    target the AMP markup (``amp-img``, ``i-amphtml-*``) of those pages.
    """

    title = u'The Athletic'
    __author__ = 'unkn0wn'
    description = 'The Athletic delivers powerful stories and smart analysis that bring sports fans closer to the heart of the game. From breaking news and live commentary, to deeply-reported long reads and exclusive interviews, subscribers rely on The Athletic for every sports story that matters.'  # noqa
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/The_Athletic_wordmark_black_2020.svg/640px-The_Athletic_wordmark_black_2020.svg.png'  # noqa
    language = 'en'
    oldest_article = 1.15  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
    use_embedded_content = False
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    # The same story can show up in several feeds (the last feed below is the
    # all-articles one), so de-duplicate by URL.
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = True
    extra_css = '''
        #articleByLineString{font-size:small;}
        .inline-credits{font-size:small; text-align:center;}
    '''
    # Keep only the AMP fill-layout image and the article containers.
    keep_only_tags = [
        dict(name='amp-img', attrs={'class': 'i-amphtml-layout-fill'}),
        dict(name='div', attrs={'class': ['the-lead-article', 'article-container']})
    ]
    remove_tags = [
        dict(name='i-amphtml-sizer')
    ]

    feeds = [
        ('The Athletic Ink', 'https://theathletic.com/ink/?rss'),
        ('Football', 'https://theathletic.com/football/?rss'),
        ('Boxing', 'https://theathletic.com/boxing/?rss'),
        ('MMA', 'https://theathletic.com/mma/?rss'),
        ('Motorsports', 'https://theathletic.com/motorsports/?rss'),
        ('NBA', 'https://theathletic.com/nba/?rss'),
        ('NHL', 'https://theathletic.com/nhl/?rss'),
        ('Olympics', 'https://theathletic.com/olympics/?rss'),
        ('Culture', 'https://theathletic.com/culture/?rss'),
        ('Others', 'https://theathletic.com/rss-feed/'),  # All Articles
        # To add another section, append '/?rss' to its URL; there are too
        # many sections to list them all here.
    ]

    def preprocess_html(self, soup):
        """Rename bare ``<amp-img>`` elements to ``<img>``.

        An ``<amp-img>`` that already contains a nested ``<img>`` is left
        untouched; only those without one are renamed in place.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        for img in soup.findAll('amp-img'):
            if not img.find('img'):
                img.name = 'img'
        return soup

    def print_version(self, url):
        """Return the ``?amp=1`` variant of an article URL.

        Any existing query string and fragment are discarded before the
        ``amp=1`` parameter is appended.

        :param url: article URL.
        :return: URL of the form ``<scheme://host/path>?amp=1``.
        """
        # Strip the fragment first: in a URL the '#...' part follows the
        # query, so for a link such as '.../story#comments' splitting on '?'
        # alone would yield '.../story#comments?amp=1', where 'amp=1' is part
        # of the fragment and is never sent to the server.
        without_fragment = url.partition('#')[0]
        base = without_fragment.partition('?')[0]
        return base + '?amp=1'