calibre/recipes/the_saturday_paper.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2021, Alistair Francis <alistair@alistair23.me>

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    """Build a BeautifulSoup ``attrs`` matcher for a set of CSS class names.

    *classes* is a space-separated string of class names. The returned dict
    has a single ``attrs`` key whose ``class`` entry is a predicate: given a
    tag's class attribute string it yields the (truthy) set of names shared
    with *classes*, an empty frozenset when none are shared, or the falsy
    input itself when the attribute is missing/empty.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Mirror ``value and ...``: a falsy attribute is returned unchanged.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})


class SaturdayPaper(BasicNewsRecipe):
    """calibre recipe that builds an e-book from The Saturday Paper's sections.

    The article list is scraped from each section landing page by
    ``parse_index``; the cover image is taken from the editions page by
    ``get_cover_url``.
    """

    title = 'The Saturday Paper'
    __author__ = 'Alistair Francis'
    description = (
        'The Saturday Paper is a weekly newspaper, dedicated to narrative journalism.'
        ' It offers the biggest names and best writing in news, culture, and analysis, with a particular focus on Australia. ')
    language = 'en_AU'
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'

    # Root of the site; used to build section URLs and to absolutise links.
    BASE_URL = 'https://www.thesaturdaypaper.com.au'

    # Elements of an article page that carry the actual content.
    keep_only_tags = [
        classes('article-page__content article__text article__image article-page__title article-page__image')
    ]
    # Sharing widgets, sidebars and footers that are stripped from articles.
    remove_tags = [
        classes('social-icons-article-top-container social-icons-article-bottom-container'
                ' article-page__sidebar article-page__social__icons share-wrapper article-footer-container')
    ]
    remove_tags_after = [
        {'name': 'div', 'class': 'end-matter'},
    ]

    def _absolute_url(self, url):
        """Return *url* as an absolute URL on The Saturday Paper's site.

        Protocol-relative URLs (``//host/path``) only get a scheme added;
        site-relative URLs (``/path``) get ``BASE_URL`` prepended; anything
        else is returned unchanged.
        """
        if url.startswith('//'):
            return 'https:' + url
        if url.startswith('/'):
            return self.BASE_URL + url
        return url

    def get_cover_url(self):
        """Return the URL of the cover image shown on the editions page.

        Returns ``None`` (after logging a warning) when the expected
        ``div.article__image > img[src]`` markup is not present, instead of
        raising on a missing element.
        """
        soup = self.index_to_soup(self.BASE_URL + '/editions/')
        div = soup.find('div', attrs={'class': 'article__image'})
        img = div.find('img') if div is not None else None
        src = img.get('src') if img is not None else None
        if not src:
            self.log.warning('Could not find a cover image on the editions page')
            return None
        return self._absolute_url(src)

    def parse_index(self):
        """Scrape every section page and return the articles found.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with ``title``, ``url`` and ``description`` keys.
        Sections in which no article could be found are omitted.
        """
        sections = [
            ('News', '/news'),
            ('Opinion', '/opinion'),
            ('Culture', '/culture'),
            ('Life', '/life'),
            ('Food', '/food/latest'),
            ('Puzzles', '/puzzles'),
            ('Sport', '/sport'),
        ]

        feeds = []

        for section, path in sections:
            soup = self.index_to_soup(self.BASE_URL + path)
            news = []

            for item in soup.findAll(**classes('article')):
                # An element without a usable title link is not an article
                # teaser we can follow, so skip it before doing other work.
                anchor = item.find(class_='article__title_link')
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    continue
                url = self._absolute_url(href)

                title = self.tag_to_string(item.find(class_='article__title'))
                desc = self.tag_to_string(item.find(class_='article__text'))

                self.log(title, ' at ', url)

                news.append({'title': title, 'url': url, 'description': desc})

            # Empty sections are dropped here because calibre does not filter
            # them out for recipes that override parse_index.
            if news:
                feeds.append((section, news))

        return feeds