calibre/recipes/the_saturday_paper.recipe
2025-01-24 11:14:24 +01:00

81 lines
2.8 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2021, Alistair Francis <alistair@alistair23.me>
from __future__ import absolute_import, division, print_function, unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe
def classes(classes):
    """Build a tag matcher that selects elements by CSS class name.

    ``classes`` is a string of class names separated by single spaces. The
    result is a ``{'attrs': {'class': predicate}}`` dict, which this file
    uses both as an entry in the recipe's tag lists and as keyword
    arguments to ``soup.findAll``.

    The predicate receives an element's ``class`` attribute value. A falsy
    value (``None`` or ``''``) is returned unchanged; otherwise the result
    is the frozenset of requested names present in the value, which is
    truthy only when at least one name matches.
    """
    wanted = frozenset(classes.split(' '))

    def matches_any(value):
        # Elements without a usable class attribute never match; hand the
        # falsy value straight back.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches_any}}
class SaturdayPaper(BasicNewsRecipe):
    """Calibre news recipe for The Saturday Paper.

    The article list is scraped from the site's section pages by
    ``parse_index`` and the cover image is looked up on the editions page
    by ``get_cover_url``.
    """

    title = 'The Saturday Paper'
    __author__ = 'Alistair Francis'
    description = (
        'The Saturday Paper is a weekly newspaper, dedicated to narrative journalism.'
        ' It offers the biggest names and best writing in news, culture, and analysis, with a particular focus on Australia. ')
    language = 'en_AU'
    # Discard the site's own stylesheets and scripts from downloaded pages.
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'

    # Only the article body, headline and images are kept from each page.
    keep_only_tags = [
        classes('article-page__content article__text article__image article-page__title article-page__image')
    ]
    # Sharing widgets, sidebar and footer are stripped from the kept content.
    remove_tags = [
        classes('social-icons-article-top-container social-icons-article-bottom-container'
                ' article-page__sidebar article-page__social__icons share-wrapper article-footer-container')
    ]
    # Nothing after the end-matter block is kept.
    remove_tags_after = [
        {'name': 'div', 'class': 'end-matter'},
    ]

    @staticmethod
    def _site_url(url):
        """Return ``url`` with the site root prepended if it starts with '/'."""
        if url.startswith('/'):
            return 'https://www.thesaturdaypaper.com.au' + url
        return url

    def get_cover_url(self):
        """Return the cover image URL found on the editions page.

        The URL is the ``src`` of the ``img`` inside the first
        ``div.article__image`` element. ``None`` is returned (after logging
        a message) when that markup is missing, instead of raising while
        dereferencing a missing tag or attribute.
        """
        soup = self.index_to_soup('https://www.thesaturdaypaper.com.au/editions/')
        div = soup.find('div', attrs={'class': 'article__image'})
        img = div.img if div is not None else None
        src = img.get('src') if img is not None else None
        if not src:
            self.log('No cover image found on the editions page')
            return None
        return self._site_url(src)

    def parse_index(self):
        """Scrape each section page and return the recipe's article index.

        Returns a list of ``(section title, articles)`` tuples in the order
        the sections are declared below. Each article is a dict with the
        keys ``title``, ``url`` and ``description``. Teasers that have no
        title link, or whose link has no ``href``, are skipped.
        """
        feeds = [
            ('News', 'https://www.thesaturdaypaper.com.au/news'),
            ('Opinion', 'https://www.thesaturdaypaper.com.au/opinion'),
            ('Culture', 'https://www.thesaturdaypaper.com.au/culture'),
            ('Life', 'https://www.thesaturdaypaper.com.au/life'),
            ('Food', 'https://www.thesaturdaypaper.com.au/food/latest'),
            ('Puzzles', 'https://www.thesaturdaypaper.com.au/puzzles'),
            ('Sport', 'https://www.thesaturdaypaper.com.au/sport'),
        ]

        articles = []
        for feed, link in feeds:
            soup = self.index_to_soup(link)
            news = []
            for teaser in soup.findAll(**classes('article')):
                anchor = teaser.find(class_='article__title_link')
                # .get() avoids a KeyError on an anchor without href; a
                # teaser that cannot be linked to is of no use in the index.
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    continue
                url = self._site_url(href)
                title = self.tag_to_string(teaser.find(class_='article__title'))
                desc = self.tag_to_string(teaser.find(class_='article__text'))
                self.log(title, ' at ', url)
                news.append({'title': title, 'url': url, 'description': desc})
            articles.append((feed, news))
        return articles