#!/usr/bin/env python
# vim:fileencoding=utf-8
"""calibre recipe for the Swedish current-affairs magazine Fokus (fokus.se).

The recipe walks a fixed list of section pages, collects the article blurbs
found on each one and keeps those that are at most ``Fokus.max_age`` days old.
"""
from datetime import datetime, timezone
from calibre.web.feeds.news import BasicNewsRecipe


# (feed title shown in the e-book, URL slug appended to ``Fokus.main_url``).
WEB_SECTIONS = [
    ('Inrikes', 'inrikes'),
    ('Utrikes', 'utrikes'),
    ('Aktuellt', 'aktuellt'),
    ('Politik', 'politik'),
    ('Ekonomi', 'ekonomi'),
    ('Kultur', 'kultur'),
    ('Analys', 'analys'),
    ('Vetenskap', 'vetenskap'),
    ('Krönikor', 'kronika'),
    ('Opinion', 'opinion'),
    ('Veckans Fokus', 'veckans-fokus'),
    ('Synvinkel', 'synvinkel'),
    ('Minnesord', 'minnesord'),
    ('Debatt', 'debatt'),
    ('Andra kammaren', 'andra-kammaren'),
    ('Skuggkabinettet', 'skuggkabinettet'),
    ('Intervju', 'intervju'),
    ('Mötet', 'motet'),
    ('Veckans bråk', 'veckans-brak'),
    ('Johans Blogg', 'johans-blogg'),
]


class NoArticles(Exception):
    """Raised by ``Fokus.parse_index`` when no section yielded any article."""


class Fokus(BasicNewsRecipe):
    """Recipe that builds one feed per entry in ``WEB_SECTIONS``."""

    title = 'Fokus'
    main_url = 'https://www.fokus.se'
    description = "The last 7 days of news and articles from the Swedish current-affairs magazine 'Fokus'"
    encoding = 'utf-8'
    __author__ = 'Henrik Holm (https://github.com/h-holm)'
    # ISO 639-1 code for Swedish ('se' is Northern Sami).
    language = 'sv'
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'https://cdn.fokus.se/app/uploads/fokus/2022/05/12214931/fokus-logo.svg'
    no_stylesheets = True
    compress_news_images = True
    # Credentials are optional; get_browser() only logs in when both are set.
    needs_subscription = 'optional'
    max_age = 7  # days
    remove_empty_feeds = True
    extra_css = 'img { display: block; width: 100%; height: auto }'

    remove_tags = [
        dict(name='div', attrs={'class': 'External-ad'}),
        dict(name='header', attrs={'class': 'Header'}),
        dict(name='div', attrs={'class': 'Header-expanded'}),
        dict(name='div', attrs={'class': 'Overlay'}),
        dict(name='div', attrs={'class': 'Search-expanded'}),
        dict(name='section', attrs={'class': 'Site__footer'}),
        dict(name='div', attrs={'class': 'Toaster'}),
        dict(name='div', attrs={'class': 'fbc-badge'}),
        dict(name='div', attrs={'class': 'Posts-by-related-cat'}),
        dict(name='div', attrs={'class': 'finite-scroll'}),
        dict(name='div', attrs={'class': 'Sidebar'}),
        dict(name='div', attrs={'id': 'single-comments'}),
        dict(name='footer', attrs={'class': 'Single__footer'}),
        dict(name='div', attrs={'class': 'Social-share'}),
        dict(name='div', attrs={'class': 'mediaconnect-paywall'}),
        dict(name='svg', attrs={'class': 'icon'}),
        dict(name='figure', attrs={'class': 'wp-block-audio'}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': 'Single__content'}),
    ]

    keep_only_tags = [
        dict(name='h1', attrs={'class': 'Single__title'}),  # Title.
        dict(name='h1', attrs={'class': 'Longread__title'}),  # Alt. title.
        dict(name='p', attrs={'class': 'Single__lead'}),  # Lead text.
        dict(name='p', attrs={'class': 'Longread__lead'}),  # Alt. lead text.
        dict(name='figure', attrs={'class': 'Single__thumbnail'}),  # Image.
        dict(name='figure', attrs={'class': 'Longread__thumbnail'}),  # Alt. image.
        # dict(name='p', attrs={'class': 'Meta__author'}),  # Author.
        # dict(name='time', attrs={'class': 'Meta__updated'}),  # Last updated.
        # Main article.
        dict(name='div', attrs={'class': 'mediaconnect-protected-content'}),
    ]

    def get_browser(self, *args, **kwargs):
        """Return the browser used for downloads, logged in when possible.

        Positional and keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``. The login form is only submitted
        when both ``self.username`` and ``self.password`` are set.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        if self.username and self.password:
            # NOTE(review): form and field names are kept as submitted with
            # the recipe; confirm them against the live fokus.se login page.
            br.open('https://www.fokus.se/auth/logga-in')
            br.select_form(name='loginForm')
            br['j_username'] = self.username
            br['j_password'] = self.password
            br.submit()
        return br

    def parse_article_blurb(self, article_blurb):
        """Turn one blurb from a section page into an article dict.

        :param article_blurb: soup tag of an ``<article class="Blurb">``.
        :return: dict with the keys ``title``, ``url``, ``description`` and
            ``date``, or ``None`` when the blurb has no link, title or usable
            date, or when it is older than ``self.max_age`` days.
        """
        a_tag = article_blurb.find('a', href=True)
        if a_tag is None:
            return None

        url = a_tag['href']
        if url.startswith('/'):
            url = f'{self.main_url}{url}'

        title_tag = a_tag.find('h2', {'class': 'Blurb__title'})
        time_tag = a_tag.find('time', {'class': 'Blurb__date'})
        # Both are required: the title is reported to calibre and the date
        # drives the age filter.
        if title_tag is None or time_tag is None:
            return None

        title = self.tag_to_string(title_tag)
        swedish_date_str = self.tag_to_string(time_tag)
        try:
            published = datetime.strptime(
                time_tag['datetime'], '%Y-%m-%dT%H:%M:%S%z')
        except (KeyError, ValueError):
            # One malformed blurb must not abort the whole download.
            self.log.warning(
                f"\tSkipping article '{title}' as its date could not be parsed")
            return None

        age = datetime.now(timezone.utc) - published
        if age.days > self.max_age:
            self.log.debug(
                f"\tSkipping article '{title}' as it is too old")
            return None

        desc = ''
        desc_tag = a_tag.find('div', {'class': 'Blurb__summary'})
        if desc_tag is not None:
            desc = self.tag_to_string(desc_tag)
        meta_tag = a_tag.find('p', {'class': 'Blurb__meta'})
        if meta_tag is not None:
            meta = f'({self.tag_to_string(meta_tag)})'
            # Avoid a leading space when there is no summary text.
            desc = f'{desc} {meta}' if desc else meta
        return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}

    def parse_web_section(self, soup, slug):
        """Yield the article dicts found on one section page.

        This is a generator, so the ``ValueError`` below is raised when the
        caller starts iterating, not when the method is called.

        :param soup: parsed section page.
        :param slug: URL slug of the section, used in the error message.
        :raises ValueError: if the page contains no ``<article class="Blurb">``.
        """
        def log(article):
            log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
            if article.get('description'):
                log_message += f" : {article['description']}"
            self.log(log_message)

        try:
            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            article_blurbs = []
        if not article_blurbs:
            raise ValueError(f'Failed to find article blurbs for slug: {slug}')
        for article_blurb in article_blurbs:
            article = self.parse_article_blurb(article_blurb)
            if article:
                log(article)
                yield article

    def parse_index(self):
        """Build the list of ``(section title, articles)`` feeds.

        Sections that fail to download or contain no blurbs are logged and
        skipped so that a single bad section does not abort the others.

        :raises NoArticles: if no section produced any article.
        """
        feeds = []
        for section_title, slug in WEB_SECTIONS:
            url = f'{self.main_url}/{slug}'
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error(f'Failed to download section: {url}')
                continue
            self.log(f'Found section: {section_title}')
            try:
                # list() drives the generator, so this is where its
                # ValueError for a blurb-less page surfaces.
                articles = list(self.parse_web_section(soup, slug))
            except ValueError as err:
                self.log.warning(f'Skipping section {section_title}: {err}')
                continue
            if articles:
                feeds.append((section_title, articles))
        if not feeds:
            raise NoArticles(
                'Could not find any articles. Either the fokus.se server is having issues and '
                'you should try later or the website format has changed and the recipe needs '
                'to be updated.'
            )
        return feeds