#!/usr/bin/env python
# vim:fileencoding=utf-8
"""Calibre news recipe for the Swedish current-affairs magazine 'Fokus'."""
from datetime import datetime, timezone

from calibre.web.feeds.news import BasicNewsRecipe

# (feed title, URL slug) pairs; each slug is appended to ``Fokus.main_url``
# to form the address of one section index page.
WEB_SECTIONS = [
    ('Inrikes', 'inrikes'),
    ('Utrikes', 'utrikes'),
    ('Aktuellt', 'aktuellt'),
    ('Politik', 'politik'),
    ('Ekonomi', 'ekonomi'),
    ('Kultur', 'kultur'),
    ('Analys', 'analys'),
    ('Vetenskap', 'vetenskap'),
    ('Krönikor', 'kronika'),
    ('Opinion', 'opinion'),
    ('Veckans Fokus', 'veckans-fokus'),
    ('Synvinkel', 'synvinkel'),
    ('Minnesord', 'minnesord'),
    ('Debatt', 'debatt'),
    ('Andra kammaren', 'andra-kammaren'),
    ('Skuggkabinettet', 'skuggkabinettet'),
    ('Intervju', 'intervju'),
    ('Mötet', 'motet'),
    ('Veckans bråk', 'veckans-brak'),
    ('Johans Blogg', 'johans-blogg'),
]


class NoArticles(Exception):
    """Raised by ``Fokus.parse_index`` when no section yields any article."""


class Fokus(BasicNewsRecipe):
    """Recipe that builds one feed per entry in ``WEB_SECTIONS``.

    Each section index page is scanned for ``<article class="Blurb">``
    elements; blurbs older than ``max_age`` days are skipped.
    """

    title = 'Fokus'
    main_url = 'https://www.fokus.se'
    description = "The last 7 days of news and articles from the Swedish current-affairs magazine 'Fokus'"
    encoding = 'utf-8'
    __author__ = 'Henrik Holm (https://github.com/h-holm)'
    # ISO 639-1 code for Swedish. ('se' is Northern Sami, not Swedish.)
    language = 'sv'
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'https://cdn.fokus.se/app/uploads/fokus/2022/05/12214931/fokus-logo.svg'
    no_stylesheets = True
    compress_news_images = True
    needs_subscription = 'optional'
    max_age = 7  # days
    remove_empty_feeds = True
    extra_css = 'img { display: block; width: 100%; height: auto }'

    remove_tags = [
        dict(name='div', attrs={'class': 'External-ad'}),
        dict(name='header', attrs={'class': 'Header'}),
        dict(name='div', attrs={'class': 'Header-expanded'}),
        dict(name='div', attrs={'class': 'Overlay'}),
        dict(name='div', attrs={'class': 'Search-expanded'}),
        dict(name='section', attrs={'class': 'Site__footer'}),
        dict(name='div', attrs={'class': 'Toaster'}),
        dict(name='div', attrs={'class': 'fbc-badge'}),
        dict(name='div', attrs={'class': 'Posts-by-related-cat'}),
        dict(name='div', attrs={'class': 'finite-scroll'}),
        dict(name='div', attrs={'class': 'Sidebar'}),
        dict(name='div', attrs={'id': 'single-comments'}),
        dict(name='footer', attrs={'class': 'Single__footer'}),
        dict(name='div', attrs={'class': 'Social-share'}),
        dict(name='div', attrs={'class': 'mediaconnect-paywall'}),
        dict(name='svg', attrs={'class': 'icon'}),
        dict(name='figure', attrs={'class': 'wp-block-audio'}),
    ]

    remove_tags_after = [
        dict(name='div', class_='Single__content'),
    ]

    keep_only_tags = [
        dict(name='h1', class_='Single__title'),  # Title.
        dict(name='h1', class_='Longread__title'),  # Alt. title.
        dict(name='p', class_='Single__lead'),  # Lead text.
        dict(name='p', class_='Longread__lead'),  # Alt. lead text.
        dict(name='figure', class_='Single__thumbnail'),  # Image.
        dict(name='figure', class_='Longread__thumbnail'),  # Alt. image.
        # dict(name='p', class_='Meta__author'),  # Author.
        # dict(name='time', class_='Meta__updated'),  # Last updated.
        # Main article.
        dict(name='div', class_='mediaconnect-protected-content'),
    ]

    def get_browser(self):
        """Return the recipe browser, submitting the login form first if
        both a username and a password have been configured."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('https://www.fokus.se/auth/logga-in')
            br.select_form(name='loginForm')
            br['j_username'] = self.username
            br['j_password'] = self.password
            br.submit()
        return br

    def parse_article_blurb(self, article_blurb):
        """Extract article metadata from one blurb element.

        Args:
            article_blurb: A parsed ``<article class="Blurb">`` tag.

        Returns:
            A dict with the keys ``title``, ``url``, ``description`` and
            ``date``, or ``None`` when the blurb lacks a link, title or
            timestamp, when its timestamp cannot be parsed, or when the
            article is more than ``max_age`` days old.
        """
        a_tag = article_blurb.find('a', href=True)
        if a_tag is None:
            return None

        url = a_tag['href']
        if url.startswith('/'):
            # Site-relative link: prefix it with the site root.
            url = f'{self.main_url}{url}'

        title_tag = a_tag.find('h2', {'class': 'Blurb__title'})
        if title_tag is None:
            return None
        title = self.tag_to_string(title_tag)

        time_tag = a_tag.find('time', {'class': 'Blurb__date'})
        if time_tag is None:
            return None
        # The tag's visible text is kept as the article's 'date' value; the
        # 'datetime' attribute is only used for the age check below.
        swedish_date_str = self.tag_to_string(time_tag)

        # A missing or malformed timestamp must only cost us this one blurb,
        # not abort the whole download with a KeyError/ValueError.
        datetime_str = time_tag.get('datetime')
        try:
            published = datetime.strptime(datetime_str or '', '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            self.log.warning(
                f"\tSkipping article '{title}' as its timestamp could not be parsed: {datetime_str!r}")
            return None

        age = datetime.now(timezone.utc) - published
        if age.days > self.max_age:
            self.log.debug(f"\tSkipping article '{title}' as it is too old")
            return None

        desc = ''
        desc_tag = a_tag.find('div', {'class': 'Blurb__summary'})
        if desc_tag is not None:
            desc = self.tag_to_string(desc_tag)
        in_cooperation_with_tag = a_tag.find('p', {'class': 'Blurb__meta'})
        if in_cooperation_with_tag is not None:
            desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'

        return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}

    def _log_article(self, article):
        """Log a one-line summary of a parsed article dict."""
        log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
        if article.get('description'):
            log_message += f" : {article['description']}"
        self.log(log_message)

    def parse_web_section(self, soup, slug):
        """Yield an article dict for every usable blurb on a section page.

        Args:
            soup: Parsed section index page.
            slug: URL slug of the section; only used in the error message.

        Yields:
            Dicts as returned by ``parse_article_blurb``.

        Raises:
            ValueError: If the page contains no ``<article class="Blurb">``
                element at all. Because this is a generator, the error is
                raised when iteration starts, not when the method is called.
        """
        try:
            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            # ``soup`` has no ``find_all`` (e.g. it is None); treat as empty.
            article_blurbs = []

        if not article_blurbs:
            raise ValueError(f'Failed to find article blurbs for slug: {slug}')

        for article_blurb in article_blurbs:
            article = self.parse_article_blurb(article_blurb)
            if article:
                self._log_article(article)
                yield article

    def parse_index(self):
        """Build the list of ``(section title, articles)`` feeds.

        A section that cannot be downloaded, or whose page contains no
        blurbs, is logged and skipped so that one broken section does not
        abort the whole fetch.

        Returns:
            A list of ``(section_title, [article dict, ...])`` tuples, one
            per section that produced at least one article.

        Raises:
            NoArticles: If no section produced any article.
        """
        feeds = []
        for section_title, slug in WEB_SECTIONS:
            url = f'{self.main_url}/{slug}'
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error(f'Failed to download section: {url}')
                continue
            self.log(f'Found section: {section_title}')

            try:
                articles = list(self.parse_web_section(soup, slug))
            except ValueError as err:
                # Raised when the section page holds no blurbs; handle it
                # like a failed download instead of crashing the recipe.
                self.log.error(str(err))
                continue

            if articles:
                feeds.append((section_title, articles))

        if not feeds:
            raise NoArticles(
                'Could not find any articles. Either the fokus.se server is having issues and '
                'you should try later or the website format has changed and the recipe needs '
                'to be updated.'
            )
        return feeds