#!/usr/bin/env python
# vim:fileencoding=utf-8
from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class NoArticles(Exception):
    pass


class Fokus(BasicNewsRecipe):
    title = 'Fokus'
    main_url = 'https://www.fokus.se'
    description = "The current week's edition of Swedish current-affairs magazine 'Fokus'"
    encoding = 'utf-8'
    __author__ = 'Henrik Holm (https://github.com/h-holm)'
    language = 'sv'
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'https://cdn.fokus.se/app/uploads/fokus/2022/05/12214931/fokus-logo.svg'
    no_stylesheets = True
    compress_news_images = True
    needs_subscription = 'optional'
    use_embedded_content = False
    remove_empty_feeds = True
    scale_news_images_to_device = True
    scale_news_images = (800, 600)
    delay = 3  # Avoid throttling by the server.

    # 1. Center and reduce the size of images and image captions.
    # 2. Make the lead text italic.
    # 3. Make the article metadata text gray and small.
    extra_css = '''
        img { display: block; margin: auto; width: 50%; height: auto; }
        div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
        p.Single__lead, p.Longread__lead { font-style: italic; color: #202020; }
        p.article-metadata { color: gray; font-size: small; }
    '''

    keep_only_tags = [
        dict(name='h1', class_='Single__title'),  # Title of "Single" type articles.
        dict(name='h1', class_='Longread__title'),  # Title of "Longread" type articles.
        dict(name='p', class_='Single__lead'),  # Lead text of "Single" type articles.
        dict(name='p', class_='Longread__lead'),  # Lead text of "Longread" type articles.
        dict(name='p', class_='article-metadata'),  # Dynamically created by the recipe.
        dict(name='figure', class_='Single__thumbnail'),  # Image of "Single" type articles.
        dict(name='figure', class_='Longread__thumbnail'),  # Image of "Longread" type articles.
        dict(name='div', class_='sesamy-protected-content'),  # Article body.
    ]
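
    # For orientation, `extract_cover_url()` below expects the cover <img> tag to carry a `srcset` attribute
    # roughly like the following. The URLs are made up for illustration; only the 'w' width descriptors (e.g.,
    # '578w' and '821w') are taken from the comments in this recipe:
    #
    #   srcset="https://cdn.fokus.se/.../cover-578.jpg 578w, https://cdn.fokus.se/.../cover-821.jpg 821w"
    #
    # The method splits this into (URL, width) pairs and picks the URL with the largest width.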
    def extract_cover_url(self, a_tag) -> None:
        '''Given the <a> tag of the current edition, extract the URL of the highest-resolution cover image.'''
        figure_tag = a_tag.find('figure')
        img_tag = figure_tag.find('img')
        try:
            # The `srcset` attribute contains a comma-separated list of URLs and their resolutions.
            cover_urls = img_tag['srcset'].split(', ')
            cover_urls = [src.split(' ') for src in cover_urls]
            # The second item of each tuple should be the resolution, e.g., '578w' or '821w'. Remove the 'w' suffix,
            # cast to an integer and sort in descending order.
            cover_urls = [(url, int(resolution[:-1])) for url, resolution in cover_urls]
            cover_urls = sorted(cover_urls, key=lambda x: x[1], reverse=True)
            # The first item of the sorted list is now the URL of the highest-resolution image.
            self.cover_url = cover_urls[0][0]
            self.log(f"Identified cover URL: '{self.cover_url}'")
        except (KeyError, ValueError) as exc:
            self.log.error(f'Failed to extract cover URL! Has the website format changed?\n{exc}')
        return

    def get_current_edition_url(self) -> str:
        '''Return the URL of the current (weekly) edition of Fokus.'''
        current_year = datetime.now().year
        try:
            soup = self.index_to_soup(f'{self.main_url}/vara-utgavor')
            # Identify all <a> tags of class 'Issue' that have an href attribute containing the current year.
            a_tags = soup.find_all('a', class_='Issue', href=True)
            # Keep only the href, and subset to only those links that contain the current year.
            edition_links = [a_tag['href'] for a_tag in a_tags if str(current_year) in a_tag['href']]
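
            # Illustrative preview of the reduction performed below (the URL shape is an assumption inferred from
            # the comments that follow, not verified against the live site): an href whose final path segment is
            # '2025-21' is first keyed on '2025-21', then shortened to '21', cast to the integer 21, and finally
            # sorted so that the highest (most recent) week number comes first.
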
            # In order to successfully sort the links chronologically, first convert the data structure to a dict,
            # wherein the key consists of only the date part of the URL and the value consists of the entire
            # (unmodified) URL.
            edition_links = {link.removesuffix('/').split('/')[-1]: link for link in edition_links}
            # Then, shorten the key further by keeping only the part after the first hyphen. This removes the year
            # and typically results in only the calendar week number remaining, e.g., '1', '21' or '52'. Note however
            # that editions can sometimes cover multiple weeks, e.g., '1-2', '01-03' or '50-51-52'. In order to sort
            # correctly, it is therefore necessary to additionally keep only the first part of the week number(s)
            # after the hyphen.
            edition_links = {key.split('-', 1)[-1].split('-', 1)[0]: value for key, value in edition_links.items()}
            # Now, convert the resulting keys to integers.
            edition_links = {int(key): value for key, value in edition_links.items()}
            # Finally, sort in descending order, so that the most recent edition is first.
            edition_links = dict(sorted(edition_links.items(), reverse=True))

            current_edition_url = edition_links[list(edition_links.keys())[0]]
            self.log(f"Identified {len(edition_links)} editions, of which the most recent is '{current_edition_url}'.")

            # Now that we know the URL of the current edition, we can use it to identify the cover image. The cover
            # image URL exists in the `srcset` attribute of the <img> child tag of the <figure> child tag of the <a>
            # tag of the current edition.
            current_edition_a_tag = soup.find('a', class_='Issue', href=current_edition_url)
            self.extract_cover_url(current_edition_a_tag)
        except Exception as exc:
            self.log.error(f'Failed to identify the current edition URL: {exc}')
            raise NoArticles(
                f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing "
                'issues, in which case you should try again later, or the website format has changed and the recipe '
                'needs updating.'
            ) from exc
        return current_edition_url
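
    # For reference, `parse_article_blurb()` below assumes a blurb markup shape roughly like the following. The
    # nesting and attribute values are illustrative assumptions inferred from the `find()` calls in this recipe,
    # not copied from the live site:
    #
    #   <article class="Blurb">
    #     <a href="/nagon-kategori/nagon-artikel/">
    #       <h2 class="Blurb__title">...</h2>
    #       <time class="Blurb__date">23 MAJ 2025</time>
    #       <ul><li class="Blurb__category">...</li></ul>
    #       <div class="Blurb__summary">...</div>
    #     </a>
    #   </article>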
    def parse_article_blurb(self, article_blurb) -> dict[str, str] | None:
        '''Given an <article> tag of class 'Blurb', parse it into a dict.

        Args:
            article_blurb (Tag): An <article> tag hosting metadata and the URL of an article.

        Returns:
            A dict on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format.
        '''
        if a_tag := article_blurb.find('a', href=True):
            url = a_tag['href'].strip().rstrip('/')
            if url.startswith('/'):
                url = f'{self.main_url}{url}'
            if title_tag := a_tag.find('h2', {'class': 'Blurb__title'}):
                title = self.tag_to_string(title_tag).strip()
                if date_tag := a_tag.find('time', {'class': 'Blurb__date'}):
                    # Results in a Swedish date format, e.g., '23 MAJ 2025'.
                    date = self.tag_to_string(date_tag).strip()
                    # Add a newline before the date to make it more readable.
                    date = f'\n{date}'

                    # Assign the article to its first listed category, as inferred from the first <li> tag of class
                    # 'Blurb__category'. Default to 'Fokus' if no such tag is found.
                    category = 'Fokus'
                    if category_tag := a_tag.find('li', {'class': 'Blurb__category'}):
                        category = self.tag_to_string(category_tag).strip()

                    desc = ''
                    if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}):
                        desc = self.tag_to_string(desc_tag).strip()

                    return {'url': url, 'title': title, 'date': date, 'category': category, 'description': desc}
        return
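
    # A successfully parsed blurb yields a dict shaped like the following (values are illustrative placeholders,
    # not real article data):
    #
    #   {'url': 'https://www.fokus.se/...', 'title': '...', 'date': '\n23 MAJ 2025', 'category': 'Fokus',
    #    'description': '...'}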

    def get_article_blurbs(self, soup) -> dict[str, dict[str, str]]:
        '''Given a Fokus webpage `soup`, return a dict of unique article entries found on the page.

        The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
        on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format.

        Args:
            soup (BeautifulSoup): The `bs4.BeautifulSoup` soup of a Fokus webpage.

        Returns:
            dict[str, dict[str, str]]: A dict with article URLs as keys and 'article dicts' as values.
        '''

        def _log(article) -> None:
            '''Log a digestible summary of the input `article` blurb.'''
            log_message = f"\t{article['title']} : {article['url']}"
            if article.get('description'):
                log_message += f" : {article['description']}"
            self.log.debug(log_message)

        # Identify all <article> tags of class 'Blurb' that have an href attribute.
        self.log('Identifying all articles...')
        try:
            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            article_blurbs = []

        if not article_blurbs:
            self.log.error('Failed to identify any article blurbs.')
            return {}

        parsed_blurbs = {}
        for article_blurb in article_blurbs:
            if article := self.parse_article_blurb(article_blurb):
                _log(article)
                parsed_blurbs[article['url']] = article
        return parsed_blurbs
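
    # Note: downstream, the 'category' of each article dict becomes the section (feed) title, while the remaining
    # keys ('title', 'url', 'date', 'description') follow the article-dict format that Calibre expects
    # `BasicNewsRecipe.parse_index()` to return.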

    def convert_to_section_lists(self, articles: dict[str, dict[str, str]]) -> dict[str, list[dict[str, str]]]:
        '''Convert the `articles` dict of dicts to a dict of lists; each list holds the articles of a given section.

        Args:
            articles (dict[str, dict[str, str]]): A dict of article URLs and article dicts.

        Returns:
            dict[str, list[dict[str, str]]]: A dict on a `{section_title: list[article_dict]}` format.
        '''
        self.log(f'Assigning each of the {len(articles)} articles to a section...')
        section_to_articles = {}
        for article_url, article_dict in articles.items():
            section_title = article_dict['category']
            if section_title not in section_to_articles:
                section_to_articles[section_title] = []
            # Remove the 'category' key from the article dict, as it is not needed in the final output.
            article_dict.pop('category')
            section_to_articles[section_title].append(article_dict)
        return section_to_articles

    def parse_index(self):
        current_edition_url = self.get_current_edition_url()
        if not current_edition_url:
            raise NoArticles(
                f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing "
                'issues, in which case you should try again later, or the website format has changed and the recipe '
                'needs updating.'
            )
        self.log(f'Current edition URL: {current_edition_url}')

        # Fetch and parse the web page of the current edition.
        edition_soup = self.index_to_soup(current_edition_url)

        # Identify all unique articles on the edition page.
        articles = self.get_article_blurbs(edition_soup)
        if not articles:
            raise NoArticles(
                f"Could not find any articles. Either the '{self.main_url}' server is experiencing issues, in which "
                'case you should try again later, or the website format has changed and the recipe needs updating.'
            )

        # Assign each identified article to a section based on its category.
        section_to_articles = self.convert_to_section_lists(articles)

        # Convert to a list of (section title, article list) tuples, as expected of `parse_index()`.
        return list(section_to_articles.items())