From 84772e8b14d7411b803d86bf69a49fabbf6c8116 Mon Sep 17 00:00:00 2001 From: Henrik Holm Date: Fri, 16 May 2025 18:33:47 +0200 Subject: [PATCH 1/3] Update to follow updated structure of Fokus website --- recipes/fokus.recipe | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe index 24f288c911..5df5e35a70 100644 --- a/recipes/fokus.recipe +++ b/recipes/fokus.recipe @@ -131,27 +131,14 @@ class Fokus(BasicNewsRecipe): if url.startswith('/'): url = f'{self.main_url}{url}' - if title_tag := a_tag.find('h2', {'class': 'Blurb__title'}): + if title_tag := a_tag.find('h2', {'class': 'PostBlurb__title'}): title = self.tag_to_string(title_tag).strip() - if time_tag := a_tag.find('time', {'class': 'Blurb__date'}): - swedish_date_str = self.tag_to_string(time_tag).rstrip() + desc = '' + if desc_tag := a_tag.find('div', {'class': 'PostBlurb__excerpt'}): + desc = self.tag_to_string(desc_tag).strip() - # Skip articles older than `self.oldest_article`. - datetime_str = time_tag['datetime'] - datetime_time = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S%z') - now = datetime.now(timezone.utc) - delta = now - datetime_time - if delta.days > self.oldest_article: - self.log.debug(f"\tSkipping article as it is too old: '{title}'") - return + return {'url': url, 'title': title, 'description': desc} - desc = '' - if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}): - desc = self.tag_to_string(desc_tag).strip() - if in_cooperation_with_tag := a_tag.find('p', {'class': 'Blurb__meta'}): - desc += f' ({self.tag_to_string(in_cooperation_with_tag).strip()})' - - return {'url': url, 'title': title, 'description': desc, 'date': swedish_date_str} return def _get_article_blurbs(self, soup) -> dict[str, dict[str, str, str, str]]: @@ -169,13 +156,13 @@ class Fokus(BasicNewsRecipe): def _log(article) -> None: '''Log a digestible summary of the input `article` blurb.''' - log_message = f"\t{article['title']} : {article['date']} : {article['url']}" + log_message = f"\t{article['title']} : {article['url']}" if article.get('description'): log_message += f" : {article['description']}" self.log.debug(log_message) try: - article_blurbs = soup.find_all('article', {'class': 'Blurb'}) + article_blurbs = soup.find_all('article', {'class': 'PostBlurb'}) except AttributeError: article_blurbs = [] From 8abd9b706ea9c252a68d6bd9c43c4cb02e2d6fc8 Mon Sep 17 00:00:00 2001 From: Henrik Holm Date: Fri, 16 May 2025 18:33:56 +0200 Subject: [PATCH 2/3] Extract article metadata from the article itself --- recipes/fokus.recipe | 61 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe index 5df5e35a70..b0af45a2fc 100644 --- a/recipes/fokus.recipe +++ b/recipes/fokus.recipe @@ -1,6 +1,7 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -from datetime import datetime, timezone +import time +from datetime import datetime, timedelta from mechanize import Request @@ -25,15 +26,21 @@ class Fokus(BasicNewsRecipe): compress_news_images = True needs_subscription = 'optional' oldest_article = 7 # days + max_articles_per_feed = 15 use_embedded_content = False remove_empty_feeds = True scale_news_images_to_device = True scale_news_images = (800, 600) + delay = 3 # Avoid throttling by the server. - # Center and reduce the size of images and image captions. + # 1. Center and reduce the size of images and image captions. + # 2. Make the lead text italic. + # 3. 
Make the article metadata text gray and small. extra_css = ''' - img { display: block; margin: auto; width: 50%; height: auto } + img { display: block; margin: auto; width: 50%; height: auto; } div.calibre-nuked-tag-figure { font-size: small; text-align: center; } + p.Single__lead, p.Longread__lead { font-style: italic; color:#202020; } + p.article-metadata { color: gray; font-size:small; } ''' keep_only_tags = [ @@ -41,8 +48,11 @@ class Fokus(BasicNewsRecipe): dict(name='h1', class_='Longread__title'), # Title of "Longread" type articles. dict(name='p', class_='Single__lead'), # Lead text of "Single" type articles. dict(name='p', class_='Longread__lead'), # Lead text of "Longread" type articles. + dict(name='p', class_='article-metadata'), # Dynamically created by the recipe. dict(name='figure', class_='Single__thumbnail'), # Image of "Single" type articles. dict(name='figure', class_='Longread__thumbnail'), # Image of "Longread" type articles. + dict(name='p', class_='Meta__author'), # Author of the article. + dict(name='time', class_='Meta__updated'), # Last updated date of the article. dict(name='div', class_='sesamy-protected-content'), # Article body. ] @@ -277,3 +287,48 @@ class Fokus(BasicNewsRecipe): self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.') return feeds + + def populate_article_metadata(self, article, soup, _): + # The article description/summary is found in the
`<p>` tag of class 'Single__lead' or 'Longread__lead'. + lead_tag = soup.find('p', {'class': ['Single__lead', 'Longread__lead']}) + article.summary = article.text_summary = lead_tag.get_text(strip=True) + + # Extract the article author name from the first
`<p>` tag of class 'Meta__author'. After the author name has been + # extracted, remove all such
`<p>` tags from the soup (the article can contain several). + if author_tag := soup.find('p', {'class': 'Meta__author'}): + # If the tag contains an `<a>` child tag, extract the author name from it. + if a_tag := author_tag.find('a'): + author_info = a_tag.get_text(strip=True) + # To ensure a clean output, remove the `<a>` child tag. + a_tag.decompose() + else: + # If the tag does not contain an `<a>` child tag, extract the author name from the text of the
`<p>` tag. + author_info = author_tag.get_text(strip=True) + # Remove the 'Text: ' prefix from the author name (if any). + if author_info.startswith('Text: '): + author_info = author_info[6:] + for author_tag in soup.find_all('p', {'class': 'Meta__author'}): + author_tag.decompose() + else: + # If no 'Meta__author' tag exists, fall back to 'Fokus' as the author. + author_info = 'Fokus' + + # Concatenate the author name and the article date. + article_metadata = f"{author_info} | {article.date}" + + # Finally, add a new
`<p>` tag with the article metadata to the soup. Place it directly after the lead text. + new_tag = soup.new_tag('p') + new_tag['class'] = 'article-metadata' + new_tag.string = article_metadata + lead_tag.insert_after(new_tag)

From 9a756077486e95616524c0941561e75b42625b09 Mon Sep 17 00:00:00 2001 From: Henrik Holm Date: Sat, 24 May 2025 01:57:50 +0200 Subject: [PATCH 3/3] Refactor "Fokus.se" recipe --- recipes/fokus.recipe | 294 +++++++++++++++---------------------------- 1 file changed, 101 insertions(+), 193 deletions(-) diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe index b0af45a2fc..b62ced6c87 100644 --- a/recipes/fokus.recipe +++ b/recipes/fokus.recipe @@ -1,9 +1,6 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -import time -from datetime import datetime, timedelta - -from mechanize import Request +from datetime import datetime, timedelta, timezone from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe @@ -16,7 +13,7 @@ class NoArticles(Exception): class Fokus(BasicNewsRecipe): title = 'Fokus' main_url = 'https://www.fokus.se' - description = "The last 7 days of news and articles from the Swedish current-affairs magazine 'Fokus'" + description = "The current week's edition of Swedish current-affairs magazine 'Fokus'" encoding = 'utf-8' __author__ = 'Henrik Holm (https://github.com/h-holm)' language = 'sv' @@ -25,8 +22,6 @@ class Fokus(BasicNewsRecipe): no_stylesheets = True compress_news_images = True needs_subscription = 'optional' - oldest_article = 7 # days - max_articles_per_feed = 15 use_embedded_content = False remove_empty_feeds = True scale_news_images_to_device = True scale_news_images = (800, 600) @@ -51,117 +46,122 @@ class Fokus(BasicNewsRecipe): dict(name='p', class_='article-metadata'), # Dynamically created by the recipe. dict(name='figure', class_='Single__thumbnail'), # Image of "Single" type articles. dict(name='figure', class_='Longread__thumbnail'), # Image of "Longread" type articles. - dict(name='p', class_='Meta__author'), # Author of the article. - dict(name='time', class_='Meta__updated'), # Last updated date of the article. dict(name='div', class_='sesamy-protected-content'), # Article body. ] - def get_cover_url(self) -> str: - # Create a `mechanize.Request` object. - req = Request(url=self.main_url, method='POST') + def extract_cover_url(self, a_tag) -> str: + '''Given the `<a>` tag of the current edition, extract the URL of the highest-resolution cover image.''' + figure_tag = a_tag.find('figure') + img_tag = figure_tag.find('img') - # Open the requested URL in the built-in browser of the `BasicNewsRecipe` parent class. - browser = self.get_browser() - response = browser.open(req) + # The `srcset` attribute contains a comma-separated list of URLs and their resolutions. + cover_urls = img_tag['srcset'].split(', ') + cover_urls = [src.split(' ') for src in cover_urls] - # Parse the response into a BeautifulSoup soup. - soup = BeautifulSoup(response.get_data(), 'html.parser') + # The second item of each tuple should be the resolution, e.g., '578w' or '821w'. Remove the 'w' suffix, cast + # to an integer and sort in descending order. + cover_urls = [(url, int(resolution[:-1])) for url, resolution in cover_urls] + cover_urls = sorted(cover_urls, key=lambda x: x[1], reverse=True) - # The cover image of the current edition is located in a
`<figure>` tag with class 'Issue__thumbnail'. + # The first item of the sorted list is now the URL of the highest-resolution image. + self.cover_url = cover_urls[0][0] + self.log(f"Identified cover URL: '{self.cover_url}'") + + return + + def get_current_edition_url(self) -> str: + '''Return the URL of the current (weekly) edition of Fokus.''' + current_year = datetime.now().year try: - figure_tag = soup.find('figure', class_='Issue__thumbnail') - img_tag = figure_tag.find('img') - # Set the `img_tag` to `None` if it is falsy. This way, we can force an `AttributeError` if no cover URL - # can be found. - img_tag = img_tag if img_tag else None - cover_url = img_tag['src'] - except AttributeError: + soup = self.index_to_soup(f"{self.main_url}/vara-utgavor") - self.log.error("Failed to identify the cover image URL. Does an 'Issue__thumbnail' figure still exist?") - return '' + # Identify all `<a>` tags of class 'Issue' that have an href attribute containing the current year. + a_tags = soup.find_all('a', class_='Issue', href=True) - return cover_url + # Keep only the href, and subset to only those links that contain the current year. + edition_links = [a_tag['href'] for a_tag in a_tags if str(current_year) in a_tag['href']] - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if self.username and self.password: - br.open('https://www.fokus.se/auth/logga-in') - br.select_form(name='loginForm') - br['j_username'] = self.username - br['j_password'] = self.password - br.submit() - return br + # In order to successfully sort the links chronologically, first convert the data structure to a dict, wherein + # the key consists of only the date part of the URL and the value consists of the entire (unmodified) URL. + edition_links = {link.removesuffix('/').split('/')[-1]: link for link in edition_links} - def get_web_sections(self, main_url: str) -> dict[str, str]: - '''Return a dict of (1) section URL and (2) section name key-value pairs found at `main_url`. + # Then, shorten the key further by keeping only the part after the first hyphen. This removes the year and + # typically results in only the calendar week number remaining, e.g., '1', '21' or '52'. Note however that + # editions can sometimes cover multiple weeks, e.g., '1-2', '01-03' or '50-51-52'. In order to sort correctly, + # it is therefore necessary to additionally keep only the first part of the week number(s) after the hyphen. + edition_links = {key.split('-', 1)[-1].split('-', 1)[0]: value for key, value in edition_links.items()} - For example, if the Fokus website currently includes an 'Aktuellt' section, the dict should include an entry on - the form: `{'https://www.fokus.se/aktuellt': 'Aktuellt'}`. + # Now, convert the resulting keys to integers. + edition_links = {int(key): value for key, value in edition_links.items()} - Args: - main_url (str): The entrypoint URL of the Fokus website. + # Finally, sort in descending order, so that the most recent edition is first. + edition_links = dict(sorted(edition_links.items(), reverse=True)) + current_edition_url = edition_links[list(edition_links.keys())[0]] - Yields: - dict[str, str]: (1) URLs and (2) human-readable names of Fokus sections. - ''' - self.log(f"Identifying all sections under '{main_url}'...") - soup = self.index_to_soup(main_url) - # Identify all unique
`<li>` tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections that - # are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections. - section_urls_and_names = {} - for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'): - # The
`<li>` tag contains (should contain) an anchor that in turn contains the URL and link name. - a_tag = li_tag.find('a') - url = a_tag.get('href').rstrip('/') - section_name = a_tag.text.strip() + self.log(f"Identified {len(edition_links)} editions, of which the most recent is '{current_edition_url}'.") - if url in section_urls_and_names: - # If this section URL has already been extracted from another
`<li>` tag, it can be the case that the - # section name differs within this duplicate pair. In this case, use whichever section name is longer. - if len(section_name) >= len(section_urls_and_names[url]): - section_urls_and_names[url] = section_name + # Now that we know the URL of the current edition, we can use it to identify the cover image. The cover + # image URL exists in the `src` attribute of the `<img>` child tag of the
`<figure>` child tag of the `<a>` tag + # of the current edition. + current_edition_a_tag = soup.find('a', class_='Issue', href=current_edition_url) + self.extract_cover_url(current_edition_a_tag) + except Exception as exc: + self.log.error(f"Failed to identify the current edition URL: {exc}") + raise NoArticles( + f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing " + 'issues, in which case you should try again later, or the website format has changed and the recipe ' + 'needs updating.' + ) from exc + return current_edition_url - self.log(f"Identified section '{section_name}' at URL '{url}'.") - section_urls_and_names[url] = section_name - - self.log(f'Identified a total of {len(section_urls_and_names)} unique sections.') - return section_urls_and_names - - def parse_article_blurb(self, article_blurb) -> dict[str, str, str, str] | None: + def parse_article_blurb(self, article_blurb) -> dict[str, str, str, str, str] | None: '''Given an
`<article>` tag of class 'Blurb', parse it into a dict. Args: article_blurb (Tag): An `<article>`
    tag hosting metadata and the URL of an article. Returns: - dict[str, str, str, str]: A dict on a `{'url': str, 'title': str, 'description': str, 'date': str}` format. + A dict on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format. ''' if a_tag := article_blurb.find('a', href=True): url = a_tag['href'].strip().rstrip('/') if url.startswith('/'): url = f'{self.main_url}{url}' - if title_tag := a_tag.find('h2', {'class': 'PostBlurb__title'}): + if title_tag := a_tag.find('h2', {'class': 'Blurb__title'}): title = self.tag_to_string(title_tag).strip() - desc = '' - if desc_tag := a_tag.find('div', {'class': 'PostBlurb__excerpt'}): + + if date_tag := a_tag.find('time', {'class': 'Blurb__date'}): + # Results in a Swedish date format, e.g., '23 MAJ 2025'. + date = self.tag_to_string(date_tag).strip() + # Add a newline before the date to make it more readable. + date = f'\n{date}' + + # Assign the article to its first listed category as inferred from the first
`<li>` tag of class + # 'Blurb__category'. Default to 'Fokus' if no such tag is found. + category = 'Fokus' + if category_tag := a_tag.find('li', {'class': 'Blurb__category'}): + category = self.tag_to_string(category_tag).strip() + desc = '' if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}): desc = self.tag_to_string(desc_tag).strip() - return {'url': url, 'title': title, 'description': desc} + return {'url': url, 'title': title, 'date': date, 'category': category, 'description': desc} return - def _get_article_blurbs(self, soup) -> dict[str, dict[str, str, str, str]]: + def get_article_blurbs(self, soup) -> dict[str, dict[str, str, str, str, str]]: '''Given a Fokus webpage `soup`, return a dict of unique article entries found on the page. The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary - on a `{'url': str, 'title': str, 'description': str, 'date': str}` format. + on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format. Args: soup (BeautifulSoup): The `bs4.BeautifulSoup` soup of a Fokus webpage. Returns: - dict[str, dict[str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values. + dict[str, dict[str, str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values. ''' def _log(article) -> None: '''Log a digestible summary of the input `article` blurb.''' log_message = f"\t{article['title']} : {article['url']}" if article.get('description'): log_message += f" : {article['description']}" self.log.debug(log_message) + # Identify all `<article>`
    tags of class 'Blurb' that have an href attribute. + self.log(f'Identifying all articles...') try: - article_blurbs = soup.find_all('article', {'class': 'PostBlurb'}) + article_blurbs = soup.find_all('article', {'class': 'Blurb'}) except AttributeError: article_blurbs = [] @@ -184,94 +186,49 @@ class Fokus(BasicNewsRecipe): for article_blurb in article_blurbs: if article := self.parse_article_blurb(article_blurb): _log(article) - # If an entry with the same URL already exists, keep whichever entry has the longer description. - if article['url'] in article_blurbs: - if len(article['description']) <= len(parsed_blurbs[article['url']]['description']): - continue parsed_blurbs[article['url']] = article return parsed_blurbs - def get_article_blurbs(self, sections: dict[str, str]) -> dict[str, dict[str, str, str, str]]: - '''Create and return a dict of all unique article blurbs found in all `sections`. - - The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary - on a `{'url': str, 'title': str, 'description': str, 'date': str}` format. - - Args: - sections (dict[str, str]): A dict on a `{section_url: section_name}` format. - - Returns: - dict[str, dict[str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values. - ''' - self.log(f'Identifying all articles under all {len(sections)} sections...') - - article_blurbs = {} - for section_url, section_title in sections.items(): - try: - section_soup = self.index_to_soup(section_url) - except Exception: - self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'") - continue - self.log(f"Identifying all articles under '{section_url}'...") - for article_url, article_blurb in self._get_article_blurbs(section_soup).items(): - # If the article URL has already been encountered, keep only the article blurb with the longer - # description string. - if article_url not in article_blurbs: - article_blurbs[article_url] = article_blurb - elif len(article_blurb['description']) > len(article_blurbs[article_url]['description']): - article_blurbs[article_url] = article_blurb - - self.log(f'A total of {len(article_blurbs)} articles were identified in the {len(sections)} sections.') - return article_blurbs - - def assign_articles_to_sections( + def convert_to_section_lists( self, - sections: dict[str, str], - articles: dict[str, dict[str, str, str, str]], + articles: dict[str, dict[str, str, str, str, str]], ) -> dict[str, list[dict[str, str, str, str]]]: - '''Assign each article in `articles` to a section in `sections`. + '''Convert the `articles` dict of dicts to a dict of lists; each list holds the articles of a given section. Args: - sections (dict[str, str]): A dict of section URLs as keys and section titles as values. - articles (dict[str, dict[str, str, str, str]]): A dict of article URLs as keys and article dicts as values. + articles (dict[str, dict[str, str, str, str, str]]): A dict of article URLs and article dicts. Returns: dict[str, list[dict[str, str, str, str]]]: A dict on a `{section_title: list[article_dict]}` format. 
''' - self.log(f'Assigning each of the {len(articles)} articles to either of the {len(sections)} sections...') + self.log(f'Assigning each of the {len(articles)} articles to a section...') section_to_articles = {} for article_url, article_dict in articles.items(): - last_url = article_url - while article_url not in sections and len(article_url) > len(self.main_url): - article_url = article_url.rsplit('/', 1)[0] - - # Prevent an infinite loop. - if article_url == last_url: - break - last_url = article_url - - # If no section corresponding to the URL exists, default to the 'Home Page' section. - section_title = sections[article_url] if article_url in sections else sections[self.main_url] + section_title = article_dict['category'] if section_title not in section_to_articles: section_to_articles[section_title] = [] + # Remove the 'category' key from the article dict, as it is not needed in the final output. + article_dict.pop('category') section_to_articles[section_title].append(article_dict) - # Log how many sections contained no articles younger than `self.oldest_article`. - if diff := len(sections) - len(section_to_articles): - self.log(f'{diff} sections contained no articles younger than {self.oldest_article} days.') - return section_to_articles def parse_index(self): - # Identify all sections in the web version of Fokus. - sections = self.get_web_sections(self.main_url) + current_edition_url = self.get_current_edition_url() + if not current_edition_url: + raise NoArticles( + f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing " + 'issues, in which case you should try again later, or the website format has changed and the recipe ' + 'needs updating.' + ) + self.log(f'Current edition URL: {current_edition_url}') - # Add an entry for the start page. - sections[self.main_url] = 'Home Page' + # Identify all sections in the web version of Fokus. + edition_soup = self.index_to_soup(current_edition_url) # From the section URLs and the main URL, identify all unique articles. - articles = self.get_article_blurbs(sections) + articles = self.get_article_blurbs(edition_soup) if not articles: raise NoArticles( f"Could not find any articles. Either the '{self.main_url}' server is experiencing issues, in which " @@ -279,56 +236,7 @@ class Fokus(BasicNewsRecipe): ) # Assign each identified article to a section based on its URL. - section_to_articles = self.assign_articles_to_sections(sections, articles) + section_to_articles = self.convert_to_section_lists(articles) - # Convert to the expected `list[tuple[str, dict[str, str, str, str]]]` format. - feeds = list(section_to_articles.items()) - num_articles = sum(len(article_dicts) for article_dicts in section_to_articles.values()) - self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.') - - return feeds - - def populate_article_metadata(self, article, soup, _): - # The article description/summary is found in the
`<p>` tag of class 'Single__lead' or 'Longread__lead'. - lead_tag = soup.find('p', {'class': ['Single__lead', 'Longread__lead']}) - article.summary = article.text_summary = lead_tag.get_text(strip=True) - - # Extract the article author name from the first
`<p>` tag of class 'Meta__author'. After the author name has been - # extracted, remove all such
`<p>` tags from the soup (the article can contain several). - if author_tag := soup.find('p', {'class': 'Meta__author'}): - # If the tag contains an `<a>` child tag, extract the author name from it. - if a_tag := author_tag.find('a'): - author_info = a_tag.get_text(strip=True) - # To ensure a clean output, remove the `<a>` child tag. - a_tag.decompose() - else: - # If the tag does not contain an `<a>` child tag, extract the author name from the text of the
`<p>` tag. - author_info = author_tag.get_text(strip=True) - # Remove the 'Text: ' prefix from the author name (if any). - if author_info.startswith('Text: '): - author_info = author_info[6:] - for author_tag in soup.find_all('p', {'class': 'Meta__author'}): - author_tag.decompose() - else: - # If no 'Meta__author' tag exists, fall back to 'Fokus' as the author. - author_info = 'Fokus' - - # Concatenate the author name and the article date. - article_metadata = f"{author_info} | {article.date}" - - # Finally, add a new
`<p>` tag with the article metadata to the soup. Place it directly after the lead text. - new_tag = soup.new_tag('p') - new_tag['class'] = 'article-metadata' - new_tag.string = article_metadata - lead_tag.insert_after(new_tag)
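
Reviewer note on PATCH 3: the two small parsing tricks it introduces can be sanity-checked outside of calibre. Below is a minimal standalone sketch of the same logic; the `srcset` value and the edition URLs are invented placeholders (the '/utgava/' path segment is an assumption), not data taken from fokus.se.

    # Sketch of extract_cover_url(): pick the highest-resolution URL from an <img> tag's srcset value.
    srcset = 'https://example.com/cover-578.jpg 578w, https://example.com/cover-821.jpg 821w'
    candidates = [src.split(' ') for src in srcset.split(', ')]
    candidates = [(url, int(resolution[:-1])) for url, resolution in candidates]  # Strip the 'w' suffix.
    best_url = sorted(candidates, key=lambda x: x[1], reverse=True)[0][0]
    assert best_url == 'https://example.com/cover-821.jpg'

    # Sketch of get_current_edition_url(): reduce each edition slug to its first week number, then
    # sort in descending order so that the most recent edition comes first.
    edition_links = [
        'https://www.fokus.se/utgava/2025-01-02/',  # Hypothetical multi-week edition (weeks 1-2).
        'https://www.fokus.se/utgava/2025-21/',
        'https://www.fokus.se/utgava/2025-52/',
    ]
    editions = {link.removesuffix('/').split('/')[-1]: link for link in edition_links}  # '2025-21' -> URL.
    editions = {key.split('-', 1)[-1].split('-', 1)[0]: url for key, url in editions.items()}  # First week number.
    editions = {int(key): url for key, url in editions.items()}
    editions = dict(sorted(editions.items(), reverse=True))
    assert next(iter(editions.values())) == 'https://www.fokus.se/utgava/2025-52/'

One caveat worth flagging: because the link list is filtered on str(current_year), the lookup can come up empty in early January if the newest published edition still belongs to the previous year; until the first edition of the new year appears, the recipe will then raise NoArticles.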