diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe
index 7475ece9ca..bfe187da99 100644
--- a/recipes/fokus.recipe
+++ b/recipes/fokus.recipe
@@ -2,30 +2,9 @@
# vim:fileencoding=utf-8
from datetime import datetime, timezone
-from calibre.web.feeds.news import BasicNewsRecipe
+from bs4 import BeautifulSoup
-WEB_SECTIONS = [
- ('Inrikes', 'inrikes'),
- ('Utrikes', 'utrikes'),
- ('Aktuellt', 'aktuellt'),
- ('Politik', 'politik'),
- ('Ekonomi', 'ekonomi'),
- ('Kultur', 'kultur'),
- ('Analys', 'analys'),
- ('Vetenskap', 'vetenskap'),
- ('Krönikor', 'kronika'),
- ('Opinion', 'opinion'),
- ('Veckans Fokus', 'veckans-fokus'),
- ('Synvinkel', 'synvinkel'),
- ('Minnesord', 'minnesord'),
- ('Debatt', 'debatt'),
- ('Andra kammaren', 'andra-kammaren'),
- ('Skuggkabinettet', 'skuggkabinettet'),
- ('Intervju', 'intervju'),
- ('Mötet', 'motet'),
- ('Veckans bråk', 'veckans-brak'),
- ('Johans Blogg', 'johans-blogg'),
-]
+from calibre.web.feeds.news import BasicNewsRecipe
class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
br.submit()
return br
+ def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
+ """Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
+
+ For example, if the Fokus website currently includes an 'Aktuellt' section, create a
+ `('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
+
+ Args:
+ main_url (str): The entrypoint URL of the Fokus website.
+
+ Returns:
+ list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
+ """
+ soup = self.index_to_soup(main_url)
+
+ # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class filtering excludes
+ # sections that are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
+ unique_urls = set()
+ urls_and_section_names = []
+ for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
+ # Each <li> tag should contain an <a> anchor that in turn holds the URL and link name of the section.
+ a_tag = li_tag.find('a')
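+ # Strip any trailing '/' so that the same section URL always compares equal in the dedup check below.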
+ url = a_tag.get('href').rstrip('/')
+ link_name = a_tag.text.strip()
+
+ # Skip this <li> tag if its URL and link name have already been extracted from another tag.
+ if url in unique_urls:
+ continue
+ unique_urls.add(url)
+
+ self.log(f"Identified section '{link_name}' at URL '{url}'")
+ urls_and_section_names.append((url, link_name))
+
+ self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
+ return urls_and_section_names
+
def parse_article_blurb(self, article_blurb):
desc = ''
if a_tag := article_blurb.find('a', href=True):
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
- def parse_web_section(self, soup, slug):
+ def parse_web_section(self, section_soup: BeautifulSoup):
def _log(article):
log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
self.log(log_message)
try:
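+ # Each <article> tag of class 'Blurb' is a teaser for a single article on the section page.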
- article_blurbs = soup.find_all('article', {'class': 'Blurb'})
+ article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
except AttributeError:
article_blurbs = []
if not article_blurbs:
- raise ValueError(f'Failed to find article blurbs for slug: {slug}')
+ raise ValueError('Failed to find article blurbs.')
for article_blurb in article_blurbs:
- if (article := self.parse_article_blurb(article_blurb)):
- log(article)
+ if article := self.parse_article_blurb(article_blurb):
+ _log(article)
yield article
def parse_index(self):
feeds = []
- for section_title, slug in WEB_SECTIONS:
- url = f'{self.main_url}/{slug}'
+ for section_url, section_title in self.get_web_sections(self.main_url):
try:
- soup = self.index_to_soup(url)
+ soup = self.index_to_soup(section_url)
except Exception:
- self.log.error(f'Failed to download section: {url}')
+ self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
continue
- self.log(f'Found section: {section_title}')
- articles = list(self.parse_web_section(soup, slug))
+ articles = list(self.parse_web_section(soup))
if articles:
feeds.append((section_title, articles))
if not feeds: