Dynamically identify all unique web sections

Henrik Holm 2024-10-06 00:06:05 +02:00
parent ee5b24a937
commit 6db1fe5af4


@@ -2,30 +2,9 @@
# vim:fileencoding=utf-8
from datetime import datetime, timezone
from calibre.web.feeds.news import BasicNewsRecipe
from bs4 import BeautifulSoup
WEB_SECTIONS = [
('Inrikes', 'inrikes'),
('Utrikes', 'utrikes'),
('Aktuellt', 'aktuellt'),
('Politik', 'politik'),
('Ekonomi', 'ekonomi'),
('Kultur', 'kultur'),
('Analys', 'analys'),
('Vetenskap', 'vetenskap'),
('Krönikor', 'kronika'),
('Opinion', 'opinion'),
('Veckans Fokus', 'veckans-fokus'),
('Synvinkel', 'synvinkel'),
('Minnesord', 'minnesord'),
('Debatt', 'debatt'),
('Andra kammaren', 'andra-kammaren'),
('Skuggkabinettet', 'skuggkabinettet'),
('Intervju', 'intervju'),
('Mötet', 'motet'),
('Veckans bråk', 'veckans-brak'),
('Johans Blogg', 'johans-blogg'),
]
from calibre.web.feeds.news import BasicNewsRecipe
class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
br.submit()
return br
def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
"""Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
For example, if the Fokus website currently includes an 'Aktuellt' section, create a
`('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
Args:
main_url (str): The entrypoint URL of the Fokus website.
Returns:
list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
"""
soup = self.index_to_soup(main_url)
# Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections that
# are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
unique_urls = set()
urls_and_section_names = list()
for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
# Each <li> tag should contain an <a> anchor that holds the section URL and its human-readable link name.
a_tag = li_tag.find('a')
url = a_tag.get('href').rstrip('/')
link_name = a_tag.text.strip()
# Skip this <li> tag as we have already extracted its URL and link name from another <li> tag.
if url in unique_urls:
continue
unique_urls.add(url)
self.log(f"Identified section '{link_name}' at URL '{url}'")
urls_and_section_names.append((url, link_name))
self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
return urls_and_section_names
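For quick testing outside of calibre, the menu-scraping logic above can be reproduced with requests and BeautifulSoup alone. The following is a minimal sketch, not part of the recipe: it assumes the Fokus front page still renders its section menu as <li class="menu-item-type-taxonomy"> items, swaps calibre's index_to_soup for requests, and adds a guard against anchor-less menu items.

import requests
from bs4 import BeautifulSoup

def get_fokus_sections(main_url: str = 'https://www.fokus.se') -> list[tuple[str, str]]:
    # Fetch and parse the front page (the recipe itself uses self.index_to_soup).
    soup = BeautifulSoup(requests.get(main_url, timeout=30).text, 'html.parser')
    seen, sections = set(), []
    for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
        a_tag = li_tag.find('a')
        # Guard added for illustration: skip menu items without a usable anchor.
        if a_tag is None or not a_tag.get('href'):
            continue
        url = a_tag['href'].rstrip('/')
        if url in seen:
            continue
        seen.add(url)
        sections.append((url, a_tag.text.strip()))
    return sections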
def parse_article_blurb(self, article_blurb):
desc = ''
if a_tag := article_blurb.find('a', href=True):
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
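For context, parse_article_blurb reduces each blurb to the plain article dictionary that calibre's parse_index machinery consumes. A hypothetical example of one such entry, with invented values, assuming a blurb taken from the 'Inrikes' section:

example_article = {
    'title': 'Exempelrubrik',
    'url': 'https://www.fokus.se/inrikes/exempelartikel',
    'description': 'Kort ingress. (Exempelpartner)',
    'date': '5 oktober 2024',
}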
def parse_web_section(self, soup, slug):
def parse_web_section(self, section_soup: BeautifulSoup):
def _log(article):
log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
self.log(log_message)
try:
article_blurbs = soup.find_all('article', {'class': 'Blurb'})
article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
except AttributeError:
article_blurbs = []
if not article_blurbs:
raise ValueError(f'Failed to find article blurbs for slug: {slug}')
raise ValueError('Failed to find article blurbs.')
for article_blurb in article_blurbs:
if (article := self.parse_article_blurb(article_blurb)):
log(article)
if article := self.parse_article_blurb(article_blurb):
_log(article)
yield article
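parse_web_section is a generator, so parse_index below materializes it with list() per section and collects the results into the (section title, list of article dicts) pairs that BasicNewsRecipe.parse_index is expected to return. A sketch of that final structure, with placeholder values only:

feeds = [
    ('Inrikes', [
        {'title': '...', 'url': '...', 'description': '...', 'date': '...'},
    ]),
    ('Utrikes', [
        {'title': '...', 'url': '...', 'description': '...', 'date': '...'},
    ]),
]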
def parse_index(self):
feeds = []
for section_title, slug in WEB_SECTIONS:
url = f'{self.main_url}/{slug}'
for section_url, section_title in self.get_web_sections(self.main_url):
try:
soup = self.index_to_soup(url)
soup = self.index_to_soup(section_url)
except Exception:
self.log.error(f'Failed to download section: {url}')
self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
continue
self.log(f'Found section: {section_title}')
articles = list(self.parse_web_section(soup, slug))
articles = list(self.parse_web_section(soup))
if articles:
feeds.append((section_title, articles))
if not feeds: