Dynamically identify all unique web sections
commit 6db1fe5af4
parent ee5b24a937
@@ -2,30 +2,9 @@
 # vim:fileencoding=utf-8
 from datetime import datetime, timezone
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from bs4 import BeautifulSoup
 
-WEB_SECTIONS = [
-    ('Inrikes', 'inrikes'),
-    ('Utrikes', 'utrikes'),
-    ('Aktuellt', 'aktuellt'),
-    ('Politik', 'politik'),
-    ('Ekonomi', 'ekonomi'),
-    ('Kultur', 'kultur'),
-    ('Analys', 'analys'),
-    ('Vetenskap', 'vetenskap'),
-    ('Krönikor', 'kronika'),
-    ('Opinion', 'opinion'),
-    ('Veckans Fokus', 'veckans-fokus'),
-    ('Synvinkel', 'synvinkel'),
-    ('Minnesord', 'minnesord'),
-    ('Debatt', 'debatt'),
-    ('Andra kammaren', 'andra-kammaren'),
-    ('Skuggkabinettet', 'skuggkabinettet'),
-    ('Intervju', 'intervju'),
-    ('Mötet', 'motet'),
-    ('Veckans bråk', 'veckans-brak'),
-    ('Johans Blogg', 'johans-blogg'),
-]
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
         br.submit()
         return br
 
+    def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
+        """Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
+
+        For example, if the Fokus website currently includes an 'Aktuellt' section, create a
+        `('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
+
+        Args:
+            main_url (str): The entrypoint URL of the Fokus website.
+
+        Returns:
+            list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
+        """
+        soup = self.index_to_soup(main_url)
+
+        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections
+        # that are not suited for reading, e.g., the 'Podcast' and 'Läs E-Tidningen' sections.
+        unique_urls = set()
+        urls_and_section_names = list()
+        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
+            # The <li> tag should contain an <a> anchor that in turn holds the URL and the link name.
+            a_tag = li_tag.find('a')
+            url = a_tag.get('href').rstrip('/')
+            link_name = a_tag.text.strip()
+
+            # Skip this <li> tag if its URL and link name were already extracted from another <li> tag.
+            if url in unique_urls:
+                continue
+            unique_urls.add(url)
+
+            self.log(f"Identified section '{link_name}' at URL '{url}'")
+            urls_and_section_names.append((url, link_name))
+
+        self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
+        return urls_and_section_names
+
     def parse_article_blurb(self, article_blurb):
         desc = ''
         if a_tag := article_blurb.find('a', href=True):
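A minimal standalone sketch of the discovery technique the new get_web_sections method relies on; it runs outside calibre with only bs4 installed, and the menu markup below is illustrative rather than captured from fokus.se:

from bs4 import BeautifulSoup

# Illustrative markup mimicking the fokus.se menu structure the recipe expects.
html = '''
<ul>
  <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
  <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
  <li class="menu-item-type-custom"><a href="https://www.fokus.se/podcast/">Podcast</a></li>
</ul>
'''

soup = BeautifulSoup(html, 'html.parser')
unique_urls, sections = set(), []
for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
    a_tag = li_tag.find('a')
    url = a_tag.get('href').rstrip('/')
    if url in unique_urls:
        continue  # Duplicate menu entry; its URL was already recorded.
    unique_urls.add(url)
    sections.append((url, a_tag.text.strip()))

print(sections)  # [('https://www.fokus.se/aktuellt', 'Aktuellt')]

The class filter drops the 'menu-item-type-custom' Podcast entry, and the set deduplicates the repeated 'Aktuellt' item, mirroring what the method does on the live menu.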
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
                 desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
         return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
 
-    def parse_web_section(self, soup, slug):
+    def parse_web_section(self, section_soup: BeautifulSoup):
         def _log(article):
             log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
             if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
                 self.log(log_message)
 
         try:
-            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
+            article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
         except AttributeError:
             article_blurbs = []
         if not article_blurbs:
-            raise ValueError(f'Failed to find article blurbs for slug: {slug}')
+            raise ValueError('Failed to find article blurbs.')
         for article_blurb in article_blurbs:
-            if (article := self.parse_article_blurb(article_blurb)):
-                log(article)
+            if article := self.parse_article_blurb(article_blurb):
+                _log(article)
                 yield article
 
     def parse_index(self):
         feeds = []
-        for section_title, slug in WEB_SECTIONS:
-            url = f'{self.main_url}/{slug}'
+        for section_url, section_title in self.get_web_sections(self.main_url):
             try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(section_url)
             except Exception:
-                self.log.error(f'Failed to download section: {url}')
+                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
                 continue
             self.log(f'Found section: {section_title}')
-            articles = list(self.parse_web_section(soup, slug))
+            articles = list(self.parse_web_section(soup))
             if articles:
                 feeds.append((section_title, articles))
         if not feeds:
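For context, BasicNewsRecipe expects parse_index to return a list of (section_title, articles) pairs, where each article is a dict of the shape parse_article_blurb builds. A sketch of that return value with illustrative values:

feeds = [
    ('Aktuellt', [
        {
            'title': 'Example headline',
            'url': 'https://www.fokus.se/aktuellt/example-article',
            'description': 'Short teaser text.',
            'date': '5 juli 2025',
        },
    ]),
]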