diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe
index 7475ece9ca..bfe187da99 100644
--- a/recipes/fokus.recipe
+++ b/recipes/fokus.recipe
@@ -2,30 +2,9 @@
 # vim:fileencoding=utf-8
 from datetime import datetime, timezone
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from bs4 import BeautifulSoup
 
-WEB_SECTIONS = [
-    ('Inrikes', 'inrikes'),
-    ('Utrikes', 'utrikes'),
-    ('Aktuellt', 'aktuellt'),
-    ('Politik', 'politik'),
-    ('Ekonomi', 'ekonomi'),
-    ('Kultur', 'kultur'),
-    ('Analys', 'analys'),
-    ('Vetenskap', 'vetenskap'),
-    ('Krönikor', 'kronika'),
-    ('Opinion', 'opinion'),
-    ('Veckans Fokus', 'veckans-fokus'),
-    ('Synvinkel', 'synvinkel'),
-    ('Minnesord', 'minnesord'),
-    ('Debatt', 'debatt'),
-    ('Andra kammaren', 'andra-kammaren'),
-    ('Skuggkabinettet', 'skuggkabinettet'),
-    ('Intervju', 'intervju'),
-    ('Mötet', 'motet'),
-    ('Veckans bråk', 'veckans-brak'),
-    ('Johans Blogg', 'johans-blogg'),
-]
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
         br.submit()
         return br
 
+    def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
+        """Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
+
+        For example, if the Fokus website currently includes an 'Aktuellt' section, create a
+        `('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
+
+        Args:
+            main_url (str): The entrypoint URL of the Fokus website.
+
+        Returns:
+            list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
+        """
+        soup = self.index_to_soup(main_url)
+
+        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections
+        # that are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
+        unique_urls = set()
+        urls_and_section_names = list()
+        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
+            # The <li> tag contains (should contain) an anchor that in turn contains the URL and link name.
+            a_tag = li_tag.find('a')
+            url = a_tag.get('href').rstrip('/')
+            link_name = a_tag.text.strip()
+
+            # Skip this <li> tag as we have already extracted its URL and link name from another <li> tag.
+            if url in unique_urls:
+                continue
+            unique_urls.add(url)
+
+            self.log(f"Identified section '{link_name}' at URL '{url}'")
+            urls_and_section_names.append((url, link_name))
+
+        self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
+        return urls_and_section_names
+
     def parse_article_blurb(self, article_blurb):
         desc = ''
         if a_tag := article_blurb.find('a', href=True):
@@ -120,7 +134,7 @@
             desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
         return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
 
-    def parse_web_section(self, soup, slug):
+    def parse_web_section(self, section_soup: BeautifulSoup):
         def _log(article):
             log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
             if article.get('description'):
@@ -128,27 +142,25 @@
             self.log(log_message)
 
         try:
-            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
+            article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
         except AttributeError:
             article_blurbs = []
         if not article_blurbs:
-            raise ValueError(f'Failed to find article blurbs for slug: {slug}')
+            raise ValueError('Failed to find article blurbs.')
         for article_blurb in article_blurbs:
-            if (article := self.parse_article_blurb(article_blurb)):
-                log(article)
+            if article := self.parse_article_blurb(article_blurb):
+                _log(article)
                 yield article
 
     def parse_index(self):
         feeds = []
-        for section_title, slug in WEB_SECTIONS:
-            url = f'{self.main_url}/{slug}'
+        for section_url, section_title in self.get_web_sections(self.main_url):
             try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(section_url)
             except Exception:
-                self.log.error(f'Failed to download section: {url}')
+                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
                 continue
-            self.log(f'Found section: {section_title}')
-            articles = list(self.parse_web_section(soup, slug))
+            articles = list(self.parse_web_section(soup))
             if articles:
                 feeds.append((section_title, articles))
         if not feeds:
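
A minimal standalone sketch of the section-discovery logic added in get_web_sections, runnable outside calibre. The menu markup below is hypothetical; the real recipe obtains its soup from the Fokus front page via self.index_to_soup(main_url) and logs via self.log instead of print.

    from bs4 import BeautifulSoup

    # Hypothetical markup mirroring the structure the recipe expects: 'menu-item-type-taxonomy'
    # items are readable sections; other menu items (e.g., the podcast link) are skipped.
    html = '''
    <ul>
      <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
      <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
      <li class="menu-item-type-custom"><a href="https://www.fokus.se/podcast/">Podcast</a></li>
    </ul>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    unique_urls = set()
    sections = []
    for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
        a_tag = li_tag.find('a')
        url = a_tag.get('href').rstrip('/')
        if url in unique_urls:
            continue  # Duplicate menu entry; this URL is already recorded.
        unique_urls.add(url)
        sections.append((url, a_tag.text.strip()))

    print(sections)  # [('https://www.fokus.se/aktuellt', 'Aktuellt')]

The duplicate 'Aktuellt' entry collapses via the URL set, and 'Podcast' is excluded because its <li> lacks the taxonomy class, matching the class-subsetting rationale in the new function's comments.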