Dynamically identify all unique web sections

Henrik Holm 2024-10-06 00:06:05 +02:00
parent ee5b24a937
commit 6db1fe5af4


@@ -2,30 +2,9 @@
# vim:fileencoding=utf-8
from datetime import datetime, timezone
from calibre.web.feeds.news import BasicNewsRecipe
from bs4 import BeautifulSoup
WEB_SECTIONS = [
('Inrikes', 'inrikes'),
('Utrikes', 'utrikes'),
('Aktuellt', 'aktuellt'),
('Politik', 'politik'),
('Ekonomi', 'ekonomi'),
('Kultur', 'kultur'),
('Analys', 'analys'),
('Vetenskap', 'vetenskap'),
('Krönikor', 'kronika'),
('Opinion', 'opinion'),
('Veckans Fokus', 'veckans-fokus'),
('Synvinkel', 'synvinkel'),
('Minnesord', 'minnesord'),
('Debatt', 'debatt'),
('Andra kammaren', 'andra-kammaren'),
('Skuggkabinettet', 'skuggkabinettet'),
('Intervju', 'intervju'),
('Mötet', 'motet'),
('Veckans bråk', 'veckans-brak'),
('Johans Blogg', 'johans-blogg'),
]
from calibre.web.feeds.news import BasicNewsRecipe
class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
br.submit()
return br
def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
"""Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
For example, if the Fokus website currently includes an 'Aktuellt' section, create a
`('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
Args:
main_url (str): The entrypoint URL of the Fokus website.
Returns:
list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
"""
soup = self.index_to_soup(main_url)
# Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections that
# are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
unique_urls = set()
urls_and_section_names = list()
for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
# Each <li> tag should contain an <a> anchor that holds the section URL and its human-readable link name.
a_tag = li_tag.find('a')
url = a_tag.get('href').rstrip('/')
link_name = a_tag.text.strip()
# Skip this <li> tag as we have already extracted its URL and link name from another <li> tag.
if url in unique_urls:
continue
unique_urls.add(url)
self.log(f"Identified section '{link_name}' at URL '{url}'")
urls_and_section_names.append((url, link_name))
self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
return urls_and_section_names
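For quick testing outside of calibre, the menu-scraping logic above can be reproduced with requests and BeautifulSoup alone. The following is a minimal sketch, not part of the recipe: it assumes the Fokus front page still renders its section menu as <li class="menu-item-type-taxonomy"> items, swaps calibre's index_to_soup for requests, and adds a guard against anchor-less menu items.

import requests
from bs4 import BeautifulSoup

def get_fokus_sections(main_url: str = 'https://www.fokus.se') -> list[tuple[str, str]]:
    # Fetch and parse the front page (the recipe itself uses self.index_to_soup).
    soup = BeautifulSoup(requests.get(main_url, timeout=30).text, 'html.parser')
    seen, sections = set(), []
    for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
        a_tag = li_tag.find('a')
        # Guard added for illustration: skip menu items without a usable anchor.
        if a_tag is None or not a_tag.get('href'):
            continue
        url = a_tag['href'].rstrip('/')
        if url in seen:
            continue
        seen.add(url)
        sections.append((url, a_tag.text.strip()))
    return sections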
def parse_article_blurb(self, article_blurb):
desc = ''
if a_tag := article_blurb.find('a', href=True):
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
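For context, parse_article_blurb reduces each blurb to the plain article dictionary that calibre's parse_index machinery consumes. A hypothetical example of one such entry, with invented values, assuming a blurb taken from the 'Inrikes' section:

example_article = {
    'title': 'Exempelrubrik',
    'url': 'https://www.fokus.se/inrikes/exempelartikel',
    'description': 'Kort ingress. (Exempelpartner)',
    'date': '5 oktober 2024',
}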
def parse_web_section(self, soup, slug):
def parse_web_section(self, section_soup: BeautifulSoup):
def _log(article):
log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
self.log(log_message)
try:
article_blurbs = soup.find_all('article', {'class': 'Blurb'})
article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
except AttributeError:
article_blurbs = []
if not article_blurbs:
raise ValueError(f'Failed to find article blurbs for slug: {slug}')
raise ValueError('Failed to find article blurbs.')
for article_blurb in article_blurbs:
if (article := self.parse_article_blurb(article_blurb)):
log(article)
if article := self.parse_article_blurb(article_blurb):
_log(article)
yield article
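parse_web_section is a generator, so parse_index below materializes it with list() per section and collects the results into the (section title, list of article dicts) pairs that BasicNewsRecipe.parse_index is expected to return. A sketch of that final structure, with placeholder values only:

feeds = [
    ('Inrikes', [
        {'title': '...', 'url': '...', 'description': '...', 'date': '...'},
    ]),
    ('Utrikes', [
        {'title': '...', 'url': '...', 'description': '...', 'date': '...'},
    ]),
]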
def parse_index(self):
feeds = []
for section_title, slug in WEB_SECTIONS:
url = f'{self.main_url}/{slug}'
for section_url, section_title in self.get_web_sections(self.main_url):
try:
soup = self.index_to_soup(url)
soup = self.index_to_soup(section_url)
except Exception:
self.log.error(f'Failed to download section: {url}')
self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
continue
self.log(f'Found section: {section_title}')
articles = list(self.parse_web_section(soup, slug))
articles = list(self.parse_web_section(soup))
if articles:
feeds.append((section_title, articles))
if not feeds: