Dynamically identify all unique web sections
parent ee5b24a937
commit 6db1fe5af4
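For reference, the idea behind this change can be illustrated outside the recipe framework. The following is a minimal standalone sketch, not part of the commit: it fetches the Fokus front page and collects section menu entries the same way the new get_web_sections() method in the diff below does. The function name discover_sections and the urllib fetch are illustrative assumptions; the 'menu-item-type-taxonomy' selector and the URL deduplication mirror the diff.

# Standalone sketch of the dynamic section discovery introduced here:
# scrape the Fokus navigation menu instead of keeping a hardcoded
# WEB_SECTIONS list. Assumes the same markup as the recipe targets.
from urllib.request import urlopen

from bs4 import BeautifulSoup


def discover_sections(main_url: str = 'https://www.fokus.se') -> list[tuple[str, str]]:
    soup = BeautifulSoup(urlopen(main_url).read(), 'html.parser')
    seen = set()
    sections = []
    for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
        a_tag = li_tag.find('a')
        if a_tag is None or not a_tag.get('href'):
            continue
        url = a_tag['href'].rstrip('/')
        if url in seen:  # the same section can appear in more than one menu
            continue
        seen.add(url)
        sections.append((url, a_tag.text.strip()))
    return sections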
@@ -2,30 +2,9 @@
 # vim:fileencoding=utf-8
 from datetime import datetime, timezone
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from bs4 import BeautifulSoup
 
-WEB_SECTIONS = [
-    ('Inrikes', 'inrikes'),
-    ('Utrikes', 'utrikes'),
-    ('Aktuellt', 'aktuellt'),
-    ('Politik', 'politik'),
-    ('Ekonomi', 'ekonomi'),
-    ('Kultur', 'kultur'),
-    ('Analys', 'analys'),
-    ('Vetenskap', 'vetenskap'),
-    ('Krönikor', 'kronika'),
-    ('Opinion', 'opinion'),
-    ('Veckans Fokus', 'veckans-fokus'),
-    ('Synvinkel', 'synvinkel'),
-    ('Minnesord', 'minnesord'),
-    ('Debatt', 'debatt'),
-    ('Andra kammaren', 'andra-kammaren'),
-    ('Skuggkabinettet', 'skuggkabinettet'),
-    ('Intervju', 'intervju'),
-    ('Mötet', 'motet'),
-    ('Veckans bråk', 'veckans-brak'),
-    ('Johans Blogg', 'johans-blogg'),
-]
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
             br.submit()
         return br
 
+    def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
+        """Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
+
+        For example, if the Fokus website currently includes an 'Aktuellt' section, create a
+        `('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
+
+        Args:
+            main_url (str): The entrypoint URL of the Fokus website.
+
+        Returns:
+            list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
+        """
+        soup = self.index_to_soup(main_url)
+
+        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes
+        # sections that are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
+        unique_urls = set()
+        urls_and_section_names = list()
+        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
+            # The <li> tag contains (should contain) an <a> anchor that in turn contains the URL and link name.
+            a_tag = li_tag.find('a')
+            url = a_tag.get('href').rstrip('/')
+            link_name = a_tag.text.strip()
+
+            # Skip this <li> tag if its URL and link name have already been extracted from another <li> tag.
+            if url in unique_urls:
+                continue
+            unique_urls.add(url)
+
+            self.log(f"Identified section '{link_name}' at URL '{url}'")
+            urls_and_section_names.append((url, link_name))
+
+        self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
+        return urls_and_section_names
+
     def parse_article_blurb(self, article_blurb):
         desc = ''
         if a_tag := article_blurb.find('a', href=True):
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
                 desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
             return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
 
-    def parse_web_section(self, soup, slug):
+    def parse_web_section(self, section_soup: BeautifulSoup):
         def _log(article):
             log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
             if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
            self.log(log_message)
 
        try:
-            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
+            article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            article_blurbs = []
        if not article_blurbs:
-            raise ValueError(f'Failed to find article blurbs for slug: {slug}')
+            raise ValueError('Failed to find article blurbs.')
        for article_blurb in article_blurbs:
-            if (article := self.parse_article_blurb(article_blurb)):
-                log(article)
+            if article := self.parse_article_blurb(article_blurb):
+                _log(article)
                yield article
 
    def parse_index(self):
        feeds = []
-        for section_title, slug in WEB_SECTIONS:
-            url = f'{self.main_url}/{slug}'
+        for section_url, section_title in self.get_web_sections(self.main_url):
            try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(section_url)
            except Exception:
-                self.log.error(f'Failed to download section: {url}')
+                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
                continue
-            self.log(f'Found section: {section_title}')
-            articles = list(self.parse_web_section(soup, slug))
+            breakpoint()
+            articles = list(self.parse_web_section(soup))
            if articles:
                feeds.append((section_title, articles))
        if not feeds: