Dynamically identify all unique web sections
commit 6db1fe5af4
parent ee5b24a937
@@ -2,30 +2,9 @@
 # vim:fileencoding=utf-8
 from datetime import datetime, timezone
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from bs4 import BeautifulSoup
 
-WEB_SECTIONS = [
-    ('Inrikes', 'inrikes'),
-    ('Utrikes', 'utrikes'),
-    ('Aktuellt', 'aktuellt'),
-    ('Politik', 'politik'),
-    ('Ekonomi', 'ekonomi'),
-    ('Kultur', 'kultur'),
-    ('Analys', 'analys'),
-    ('Vetenskap', 'vetenskap'),
-    ('Krönikor', 'kronika'),
-    ('Opinion', 'opinion'),
-    ('Veckans Fokus', 'veckans-fokus'),
-    ('Synvinkel', 'synvinkel'),
-    ('Minnesord', 'minnesord'),
-    ('Debatt', 'debatt'),
-    ('Andra kammaren', 'andra-kammaren'),
-    ('Skuggkabinettet', 'skuggkabinettet'),
-    ('Intervju', 'intervju'),
-    ('Mötet', 'motet'),
-    ('Veckans bråk', 'veckans-brak'),
-    ('Johans Blogg', 'johans-blogg'),
-]
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class NoArticles(Exception):
@@ -95,6 +74,41 @@ class Fokus(BasicNewsRecipe):
         br.submit()
         return br
 
+    def get_web_sections(self, main_url: str) -> list[tuple[str, str]]:
+        """Return a list of tuples of (1) the URL and (2) the name of each section found on the Fokus website.
+
+        For example, if the Fokus website currently includes an 'Aktuellt' section, create a
+        `('https://www.fokus.se/aktuellt', 'Aktuellt')` tuple.
+
+        Args:
+            main_url (str): The entrypoint URL of the Fokus website.
+
+        Returns:
+            list[tuple[str, str]]: Pairs of (1) the URL and (2) the human-readable name of each Fokus section.
+        """
+        soup = self.index_to_soup(main_url)
+
+        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections
+        # that are not suited for reading, e.g., the 'Podcast' and 'Läs E-Tidningen' sections.
+        unique_urls = set()
+        urls_and_section_names = list()
+        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
+            # The <li> tag should contain an <a> anchor that in turn holds the URL and the link name.
+            a_tag = li_tag.find('a')
+            url = a_tag.get('href').rstrip('/')
+            link_name = a_tag.text.strip()
+
+            # Skip this <li> tag if its URL and link name were already extracted from another <li> tag.
+            if url in unique_urls:
+                continue
+            unique_urls.add(url)
+
+            self.log(f"Identified section '{link_name}' at URL '{url}'")
+            urls_and_section_names.append((url, link_name))
+
+        self.log(f'Identified a total of {len(urls_and_section_names)} unique sections.')
+        return urls_and_section_names
+
     def parse_article_blurb(self, article_blurb):
         desc = ''
         if a_tag := article_blurb.find('a', href=True):
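A minimal standalone sketch of the discovery technique the new get_web_sections method relies on; it runs outside calibre with only bs4 installed, and the menu markup below is illustrative rather than captured from fokus.se:

from bs4 import BeautifulSoup

# Illustrative markup mimicking the fokus.se menu structure the recipe expects.
html = '''
<ul>
  <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
  <li class="menu-item-type-taxonomy"><a href="https://www.fokus.se/aktuellt/">Aktuellt</a></li>
  <li class="menu-item-type-custom"><a href="https://www.fokus.se/podcast/">Podcast</a></li>
</ul>
'''

soup = BeautifulSoup(html, 'html.parser')
unique_urls, sections = set(), []
for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
    a_tag = li_tag.find('a')
    url = a_tag.get('href').rstrip('/')
    if url in unique_urls:
        continue  # Duplicate menu entry; its URL was already recorded.
    unique_urls.add(url)
    sections.append((url, a_tag.text.strip()))

print(sections)  # [('https://www.fokus.se/aktuellt', 'Aktuellt')]

The class filter drops the 'menu-item-type-custom' Podcast entry, and the set deduplicates the repeated 'Aktuellt' item, mirroring what the method does on the live menu.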
@@ -120,7 +134,7 @@ class Fokus(BasicNewsRecipe):
                 desc += f' ({self.tag_to_string(in_cooperation_with_tag)})'
         return {'title': title, 'url': url, 'description': desc, 'date': swedish_date_str}
 
-    def parse_web_section(self, soup, slug):
+    def parse_web_section(self, section_soup: BeautifulSoup):
         def _log(article):
             log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
             if article.get('description'):
@@ -128,27 +142,26 @@ class Fokus(BasicNewsRecipe):
                 self.log(log_message)
 
         try:
-            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
+            article_blurbs = section_soup.find_all('article', {'class': 'Blurb'})
         except AttributeError:
             article_blurbs = []
         if not article_blurbs:
-            raise ValueError(f'Failed to find article blurbs for slug: {slug}')
+            raise ValueError('Failed to find article blurbs.')
         for article_blurb in article_blurbs:
-            if (article := self.parse_article_blurb(article_blurb)):
-                log(article)
+            if article := self.parse_article_blurb(article_blurb):
+                _log(article)
                 yield article
 
     def parse_index(self):
         feeds = []
-        for section_title, slug in WEB_SECTIONS:
-            url = f'{self.main_url}/{slug}'
+        for section_url, section_title in self.get_web_sections(self.main_url):
             try:
-                soup = self.index_to_soup(url)
+                soup = self.index_to_soup(section_url)
             except Exception:
-                self.log.error(f'Failed to download section: {url}')
+                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
                 continue
             self.log(f'Found section: {section_title}')
-            articles = list(self.parse_web_section(soup, slug))
+            articles = list(self.parse_web_section(soup))
             if articles:
                 feeds.append((section_title, articles))
         if not feeds:
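For context, BasicNewsRecipe expects parse_index to return a list of (section_title, articles) pairs, where each article is a dict of the shape parse_article_blurb builds. A sketch of that return value with illustrative values:

feeds = [
    ('Aktuellt', [
        {
            'title': 'Example headline',
            'url': 'https://www.fokus.se/aktuellt/example-article',
            'description': 'Short teaser text.',
            'date': '5 juli 2025',
        },
    ]),
]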