#!/usr/bin/env python
# vim:fileencoding=utf-8

from datetime import datetime, timezone

from calibre.web.feeds.news import BasicNewsRecipe


class NoArticles(Exception):
    pass


class Fokus(BasicNewsRecipe):
    title = 'Fokus'
    main_url = 'https://www.fokus.se'
    description = "The last 7 days of news and articles from the Swedish current-affairs magazine 'Fokus'"
    encoding = 'utf-8'
    __author__ = 'Henrik Holm (https://github.com/h-holm)'
    language = 'sv'
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'https://cdn.fokus.se/app/uploads/fokus/2022/05/12214931/fokus-logo.svg'
    no_stylesheets = True
    compress_news_images = True
    needs_subscription = 'optional'
    max_age = 7  # days
    remove_empty_feeds = True
    extra_css = 'img { display: block; width: 75%; height: auto }'

    remove_tags = [
        dict(name='div', attrs={'class': 'External-ad'}),
        dict(name='header', attrs={'class': 'Header'}),
        dict(name='div', attrs={'class': 'Header-expanded'}),
        dict(name='div', attrs={'class': 'Overlay'}),
        dict(name='div', attrs={'class': 'Search-expanded'}),
        dict(name='section', attrs={'class': 'Site__footer'}),
        dict(name='div', attrs={'class': 'Toaster'}),
        dict(name='div', attrs={'class': 'fbc-badge'}),
        dict(name='div', attrs={'class': 'Posts-by-related-cat'}),
        dict(name='div', attrs={'class': 'finite-scroll'}),
        dict(name='div', attrs={'class': 'Sidebar'}),
        dict(name='div', attrs={'id': 'single-comments'}),
        dict(name='footer', attrs={'class': 'Single__footer'}),
        dict(name='div', attrs={'class': 'Social-share'}),
        dict(name='div', attrs={'class': 'mediaconnect-paywall'}),
        dict(name='svg', attrs={'class': 'icon'}),
        dict(name='figure', attrs={'class': 'wp-block-audio'}),
    ]

    remove_tags_after = [
        dict(name='div', class_='Single__content'),
    ]

    keep_only_tags = [
        dict(name='h1', class_='Single__title'),  # Title.
        dict(name='h1', class_='Longread__title'),  # Alt. title.
        dict(name='p', class_='Single__lead'),  # Lead text.
        dict(name='p', class_='Longread__lead'),  # Alt. lead text.
        dict(name='figure', class_='Single__thumbnail'),  # Image.
        dict(name='figure', class_='Longread__thumbnail'),  # Alt. image.
        # dict(name='p', class_='Meta__author'),  # Author.
        # dict(name='time', class_='Meta__updated'),  # Last updated.
        # Main article.
        dict(name='div', class_='sesamy-protected-content'),
        dict(name='div', class_='wp-block-core-paragraph'),
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('https://www.fokus.se/auth/logga-in')
            br.select_form(name='loginForm')
            br['j_username'] = self.username
            br['j_password'] = self.password
            br.submit()
        return br

    def get_web_sections(self, main_url: str) -> dict[str, str]:
        '''Return a dict mapping section URLs to section names found at `main_url`.

        For example, if the Fokus website currently includes an 'Aktuellt' section, the dict should include an entry
        of the form: `{'https://www.fokus.se/aktuellt': 'Aktuellt'}`.

        Args:
            main_url (str): The entrypoint URL of the Fokus website.

        Returns:
            dict[str, str]: Section URLs as keys and human-readable section names as values.
        '''
        self.log(f"Identifying all sections under '{main_url}'...")
        soup = self.index_to_soup(main_url)

        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections
        # that are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
        section_urls_and_names = {}
        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
            # The <li> tag should contain an <a> anchor that in turn holds the URL and link name.
            a_tag = li_tag.find('a')
            url = a_tag.get('href').rstrip('/')
            section_name = a_tag.text.strip()

            if url in section_urls_and_names:
                # This section URL has already been extracted from another <li> tag, possibly under a different
                # section name. Keep whichever section name is longer.
                if len(section_name) > len(section_urls_and_names[url]):
                    section_urls_and_names[url] = section_name
                continue

            self.log(f"Identified section '{section_name}' at URL '{url}'.")
            section_urls_and_names[url] = section_name

        self.log(f'Identified a total of {len(section_urls_and_names)} unique sections.')
        return section_urls_and_names
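
    # The blurb parser below assumes markup roughly of the following shape. This is a sketch
    # inferred from the CSS classes the parser selects on, not a verbatim copy of the live
    # site's HTML; the slug and timestamp are hypothetical:
    #
    #   <article class="Blurb">
    #     <a href="/aktuellt/some-article">
    #       <h2 class="Blurb__title">...</h2>
    #       <time class="Blurb__date" datetime="2025-01-01T12:00:00+01:00">1 januari 2025</time>
    #       <div class="Blurb__summary">...</div>
    #       <p class="Blurb__meta">...</p>
    #     </a>
    #   </article>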

    def parse_article_blurb(self, article_blurb) -> dict[str, str] | None:
        '''Parse an <article> tag of class 'Blurb' into a dict.

        Args:
            article_blurb (Tag): An <article> tag hosting metadata and the URL of an article.

        Returns:
            dict[str, str] | None: A dict of the form `{'url': str, 'title': str, 'description': str, 'date': str}`,
            or `None` if the blurb cannot be parsed or the article is older than `self.max_age`.
        '''
        if a_tag := article_blurb.find('a', href=True):
            url = a_tag['href'].strip().rstrip('/')
            if url.startswith('/'):
                url = f'{self.main_url}{url}'

            if title_tag := a_tag.find('h2', {'class': 'Blurb__title'}):
                title = self.tag_to_string(title_tag).strip()

                if time_tag := a_tag.find('time', {'class': 'Blurb__date'}):
                    swedish_date_str = self.tag_to_string(time_tag).rstrip()

                    # Skip articles older than `self.max_age`. The 'datetime' attribute is an ISO-8601 timestamp
                    # with a UTC offset, e.g., '2025-01-01T12:00:00+01:00'.
                    datetime_str = time_tag['datetime']
                    datetime_time = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S%z')
                    now = datetime.now(timezone.utc)
                    delta = now - datetime_time
                    if delta.days > self.max_age:
                        self.log.debug(f"\tSkipping article as it is too old: '{title}'")
                        return None

                    desc = ''
                    if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}):
                        desc = self.tag_to_string(desc_tag).strip()
                    if in_cooperation_with_tag := a_tag.find('p', {'class': 'Blurb__meta'}):
                        desc += f' ({self.tag_to_string(in_cooperation_with_tag).strip()})'

                    return {'url': url, 'title': title, 'description': desc, 'date': swedish_date_str}
        return None

    def _get_article_blurbs(self, soup) -> dict[str, dict[str, str]]:
        '''Given a Fokus webpage `soup`, return a dict of unique article entries found on the page.

        The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
        of the form `{'url': str, 'title': str, 'description': str, 'date': str}`.

        Args:
            soup (BeautifulSoup): The `bs4.BeautifulSoup` soup of a Fokus webpage.

        Returns:
            dict[str, dict[str, str]]: A dict with article URLs as keys and 'article dicts' as values.
        '''

        def _log(article) -> None:
            '''Log a digestible summary of the input `article` blurb.'''
            log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
            if article.get('description'):
                log_message += f" : {article['description']}"
            self.log.debug(log_message)

        try:
            article_blurbs = soup.find_all('article', {'class': 'Blurb'})
        except AttributeError:
            article_blurbs = []

        if not article_blurbs:
            raise ValueError('Failed to identify any article blurbs.')

        parsed_blurbs = {}
        for article_blurb in article_blurbs:
            if article := self.parse_article_blurb(article_blurb):
                _log(article)
                # If an entry with the same URL already exists, keep whichever entry has the longer description.
                if article['url'] in parsed_blurbs:
                    if len(article['description']) <= len(parsed_blurbs[article['url']]['description']):
                        continue
                parsed_blurbs[article['url']] = article

        return parsed_blurbs
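
    # Illustrative return value of `_get_article_blurbs`, keyed by article URL (all values are
    # hypothetical):
    #   {
    #       'https://www.fokus.se/aktuellt/some-article': {
    #           'url': 'https://www.fokus.se/aktuellt/some-article',
    #           'title': 'Some Title',
    #           'description': 'Some summary.',
    #           'date': '1 januari 2025',
    #       },
    #   }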

    def get_article_blurbs(self, sections: dict[str, str]) -> dict[str, dict[str, str]]:
        '''Create and return a dict of all unique article blurbs found in all `sections`.

        The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
        of the form `{'url': str, 'title': str, 'description': str, 'date': str}`.

        Args:
            sections (dict[str, str]): A dict of the form `{section_url: section_name}`.

        Returns:
            dict[str, dict[str, str]]: A dict with article URLs as keys and 'article dicts' as values.
        '''
        self.log(f'Identifying all articles under all {len(sections)} sections...')

        article_blurbs = {}
        for section_url, section_title in sections.items():
            try:
                section_soup = self.index_to_soup(section_url)
            except Exception:
                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
                continue
            self.log(f"Identifying all articles under '{section_url}'...")
            for article_url, article_blurb in self._get_article_blurbs(section_soup).items():
                # If the article URL has already been encountered, keep only the article blurb with the longer
                # description string.
                if article_url not in article_blurbs:
                    article_blurbs[article_url] = article_blurb
                elif len(article_blurb['description']) > len(article_blurbs[article_url]['description']):
                    article_blurbs[article_url] = article_blurb

        self.log(f'A total of {len(article_blurbs)} articles were identified in the {len(sections)} sections.')
        return article_blurbs
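
    # The method below assigns each article to a section by repeatedly stripping the last path
    # segment off the article URL until the remainder matches a known section URL. For example
    # (hypothetical slug): 'https://www.fokus.se/aktuellt/some-article' would be trimmed to
    # 'https://www.fokus.se/aktuellt' and thereby matched to the 'Aktuellt' section; URLs that
    # never match any section fall back to the 'Home Page' section.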

    def assign_articles_to_sections(
        self,
        sections: dict[str, str],
        articles: dict[str, dict[str, str]],
    ) -> dict[str, list[dict[str, str]]]:
        '''Assign each article in `articles` to a section in `sections`.

        Args:
            sections (dict[str, str]): A dict of section URLs as keys and section titles as values.
            articles (dict[str, dict[str, str]]): A dict of article URLs as keys and article dicts as values.

        Returns:
            dict[str, list[dict[str, str]]]: A dict of the form `{section_title: list[article_dict]}`.
        '''
        self.log(f'Assigning each of the {len(articles)} articles to one of the {len(sections)} sections...')
        section_to_articles = {}
        for article_url, article_dict in articles.items():
            last_url = article_url
            while article_url not in sections and len(article_url) > len(self.main_url):
                article_url = article_url.rsplit('/', 1)[0]

                # Prevent an infinite loop.
                if article_url == last_url:
                    break
                last_url = article_url

            # If no section corresponding to the URL exists, default to the 'Home Page' section.
            section_title = sections[article_url] if article_url in sections else sections[self.main_url]
            if section_title not in section_to_articles:
                section_to_articles[section_title] = []
            section_to_articles[section_title].append(article_dict)

        # Log how many sections contained no articles younger than `self.max_age`.
        if diff := len(sections) - len(section_to_articles):
            self.log(f'{diff} sections contained no articles younger than {self.max_age} days.')

        return section_to_articles
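
    # `parse_index` must return data in calibre's standard index format: a list of
    # `(feed_title, list_of_article_dicts)` tuples, where each article dict carries at least
    # 'title' and 'url' keys. A sketch with hypothetical values:
    #   [('Aktuellt', [{'url': '...', 'title': '...', 'description': '...', 'date': '...'}]),
    #    ('Home Page', [...])]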

    def parse_index(self):
        # Identify all sections in the web version of Fokus.
        sections = self.get_web_sections(self.main_url)

        # Add an entry for the start page.
        sections[self.main_url] = 'Home Page'

        # From the section URLs and the main URL, identify all unique articles.
        articles = self.get_article_blurbs(sections)
        if not articles:
            raise NoArticles(
                f"Could not find any articles. Either the '{self.main_url}' server is experiencing issues, in which "
                'case you should try again later, or the website format has changed and the recipe needs updating.'
            )

        # Assign each identified article to a section based on its URL.
        section_to_articles = self.assign_articles_to_sections(sections, articles)

        # Convert to the expected `list[tuple[str, list[dict[str, str]]]]` format.
        feeds = list(section_to_articles.items())
        num_articles = sum(len(article_dicts) for article_dicts in section_to_articles.values())
        self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.')

        return feeds
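
# A recipe like this can be tested from the command line via calibre's conversion tool. Assuming
# the file is saved as 'fokus.recipe' (hypothetical filename), something like:
#   ebook-convert fokus.recipe fokus.epub --username=... --password=...
# should fetch and convert the feeds; the credential flags are only needed for subscriber content.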