commit 91b477400a
Kovid Goyal 2025-05-24 08:56:19 +05:30
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8

-from datetime import datetime, timezone
-from mechanize import Request
-
+from datetime import datetime, timedelta, timezone
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
@@ -15,7 +13,7 @@ class NoArticles(Exception):
 class Fokus(BasicNewsRecipe):
     title = 'Fokus'
     main_url = 'https://www.fokus.se'
-    description = "The last 7 days of news and articles from the Swedish current-affairs magazine 'Fokus'"
+    description = "The current week's edition of Swedish current-affairs magazine 'Fokus'"
     encoding = 'utf-8'
     __author__ = 'Henrik Holm (https://github.com/h-holm)'
     language = 'sv'
@@ -24,16 +22,20 @@ class Fokus(BasicNewsRecipe):
     no_stylesheets = True
     compress_news_images = True
     needs_subscription = 'optional'
-    oldest_article = 7  # days
     use_embedded_content = False
     remove_empty_feeds = True
     scale_news_images_to_device = True
     scale_news_images = (800, 600)
+    delay = 3  # Avoid throttling by the server.

-    # Center and reduce the size of images and image captions.
+    # 1. Center and reduce the size of images and image captions.
+    # 2. Make the lead text italic.
+    # 3. Make the article metadata text gray and small.
     extra_css = '''
-        img { display: block; margin: auto; width: 50%; height: auto }
+        img { display: block; margin: auto; width: 50%; height: auto; }
         div.calibre-nuked-tag-figure { font-size: small; text-align: center; }
+        p.Single__lead, p.Longread__lead { font-style: italic; color:#202020; }
+        p.article-metadata { color: gray; font-size:small; }
     '''

     keep_only_tags = [
@@ -41,90 +43,85 @@ class Fokus(BasicNewsRecipe):
         dict(name='h1', class_='Longread__title'),  # Title of "Longread" type articles.
         dict(name='p', class_='Single__lead'),  # Lead text of "Single" type articles.
         dict(name='p', class_='Longread__lead'),  # Lead text of "Longread" type articles.
+        dict(name='p', class_='article-metadata'),  # Dynamically created by the recipe.
         dict(name='figure', class_='Single__thumbnail'),  # Image of "Single" type articles.
         dict(name='figure', class_='Longread__thumbnail'),  # Image of "Longread" type articles.
         dict(name='div', class_='sesamy-protected-content'),  # Article body.
     ]

-    def get_cover_url(self) -> str:
-        # Create a `mechanize.Request` object.
-        req = Request(url=self.main_url, method='POST')
-        # Open the requested URL in the built-in browser of the `BasicNewsRecipe` parent class.
-        browser = self.get_browser()
-        response = browser.open(req)
-        # Parse the response into a BeautifulSoup soup.
-        soup = BeautifulSoup(response.get_data(), 'html.parser')
-        # The cover image of the current edition is located in a <figure> tag with class 'Issue__thumbnail'.
-        try:
-            figure_tag = soup.find('figure', class_='Issue__thumbnail')
-            img_tag = figure_tag.find('img')
-            # Set the `img_tag` to `None` if it is falsy. This way, we can force an `AttributeError` if no cover URL
-            # can be found.
-            img_tag = img_tag if img_tag else None
-            cover_url = img_tag['src']
-        except AttributeError:
-            self.log.error("Failed to identify the cover image URL. Does an 'Issue__thumbnail' figure still exist?")
-            return ''
-        return cover_url
-
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
-        if self.username and self.password:
-            br.open('https://www.fokus.se/auth/logga-in')
-            br.select_form(name='loginForm')
-            br['j_username'] = self.username
-            br['j_password'] = self.password
-            br.submit()
-        return br
-
-    def get_web_sections(self, main_url: str) -> dict[str, str]:
-        '''Return a dict of (1) section URL and (2) section name key-value pairs found at `main_url`.
-
-        For example, if the Fokus website currently includes an 'Aktuellt' section, the dict should include an entry on
-        the form: `{'https://www.fokus.se/aktuellt': 'Aktuellt'}`.
-
-        Args:
-            main_url (str): The entrypoint URL of the Fokus website.
-
-        Yields:
-            dict[str, str]: (1) URLs and (2) human-readable names of Fokus sections.
-        '''
-        self.log(f"Identifying all sections under '{main_url}'...")
-        soup = self.index_to_soup(main_url)
-
-        # Identify all unique <li> tags of class 'menu-item-type-taxonomy'. The class subsetting excludes sections that
-        # are not suited for reading, e.g., the "Podcast" and "Läs E-Tidningen" sections.
-        section_urls_and_names = {}
-        for li_tag in soup.find_all('li', class_='menu-item-type-taxonomy'):
-            # The <li> tag contains (should contain) an <a> anchor that in turn contains the URL and link name.
-            a_tag = li_tag.find('a')
-            url = a_tag.get('href').rstrip('/')
-            section_name = a_tag.text.strip()
-
-            if url in section_urls_and_names:
-                # If this section URL has already been extracted from another <li> tag, it can be the case that the
-                # section name differs within this duplicate pair. In this case, use whichever section name is longer.
-                if len(section_name) >= len(section_urls_and_names[url]):
-                    section_urls_and_names[url] = section_name
-
-            self.log(f"Identified section '{section_name}' at URL '{url}'.")
-            section_urls_and_names[url] = section_name
-
-        self.log(f'Identified a total of {len(section_urls_and_names)} unique sections.')
-        return section_urls_and_names
+    def extract_cover_url(self, a_tag) -> str:
+        '''Given the <a> tag of the current edition, extract the URL of the highest-resolution cover image.'''
+        figure_tag = a_tag.find('figure')
+        img_tag = figure_tag.find('img')
+
+        # The `srcset` attribute contains a comma-separated list of URLs and their resolutions.
+        cover_urls = img_tag['srcset'].split(', ')
+        cover_urls = [src.split(' ') for src in cover_urls]
+
+        # The second item of each tuple should be the resolution, e.g., '578w' or '821w'. Remove the 'w' suffix, cast
+        # to an integer and sort in descending order.
+        cover_urls = [(url, int(resolution[:-1])) for url, resolution in cover_urls]
+        cover_urls = sorted(cover_urls, key=lambda x: x[1], reverse=True)
+
+        # The first item of the sorted list is now the URL of the highest-resolution image.
+        self.cover_url = cover_urls[0][0]
+        self.log(f"Identified cover URL: '{self.cover_url}'")
+        return
+
+    def get_current_edition_url(self) -> str:
+        '''Return the URL of the current (weekly) edition of Fokus.'''
+        current_year = datetime.now().year
+        try:
+            soup = self.index_to_soup(f"{self.main_url}/vara-utgavor")
+
+            # Identify all <a> tags of class 'Issue' that have an href attribute containing the current year.
+            a_tags = soup.find_all('a', class_='Issue', href=True)
+
+            # Keep only the href, and subset to only those links that contain the current year.
+            edition_links = [a_tag['href'] for a_tag in a_tags if str(current_year) in a_tag['href']]
+
+            # In order to successfully sort the links chronologically, first convert the data structure to a dict, wherein
+            # the key consists of only the date part of the URL and the value consists of the entire (unmodified) URL.
+            edition_links = {link.removesuffix('/').split('/')[-1]: link for link in edition_links}
+
+            # Then, shorten the key further by keeping only the part after the first hyphen. This removes the year and
+            # typically results in only the calendar week number remaining, e.g., '1', '21' or '52'. Note however that
+            # editions can sometimes cover multiple weeks, e.g., '1-2', '01-03' or '50-51-52'. In order to sort correctly,
+            # it is therefore necessary to additionally keep only the first part of the week number(s) after the hyphen.
+            edition_links = {key.split('-', 1)[-1].split('-', 1)[0]: value for key, value in edition_links.items()}
+
+            # Now, convert the resulting keys to integers.
+            edition_links = {int(key): value for key, value in edition_links.items()}
+
+            # Finally, sort in descending order, so that the most recent edition is first.
+            edition_links = dict(sorted(edition_links.items(), reverse=True))
+            current_edition_url = edition_links[list(edition_links.keys())[0]]
+            self.log(f"Identified {len(edition_links)} editions, of which the most recent is '{current_edition_url}'.")
+
+            # Now that we know the URL of the current edition, we can use it to identify the cover image. The cover
+            # image URL exists in the `srcset` attribute of the <img> child tag of the <figure> child tag of the <a> tag
+            # of the current edition.
+            current_edition_a_tag = soup.find('a', class_='Issue', href=current_edition_url)
+            self.extract_cover_url(current_edition_a_tag)
+        except Exception as exc:
+            self.log.error(f"Failed to identify the current edition URL: {exc}")
+            raise NoArticles(
+                f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing "
+                'issues, in which case you should try again later, or the website format has changed and the recipe '
+                'needs updating.'
+            ) from exc
+        return current_edition_url

-    def parse_article_blurb(self, article_blurb) -> dict[str, str, str, str] | None:
+    def parse_article_blurb(self, article_blurb) -> dict[str, str, str, str, str] | None:
         '''Given a <article> tag of class 'Blurb', parse it into a dict.

         Args:
             article_blurb (Tag): An <article> tag hosting metadata and the URL of an article.

         Returns:
-            dict[str, str, str, str]: A dict on a `{'url': str, 'title': str, 'description': str, 'date': str}` format.
+            A dict on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format.
         '''
         if a_tag := article_blurb.find('a', href=True):
             url = a_tag['href'].strip().rstrip('/')
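
For reference, a minimal standalone sketch of the two parsing steps added above: picking the highest-resolution cover from a `srcset` attribute, and sorting edition links by week number. The srcset string and edition URLs below are invented sample values, not live Fokus data:

# Sketch only: sample data stands in for the live markup.
srcset = 'https://example.com/cover-578.jpg 578w, https://example.com/cover-821.jpg 821w'
candidates = [src.split(' ') for src in srcset.split(', ')]
candidates = [(url, int(resolution[:-1])) for url, resolution in candidates]
print(max(candidates, key=lambda pair: pair[1])[0])  # -> the 821w URL

edition_links = [
    'https://www.fokus.se/vara-utgavor/2025-01-02/',
    'https://www.fokus.se/vara-utgavor/2025-19/',
    'https://www.fokus.se/vara-utgavor/2025-21/',
]
# Key each link by its first week number, then keep the numerically largest key.
by_week = {link.removesuffix('/').split('/')[-1]: link for link in edition_links}
by_week = {key.split('-', 1)[-1].split('-', 1)[0]: link for key, link in by_week.items()}
by_week = {int(key): link for key, link in by_week.items()}
print(by_week[max(by_week)])  # -> the '2025-21' edition

Sorting by week number alone works here only because the recipe has already filtered the links down to the current year.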
@@ -133,47 +130,49 @@ class Fokus(BasicNewsRecipe):
             if title_tag := a_tag.find('h2', {'class': 'Blurb__title'}):
                 title = self.tag_to_string(title_tag).strip()

-            if time_tag := a_tag.find('time', {'class': 'Blurb__date'}):
-                swedish_date_str = self.tag_to_string(time_tag).rstrip()
-
-                # Skip articles older than `self.oldest_article`.
-                datetime_str = time_tag['datetime']
-                datetime_time = datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S%z')
-                now = datetime.now(timezone.utc)
-                delta = now - datetime_time
-                if delta.days > self.oldest_article:
-                    self.log.debug(f"\tSkipping article as it is too old: '{title}'")
-                    return
+            if date_tag := a_tag.find('time', {'class': 'Blurb__date'}):
+                # Results in a Swedish date format, e.g., '23 MAJ 2025'.
+                date = self.tag_to_string(date_tag).strip()
+                # Add a newline before the date to make it more readable.
+                date = f'\n{date}'
+
+            # Assign the article to its first listed category as inferred from the first <li> tag of class
+            # 'Blurb__category'. Default to 'Fokus' if no such tag is found.
+            category = 'Fokus'
+            if category_tag := a_tag.find('li', {'class': 'Blurb__category'}):
+                category = self.tag_to_string(category_tag).strip()

             desc = ''
             if desc_tag := a_tag.find('div', {'class': 'Blurb__summary'}):
                 desc = self.tag_to_string(desc_tag).strip()
+            if in_cooperation_with_tag := a_tag.find('p', {'class': 'Blurb__meta'}):
+                desc += f' ({self.tag_to_string(in_cooperation_with_tag).strip()})'

-            return {'url': url, 'title': title, 'description': desc, 'date': swedish_date_str}
+            return {'url': url, 'title': title, 'date': date, 'category': category, 'description': desc}
         return

-    def _get_article_blurbs(self, soup) -> dict[str, dict[str, str, str, str]]:
+    def get_article_blurbs(self, soup) -> dict[str, dict[str, str, str, str, str]]:
         '''Given a Fokus webpage `soup`, return a dict of unique article entries found on the page.

         The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
-        on a `{'url': str, 'title': str, 'description': str, 'date': str}` format.
+        on a `{'url': str, 'title': str, 'date': str, 'category': str, 'description': str}` format.

         Args:
             soup (BeautifulSoup): The `bs4.BeautifulSoup` soup of a Fokus webpage.

         Returns:
-            dict[str, dict[str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values.
+            dict[str, dict[str, str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values.
         '''

         def _log(article) -> None:
             '''Log a digestible summary of the input `article` blurb.'''
-            log_message = f"\t{article['title']} : {article['date']} : {article['url']}"
+            log_message = f"\t{article['title']} : {article['url']}"
             if article.get('description'):
                 log_message += f" : {article['description']}"
             self.log.debug(log_message)

-        # Identify all <article> tags of class 'Blurb' that have an href attribute.
+        self.log(f'Identifying all articles...')
         try:
             article_blurbs = soup.find_all('article', {'class': 'Blurb'})
         except AttributeError:
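
To see what the reworked parse_article_blurb now collects, here is a rough, self-contained approximation of its field extraction run against an invented snippet of 'Blurb'-style markup. The class names match the ones the recipe looks for; the Swedish text and URLs are made up, and bs4's get_text is used here instead of the recipe's tag_to_string helper:

from bs4 import BeautifulSoup

html = '''
<article class="Blurb">
  <a href="https://www.fokus.se/aktuellt/exempel-artikel/">
    <h2 class="Blurb__title">Exempelartikel</h2>
    <time class="Blurb__date">23 MAJ 2025</time>
    <ul><li class="Blurb__category">Aktuellt</li></ul>
    <div class="Blurb__summary">En kort sammanfattning.</div>
    <p class="Blurb__meta">I samarbete med Exempelbolaget</p>
  </a>
</article>
'''

a_tag = BeautifulSoup(html, 'html.parser').find('article', {'class': 'Blurb'}).find('a', href=True)
article = {
    'url': a_tag['href'].rstrip('/'),
    'title': a_tag.find('h2', {'class': 'Blurb__title'}).get_text(strip=True),
    'date': '\n' + a_tag.find('time', {'class': 'Blurb__date'}).get_text(strip=True),
    'category': a_tag.find('li', {'class': 'Blurb__category'}).get_text(strip=True),
    'description': a_tag.find('div', {'class': 'Blurb__summary'}).get_text(strip=True),
}
if meta_tag := a_tag.find('p', {'class': 'Blurb__meta'}):
    article['description'] += f' ({meta_tag.get_text(strip=True)})'
print(article)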
@@ -187,94 +186,49 @@ class Fokus(BasicNewsRecipe):
         for article_blurb in article_blurbs:
             if article := self.parse_article_blurb(article_blurb):
                 _log(article)
+                # If an entry with the same URL already exists, keep whichever entry has the longer description.
+                if article['url'] in parsed_blurbs:
+                    if len(article['description']) <= len(parsed_blurbs[article['url']]['description']):
+                        continue
                 parsed_blurbs[article['url']] = article
         return parsed_blurbs

-    def get_article_blurbs(self, sections: dict[str, str]) -> dict[str, dict[str, str, str, str]]:
-        '''Create and return a dict of all unique article blurbs found in all `sections`.
-
-        The key of a given entry in the output dictionary is the article URL. The corresponding value is a dictionary
-        on a `{'url': str, 'title': str, 'description': str, 'date': str}` format.
-
-        Args:
-            sections (dict[str, str]): A dict on a `{section_url: section_name}` format.
-
-        Returns:
-            dict[str, dict[str, str, str, str]]: A dict with article URLs as keys and 'article dicts' as values.
-        '''
-        self.log(f'Identifying all articles under all {len(sections)} sections...')
-        article_blurbs = {}
-        for section_url, section_title in sections.items():
-            try:
-                section_soup = self.index_to_soup(section_url)
-            except Exception:
-                self.log.error(f"Failed to download section '{section_title}' via URL '{section_url}'")
-                continue
-
-            self.log(f"Identifying all articles under '{section_url}'...")
-            for article_url, article_blurb in self._get_article_blurbs(section_soup).items():
-                # If the article URL has already been encountered, keep only the article blurb with the longer
-                # description string.
-                if article_url not in article_blurbs:
-                    article_blurbs[article_url] = article_blurb
-                elif len(article_blurb['description']) > len(article_blurbs[article_url]['description']):
-                    article_blurbs[article_url] = article_blurb
-
-        self.log(f'A total of {len(article_blurbs)} articles were identified in the {len(sections)} sections.')
-        return article_blurbs
-
-    def assign_articles_to_sections(
+    def convert_to_section_lists(
         self,
-        sections: dict[str, str],
-        articles: dict[str, dict[str, str, str, str]],
+        articles: dict[str, dict[str, str, str, str, str]],
     ) -> dict[str, list[dict[str, str, str, str]]]:
-        '''Assign each article in `articles` to a section in `sections`.
+        '''Convert the `articles` dict of dicts to a dict of lists; each list holds the articles of a given section.

         Args:
-            sections (dict[str, str]): A dict of section URLs as keys and section titles as values.
-            articles (dict[str, dict[str, str, str, str]]): A dict of article URLs as keys and article dicts as values.
+            articles (dict[str, dict[str, str, str, str, str]]): A dict of article URLs and article dicts.

         Returns:
             dict[str, list[dict[str, str, str, str]]]: A dict on a `{section_title: list[article_dict]}` format.
         '''
-        self.log(f'Assigning each of the {len(articles)} articles to either of the {len(sections)} sections...')
+        self.log(f'Assigning each of the {len(articles)} articles to a section...')
         section_to_articles = {}
         for article_url, article_dict in articles.items():
-            last_url = article_url
-            while article_url not in sections and len(article_url) > len(self.main_url):
-                article_url = article_url.rsplit('/', 1)[0]
-                # Prevent an infinite loop.
-                if article_url == last_url:
-                    break
-                last_url = article_url
-
-            # If no section corresponding to the URL exists, default to the 'Home Page' section.
-            section_title = sections[article_url] if article_url in sections else sections[self.main_url]
+            section_title = article_dict['category']
             if section_title not in section_to_articles:
                 section_to_articles[section_title] = []
+            # Remove the 'category' key from the article dict, as it is not needed in the final output.
+            article_dict.pop('category')
             section_to_articles[section_title].append(article_dict)

-        # Log how many sections contained no articles younger than `self.oldest_article`.
-        if diff := len(sections) - len(section_to_articles):
-            self.log(f'{diff} sections contained no articles younger than {self.oldest_article} days.')
-
         return section_to_articles

     def parse_index(self):
-        # Identify all sections in the web version of Fokus.
-        sections = self.get_web_sections(self.main_url)
-
-        # Add an entry for the start page.
-        sections[self.main_url] = 'Home Page'
+        current_edition_url = self.get_current_edition_url()
+        if not current_edition_url:
+            raise NoArticles(
+                f"Could not find the URL of the current edition. Either the '{self.main_url}' server is experiencing "
+                'issues, in which case you should try again later, or the website format has changed and the recipe '
+                'needs updating.'
+            )
+        self.log(f'Current edition URL: {current_edition_url}')
+
+        # Fetch and parse the current edition page.
+        edition_soup = self.index_to_soup(current_edition_url)

         # From the section URLs and the main URL, identify all unique articles.
-        articles = self.get_article_blurbs(sections)
+        articles = self.get_article_blurbs(edition_soup)
         if not articles:
             raise NoArticles(
                 f"Could not find any articles. Either the '{self.main_url}' server is experiencing issues, in which "
@@ -282,11 +236,7 @@ class Fokus(BasicNewsRecipe):
             )

         # Assign each identified article to a section based on its URL.
-        section_to_articles = self.assign_articles_to_sections(sections, articles)
+        section_to_articles = self.convert_to_section_lists(articles)

-        # Convert to the expected `list[tuple[str, dict[str, str, str, str]]]` format.
-        feeds = list(section_to_articles.items())
-        num_articles = sum(len(article_dicts) for article_dicts in section_to_articles.values())
-        self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.')
-        return feeds
+        # Convert to tuples.
+        return list(section_to_articles.items())