From be1002cc20fb9fde4e4c51c815f27411788fbf0d Mon Sep 17 00:00:00 2001 From: Henrik Holm Date: Sun, 27 Apr 2025 16:45:00 +0200 Subject: [PATCH 1/2] Prevent duplication --- recipes/fokus.recipe | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe index 9493541727..a4809c8a39 100644 --- a/recipes/fokus.recipe +++ b/recipes/fokus.recipe @@ -1,5 +1,7 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 +import re + from datetime import datetime, timezone from mechanize import Request @@ -314,3 +316,26 @@ class Fokus(BasicNewsRecipe): self.log(f'A total of {num_articles} articles belonging to {len(section_to_articles)} sections were kept.') return feeds + + def postprocess_html(self, soup, _, read_more_regex: re.Pattern = re.compile("^Läs även:")): + # When scraped, Fokus sometimes returns a duplicate of the article, i.e., the exact same article concatenated + # to itself. To avoid duplication in the output file, remove all
<div> tags after the <p>
tag whose content is + # "***" (the Fokus indication of the end of the article). "***" is sometimes followed by one or multiple + # "Läs även:" ("Read also:") paragraphs. If such paragraphs are encountered, do not delete them. + has_reached_end, has_reached_read_more = False, False + for div_tag in soup.find_all('div', class_='wp-block-core-paragraph'): + if has_reached_read_more: + div_tag.decompose() + elif has_reached_end and div_tag.find('strong', text=read_more_regex): + # If the end has been reached, check if this is a "Läs även:" paragraph. If yes, keep it. + continue + elif has_reached_end and not has_reached_read_more: + # If the end of the article has been reached, and if no more "Läs även:" paragraphs follow, delete this + # tag and set `has_reached_read_more` to `True`. + has_reached_read_more = True + div_tag.decompose() + elif div_tag.find('p', text='***'): + # The end of the article has been reached. + has_reached_end = True + + return soup From 0a7993c09abd5737796c490960ff513752ed661f Mon Sep 17 00:00:00 2001 From: Henrik Holm Date: Sun, 27 Apr 2025 16:45:18 +0200 Subject: [PATCH 2/2] Center images and image captions --- recipes/fokus.recipe | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/recipes/fokus.recipe b/recipes/fokus.recipe index a4809c8a39..b97dc5b07b 100644 --- a/recipes/fokus.recipe +++ b/recipes/fokus.recipe @@ -27,13 +27,17 @@ class Fokus(BasicNewsRecipe): compress_news_images = True needs_subscription = 'optional' oldest_article = 7 # days - remove_empty_feeds = True - extra_css = 'img { display: block; width: 75%; height: auto }' - use_embedded_content = False + remove_empty_feeds = True scale_news_images_to_device = True scale_news_images = (800, 600) + # Center and reduce the size of images and image captions. 
+ extra_css = ''' + img { display: block; margin: auto; width: 50%; height: auto } + div.calibre-nuked-tag-figure { font-size: small; text-align: center; } + ''' + remove_tags = [ dict(name='div', attrs={'class': 'External-ad'}), dict(name='header', attrs={'class': 'Header'}),