calibre/recipes/rnd.recipe
un-pogaz ed2930712d various whitespace (extra-edit)
!partial 'E203,E222,E241,E271,E272'
2025-01-24 11:14:24 +01:00

121 lines
5.2 KiB
Python

#!/usr/bin/env python
##
## Written: March 2020
## Version: 1.2
## Last update: 2024-09-26
##
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Fetch RSS-Feeds from RedaktionsNetzwerk Deutschland (rnd.de)
'''
from calibre.web.feeds.news import BasicNewsRecipe
def prefixed_classes(classes):
    """Build a tag-matching spec that selects elements by class-name prefix.

    :param classes: Whitespace-separated class-name prefixes, e.g.
        ``'ArticleHeadstyled__ArticleHeader- Adstyled__AdWrapper-'``.
    :return: A dict of the form ``{'attrs': {'class': matcher}}`` where
        ``matcher(value)`` returns True when any whitespace-separated token
        of ``value`` starts with one of the given prefixes.
    """
    # split() without an argument drops empty tokens. Splitting on a literal
    # ' ' would turn doubled/leading/trailing spaces into an empty prefix,
    # and every string starts with '' -- so everything would match.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        # x is the element's class attribute value; falsy when it is absent.
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(candidate.startswith(prefixes) for candidate in x.split())

    return {'attrs': {'class': matcher}}
class RND(BasicNewsRecipe):
    """Recipe for the RSS feeds of RedaktionsNetzwerk Deutschland (rnd.de)."""

    title = 'RedaktionsNetzwerk Deutschland'
    __author__ = 'epubli'
    description = 'RSS-Feeds von RedaktionsNetzwerk Deutschland (rnd.de)'
    publisher = 'RND'
    publication_type = 'newspaper'
    language = 'de'
    encoding = 'UTF-8'

    oldest_article = 1
    max_articles_per_feed = 40
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    compress_news_images = True
    compress_news_images_auto_size = 8
    scale_news_images_to_device = True
    delay = 1
    ignore_duplicate_articles = {'title', 'url'}

    cover_url = 'https://www.designtagebuch.de/wp-content/uploads/mediathek//2022/03/rnd-logo-bildmarke-1500x1125.jpg'

    # Class-name prefixes of the elements that make up the article itself.
    keep_only_tags = [
        prefixed_classes('ArticleHeadstyled__ArticleHeader- Articlestyled__ArticleBodyWrapper- ArticleImagestyled__ArticleImage-')
    ]

    # Class-name prefixes of elements stripped from the kept content, plus
    # every <source> tag.
    remove_tags = [
        prefixed_classes('CallToActionBasestyled__Container- ArticleDetailsstyled__ArticleDetailsToggleCheckbox-'
                         ' ArticleDetailsstyled__ArticleDetailsToggleButton- ArticleImagestyled__ArticleImageCheckbox-'
                         ' Adstyled__AdWrapper- MoreItemsBlockstyled__TitleWrapper- MoreItemsBlockstyled__MoreItemsBlock-'
                         ' ContentTeaserstyled__Article- Buttonstyled__Button-'),
        dict(name='source')
    ]

    # Active feeds; uncomment further categories to include them.
    feeds = [
        ('Politik', 'https://www.rnd.de/arc/outboundfeeds/rss/category/politik/'),
        ('Wirtschaft', 'https://www.rnd.de/arc/outboundfeeds/rss/category/wirtschaft/'),
        ('Sport', 'https://www.rnd.de/arc/outboundfeeds/rss/category/sport/'),
        ('Panorama', 'https://www.rnd.de/arc/outboundfeeds/rss/category/panorama/'),
        # ('Promis', 'https://www.rnd.de/arc/outboundfeeds/rss/category/promis/'),
        # ('Reise', 'https://www.rnd.de/arc/outboundfeeds/rss/category/reise/'),
        # ('Medien', 'https://www.rnd.de/arc/outboundfeeds/rss/category/medien/'),
        # ('Digital', 'https://www.rnd.de/arc/outboundfeeds/rss/category/digital/'),
        # ('Kultur', 'https://www.rnd.de/arc/outboundfeeds/rss/category/kultur/'),
        # ('Wissen', 'https://www.rnd.de/arc/outboundfeeds/rss/category/wissen/'),
        # ('Familie', 'https://www.rnd.de/arc/outboundfeeds/rss/category/familie/'),
        # ('Gesundheit', 'https://www.rnd.de/arc/outboundfeeds/rss/category/gesundheit/'),
        # ('Lifestyle', 'https://www.rnd.de/arc/outboundfeeds/rss/category/lifestyle/'),
        # ('Bauen & Wohnen', 'https://www.rnd.de/arc/outboundfeeds/rss/category/bauen-und-wohnen/'),
        # ('Geld & Finanzen', 'https://www.rnd.de/arc/outboundfeeds/rss/category/geld-und-finanzen/'),
        # ('Liebe & Partnerschaft', 'https://www.rnd.de/arc/outboundfeeds/rss/category/liebe-und-partnerschaft/'),
        # ('Beruf & Bildung', 'https://www.rnd.de/arc/outboundfeeds/rss/category/beruf-und-bildung/'),
        # ('E-Mobility', 'https://www.rnd.de/arc/outboundfeeds/rss/category/e-mobility/')
    ]

    # Articles are dropped from the parsed feeds when their URL contains one
    # of these fragments ...
    _unwanted_url_parts = ('/anzeige/',)
    # ... or when their title contains one of these words.
    _unwanted_title_words = ('Liveticker', 'Liveblog', 'Newsblog', 'Podcast')

    def _is_unwanted(self, article):
        """Return True if the article's URL or title contains a filtered term."""
        # URL fragments are checked before title words.
        if any(part in article.url for part in self._unwanted_url_parts):
            return True
        return any(word in article.title for word in self._unwanted_title_words)

    def parse_feeds(self):
        """Parse the feeds via the parent class, then drop unwanted articles.

        Each removed article's title is printed.

        :return: The feed list from ``BasicNewsRecipe.parse_feeds`` with the
            filtered articles removed in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy because feed.articles is mutated in the loop.
            for article in feed.articles[:]:
                if self._is_unwanted(article):
                    print('Removing:', article.title)
                    feed.articles.remove(article)
        return feeds

    def preprocess_raw_html(self, raw, url):
        """Abort articles carrying the paid-icon marker; log login keywords.

        :param raw: Raw HTML of the downloaded article.
        :param url: URL of the article (not used here).
        :return: ``raw`` unchanged.
        """
        unwantedtag = 'ArticleHeadstyled__ArticleHeadPaidIconContainer'
        if unwantedtag in raw:
            print('Skipping unwanted article with tag:', unwantedtag)
            self.abort_article('Skipping unwanted article')

        unwanted_article_keywords = ['Zum Login']
        for keyword in unwanted_article_keywords:
            if keyword in raw:
                # NOTE(review): this is logged as "Skipping", but the abort
                # below is commented out, so the article is still processed.
                # Confirm whether that is intended before re-enabling it.
                print('Skipping unwanted article with keyword(s):', keyword)
                # self.abort_article('Skipping unwanted article')
        return raw