#!/usr/bin/env python
##
## Written:      March 2020
## Version:      1.1
## Last update:  2023-03-31
##

from __future__ import unicode_literals, division, absolute_import, print_function

'''
Fetch RSS-Feeds from saechsische.de
'''

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec for elements carrying any of the given classes.

    ``classes`` is a single string of class names separated by one space.
    The result is a dict with an ``attrs`` entry whose ``class`` value is a
    predicate: it returns a truthy value when the element's class attribute
    shares at least one name with the requested set.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # A missing/empty class attribute is returned unchanged (falsy);
        # otherwise the (possibly empty) intersection decides the match.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}


class Saechsische(BasicNewsRecipe):
    """Calibre news recipe for the RSS feeds of saechsische.de."""

    # --- Publication metadata -------------------------------------------
    title = 'Saechsische Zeitung'
    __author__ = 'epubli'
    description = 'RSS-Feeds von saechsische.de'
    publisher = 'SZ'
    publication_type = 'newspaper'
    language = 'de'
    encoding = 'UTF-8'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 40
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    compress_news_images = True
    compress_news_images_auto_size = 8
    scale_news_images_to_device = True
    delay = 1
    ignore_duplicate_articles = {'title', 'url'}

    cover_url = 'https://www.saechsische.de/img/logo.svg'

    # Active feeds first; the regional feeds below are disabled and can be
    # enabled by removing the leading '#'.
    feeds = [
        ('Dresden', 'feed://www.saechsische.de/rss/dresden'),
        ('Sachsen', 'feed://saechsische.de/rss/sachsen'),
        ('Deutschland und Welt', 'feed://www.saechsische.de/rss/deutschland-welt'),
        ('Politik', 'feed://www.saechsische.de/rss/politik'),
        ('Wirtschaft', 'feed://www.saechsische.de/rss/wirtschaft'),
        ('Feuilleton', 'feed://www.saechsische.de/rss/feuilleton'),
        ('Sport', 'feed://www.saechsische.de/rss/sport'),
        #('Dynamo', 'feed://www.saechsische.de/rss/dynamo'),
        #('Bautzen', 'feed://www.saechsische.de/rss/bautzen'),
        #('Bischofswerda', 'feed://www.saechsische.de/rss/bischofswerda'),
        #('Dippoldiswalde', 'feed://www.saechsische.de/rss/dippoldiswalde'),
        #('Döbeln', 'feed://www.saechsische.de/rss/doebeln'),
        #('Freital', 'feed://www.saechsische.de/rss/freital'),
        #('Großenhain', 'feed://www.saechsische.de/rss/grossenhain'),
        #('Görlitz', 'feed://www.saechsische.de/rss/goerlitz'),
        #('Kamenz', 'feed://www.saechsische.de/rss/kamenz'),
        #('Löbau', 'feed://www.saechsische.de/rss/loebau'),
        #('Meißen', 'feed://www.saechsische.de/rss/meissen'),
        #('Niesky', 'feed://www.saechsische.de/rss/niesky'),
        #('Pirna', 'feed://www.saechsische.de/rss/pirna'),
        #('Radeberg', 'feed://www.saechsische.de/rss/radeberg'),
        #('Radebeul', 'feed://www.saechsische.de/rss/radebeul'),
        #('Riesa', 'feed://www.saechsische.de/rss/riesa'),
        #('Sebnitz', 'feed://www.saechsische.de/rss/sebnitz'),
        #('Zittau', 'feed://www.saechsische.de/rss/zittau'),
    ]

    # --- Styling --------------------------------------------------------
    template_css = (
        ' .article_date { color: gray; font-family: monospace;}'
        ' .article_description { text-indent: 0pt; }'
        ' a.article { font-weight: bold; text-align:left; }'
        ' a.feed { font-weight: bold; }'
        ' .calibre_navbar { font-size: 200% !important; } '
    )

    extra_css = ' h2 {margin-top: 0em;} '

    # --- Content selection ----------------------------------------------
    keep_only_tags = [
        {'name': 'article', 'attrs': {'class': 'article-detail'}},
    ]

    remove_tags = [
        classes('article-fill'),
        {'name': 'div', 'attrs': {'class': 'related-articles'}},
        {'name': 'a', 'attrs': {'class': 'article-remember-link'}},
        {'name': 'a', 'attrs': {'href': 'https://www.saechsische.de/dresden'}},
        {'name': 'a', 'attrs': {'href': 'https://www.saechsische.de/content/newsletter-lp?utm_content=dresden_kompakt'}},
        {'name': 'div', 'attrs': {'class': 'article-detail-socials'}},
        {'name': 'div', 'attrs': {'class': 'd-desktop-none'}},
        {'name': 'div', 'attrs': {'class': 'floating-share-icon'}},
    ]

    def parse_feeds(self):
        """Parse the feeds via the parent class, then drop unwanted articles.

        An article is removed when its URL contains '/anzeige/' or
        'newsletter-dresden', or when its title contains 'Newsblog' or
        'Podcast'. Each removal is reported with ``print``.

        Returns the list of feeds produced by the parent implementation,
        with the unwanted articles removed in place.
        """
        url_markers = ('/anzeige/', 'newsletter-dresden')
        title_markers = ('Newsblog', 'Podcast')

        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            # Walk a snapshot so removing from feed.articles is safe.
            for article in list(feed.articles):
                # URL markers are checked before title markers.
                unwanted = (
                    any(marker in article.url for marker in url_markers)
                    or any(marker in article.title for marker in title_markers)
                )
                if unwanted:
                    print('Removing:', article.title)
                    feed.articles.remove(article)
        return parsed

    def preprocess_raw_html(self, raw, url):
        """Abort articles whose raw HTML contains an unwanted keyword.

        Targets newsblogs, articles requiring login and advertisements.
        For every keyword found in ``raw`` a message is printed and
        ``self.abort_article`` is called; otherwise ``raw`` is returned
        unchanged. ``url`` is not used.
        """
        unwanted_article_keywords = ('unser Newsblog', 'Zum Login', '00:00 Uhr')
        # Lazily yields matching keywords in their declared order.
        found = (kw for kw in unwanted_article_keywords if kw in raw)
        for kw in found:
            print('Skipping unwanted article with keyword(s):', kw)
            # NOTE(review): abort_article presumably raises to stop
            # processing this article - confirm against the calibre API.
            self.abort_article('Skipping unwanted article')
        return raw