#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
Recipe for Die Tagespost
'''

from calibre.web.feeds.news import BasicNewsRecipe, classes


class Tagespost(BasicNewsRecipe):
    '''Calibre news recipe that builds an issue from the Die Tagespost RSS feed.'''

    title = 'Tagespost'
    language = 'de'
    __author__ = 'unkn0wn'
    description = (
        'Die Tagespost trägt den Untertitel Katholische Wochenzeitung für Politik, Gesellschaft'
        ' und Kultur und ist eine überregionale, wöchentlich im Johann Wilhelm Naumann Verlag '
        'in Würzburg erscheinende Zeitung.'
    )
    # Default article age limit in days; can be overridden per run through the
    # 'days' recipe-specific option (see __init__).
    oldest_article = 2
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://www.die-tagespost.de/design2020/images/tp_logo_small.webp'
    remove_javascript = True

    # Only elements carrying one of these CSS classes are kept from each page.
    keep_only_tags = [
        classes('topline headline description datetime autor-name article_main')
    ]
    remove_tags = [
        # NOTE(review): 'clearfix' is not an HTML tag name — it may have been
        # meant as a CSS class; left unchanged, confirm against the site markup.
        dict(name=['source', 'svg', 'aside', 'clearfix', 'footer']),
        classes('content-box extras jwnIconTeaser behindWall'),
    ]
    remove_tags_after = [classes('abbinder-text')]

    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
    }

    def __init__(self, *args, **kwargs):
        '''Initialise the recipe and apply the optional 'days' override.

        After the base class has been initialised, a string value stored under
        'days' in ``self.recipe_specific_options`` replaces ``oldest_article``
        (converted with ``float``, so fractional days such as 0.5 work).
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    def get_cover_url(self):
        '''Return the cover image URL scraped from the ikiosk shop page.

        Returns None when the page contains no ``a.preview-cover`` link, or
        when that link has no ``href``, instead of raising on the missing
        element.
        '''
        soup = self.index_to_soup('https://www.ikiosk.de/shop/epaper/die-tagespost.html')
        link = soup.find('a', attrs={'class': 'preview-cover'})
        if link is None:
            # Page layout changed or cover preview missing: report "no cover".
            return None
        return link.get('href')

    feeds = [
        (
            'Tagespost',
            'https://www.die-tagespost.de/storage/rss/rss/die-tagespost-komplett.xml',
        ),
    ]

    extra_css = '''
        .abbinder-text, .calibre-nuked-tag-figcaption, .datetime, .autor-name, .topline { font-size:small; }
        .description { font-style: italic; }
    '''

    def preprocess_html(self, soup):
        '''Normalise article markup before conversion and return the soup.

        The first element with class 'description' is turned into a ``<p>``,
        and every ``<h2>``/``<h3>`` is demoted to ``<h4>``.
        '''
        desc = soup.find(**classes('description'))
        if desc:
            desc.name = 'p'
        for heading in soup.findAll(['h2', 'h3']):
            heading.name = 'h4'
        return soup