calibre/recipes/dr_dk.recipe
un-pogaz 19b9d979ab comment formating (auto-fix)
ruff 'E114,E115,E116,E261,E262,E265'
2025-01-24 11:14:21 +01:00

132 lines
5.9 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2023, Joel Davies <joeld.dev at gmail.com>
from calibre.web.feeds.news import BasicNewsRecipe
class DRNyheder(BasicNewsRecipe):
    """Calibre news recipe that builds an e-book from the DR Nyheder RSS feeds.

    The class is purely declarative apart from two hooks:

    * ``preprocess_raw_html`` aborts article types that cannot be rendered.
    * ``get_cover_url`` picks a cover image from the dr.dk front page.
    """

    # Feeds are found here: https://www.dr.dk/nyheder/dr-nyheder-som-rss-feed
    # Each entry is a (section title, feed URL) pair. The sport feeds are
    # disabled; uncomment them to include sport coverage.
    feeds = [
        ('Seneste nyt', 'https://www.dr.dk/nyheder/service/feeds/senestenyt'),
        ('Indland', 'https://www.dr.dk/nyheder/service/feeds/indland'),
        ('Udland', 'https://www.dr.dk/nyheder/service/feeds/udland'),
        ('Penge', 'https://www.dr.dk/nyheder/service/feeds/penge'),
        ('Politik', 'https://www.dr.dk/nyheder/service/feeds/politik'),
        # ('Sporten', 'https://www.dr.dk/nyheder/service/feeds/sporten'),
        # ('Seneste sport', 'https://www.dr.dk/nyheder/service/feeds/senestesport'),
        ('Viden', 'https://www.dr.dk/nyheder/service/feeds/viden'),
        ('Kultur', 'https://www.dr.dk/nyheder/service/feeds/kultur'),
        ('Musik', 'https://www.dr.dk/nyheder/service/feeds/musik'),
        ('Mit Liv', 'https://www.dr.dk/nyheder/service/feeds/mitliv'),
        ('Mad', 'https://www.dr.dk/nyheder/service/feeds/mad'),
        ('Vejret', 'https://www.dr.dk/nyheder/service/feeds/vejret'),
        ('Regionale', 'https://www.dr.dk/nyheder/service/feeds/regionale'),
        ('DR Hovedstadsområdet', 'https://www.dr.dk/nyheder/service/feeds/regionale/kbh'),
        ('DR Bornholm', 'https://www.dr.dk/nyheder/service/feeds/regionale/bornholm'),
        ('DR Syd og Sønderjylland', 'https://www.dr.dk/nyheder/service/feeds/regionale/syd'),
        ('DR Fyn', 'https://www.dr.dk/nyheder/service/feeds/regionale/fyn'),
        ('DR Midt- og Vestjylland', 'https://www.dr.dk/nyheder/service/feeds/regionale/vest'),
        ('DR Nordjylland', 'https://www.dr.dk/nyheder/service/feeds/regionale/nord'),
        ('DR Trekantområdet', 'https://www.dr.dk/nyheder/service/feeds/regionale/trekanten'),
        ('DR Sjælland', 'https://www.dr.dk/nyheder/service/feeds/regionale/sjaelland'),
        ('DR Østjylland', 'https://www.dr.dk/nyheder/service/feeds/regionale/oestjylland')
    ]

    # Publication metadata.
    title = 'DR Nyheder'
    __author__ = 'Joel Davies'
    publisher = 'DR Nyheder'
    description = 'Her finder du nyheder fra DR.'
    category = 'news, politics, money, culture, sport, science, Denmark'
    publication_type = 'newspaper'
    encoding = 'utf8'
    language = 'da'

    # Download limits and clean-up behaviour.
    oldest_article = 4  # 2 might be best
    max_articles_per_feed = 50  # 100 better, this is just for testing
    no_stylesheets = True
    use_embedded_content = False
    auto_cleanup = False
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}
    simultaneous_downloads = 20
    compress_news_images = True
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/18/DR_logo.svg/1024px-DR_logo.svg.png'

    # Styling for bylines, figures and captions. The selectors are the same
    # ``dre-*`` class names targeted by keep_only_tags / remove_tags below.
    # The text is left flush with the margin so the string stays unchanged.
    extra_css = '''
.dre-byline__contributions {
margin-bottom: 10px;
}
.dre-byline__contributions div {
display: inline;
}
.dre-byline__contribution + .dre-byline__contribution:before {
display: inline;
content: ", ";
}
.dre-standard-article__figure {
margin-bottom: 30px;
text-align: center;
}
.dre-picture {
margin-bottom: 10px;
}
.dre-picture__image {
max-width: 100%;
height: auto;
}
.dre-standard-article__figure-caption {
font-size: .85em;
color: #575757;
}
'''

    # Skip articles with /stories/ URL as these are Instagram story-style interactive pieces that play videos
    # Also DRTV as these are just links to the live TV channel
    def preprocess_raw_html(self, raw_html, url):
        """Abort unsupported article types and pass everything else through.

        :param raw_html: Article markup as supplied by the caller; it is
            returned unmodified.
        :param url: URL of the article; only checked for the ``/stories/``
            and ``/drtv/`` path fragments.
        :return: ``raw_html`` unchanged when the article is kept.
        """
        if '/stories/' in url or '/drtv/' in url:
            # NOTE(review): abort_article is expected to raise and end
            # processing of this article - confirm against the calibre API.
            self.abort_article('Skipping unsupported article type')
        return raw_html

    # Generate cover from the first image on the dr.dk homepage
    def get_cover_url(self):
        """Return the URL of the first image in the dr.dk front-page grid.

        :return: The ``src`` of the first ``<img>`` inside the
            ``ul.dre-grid-layout`` element, or ``None`` when the grid, the
            image or its ``src`` attribute is missing.
        """
        soup = self.index_to_soup('https://www.dr.dk/')
        main_content = soup.find('ul', attrs={'class': 'dre-grid-layout'})
        if main_content is None:
            # The front-page layout changed or the grid was not rendered;
            # report "no cover" rather than failing on None.find(...).
            return None
        cover_item = main_content.find('img')
        if cover_item is None:
            return None
        # .get() avoids a KeyError for <img> tags without a src attribute.
        return cover_item.get('src')

    # Elements that make up the article; anything else on the page is dropped.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'dre-article-title__heading'}),  # Title
        dict(name='div', attrs={'class': 'dre-article-byline'}),  # Author
        dict(name='figure', attrs={'class': 'dre-standard-article__figure'}),  # Comment out to remove images
        dict(name='p', attrs={'class': 'dre-article-body-paragraph'}),  # All body text of the article
        dict(name='article', attrs={'itemtype': 'http://schema.org/NewsArticle'}),
        # dict(name="h1", attrs={'class': 'hydra-latest-news-page-short-news__title'}),
        # dict(name="p", attrs={'class': 'hydra-latest-news-page-short-news__paragraph'}),
        # dict(name="div", attrs={'class': 'dre-speech'}),
        # dict(name="div", attrs={'itemprop': 'author'})
    ]

    # Elements stripped from the kept content (share widgets, metadata,
    # image sliders, byline dates and <source> tags).
    remove_tags = [
        dict(name='ol', attrs={'class': 'hydra-latest-news-page__list'}),
        dict(name='div', attrs={'class': [
            'hydra-latest-news-page-short-news__share', 'hydra-latest-news-page-short-news__a11y-container',
            'hydra-latest-news-page-short-news__meta', 'hydra-latest-news-page-short-news__image-slider', 'dre-byline__dates']}),
        dict(name='source'),
        # dict(name='menu', attrs={'class': 'share'}),
        # dict(name='menu', attrs={'class': 'dr-site-share-horizontal'}),
    ]

    # Fixes images having the wrong aspect ratio
    remove_attributes = ['width', 'height']