calibre/recipes/volksrant.recipe
2023-07-12 17:47:24 +05:30

98 lines
3.5 KiB
Python

#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
class Volkskrant(BasicNewsRecipe):
    """Calibre recipe that collects the articles of today's Volkskrant edition.

    The index is read from the ``/editie/vandaag/`` page (requested through the
    site's privacy-wall "accept" URL); every ``section--horizontal`` section on
    that page becomes one feed, and every ``<article>`` teaser inside it that
    links to an ``/editie/`` URL becomes one article.
    """

    # Origin used to turn root-relative links and image sources into absolute URLs.
    _SITE_URL = 'https://www.volkskrant.nl'

    title = 'Volkskrant'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    language = 'nl'
    description = 'Volkskrant - Nieuws, achtergronden en columns'
    needs_subscription = False
    resolve_internal_links = True

    # Keep only the element with id "main-content" of each article page.
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'article-footer__sharing',
                    'artstyle__editorial-tips',
                    'artstyle__advertisement',
                    'artstyle__container__icon',
                    'artstyle__disabled-embed',
                    'container__title__icon',
                ]
            }
        ),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def _absolute_url(self, url):
        """Return *url* as an absolute URL.

        Protocol-relative URLs (``//host/path``) get an ``https:`` scheme,
        root-relative URLs (``/path``) get the site origin prepended, and
        anything else (including an empty string) is returned unchanged.
        """
        if url.startswith('//'):
            return 'https:' + url
        if url.startswith('/'):
            return self._SITE_URL + url
        return url

    def _tag_text(self, tag):
        """Return the stripped text of *tag*, or ``''`` when *tag* is ``None``."""
        if tag is None:
            return ''
        return self.tag_to_string(tag).strip()

    def _teaser_text(self, header, heading_name, span_class):
        """Return the text of ``<heading_name><span class=span_class>`` in *header*.

        Any missing level (header, heading or span) yields ``''`` instead of
        raising, so a single oddly-shaped teaser cannot abort the whole index.
        """
        if header is None:
            return ''
        heading = header.find(heading_name)
        if heading is None:
            return ''
        return self._tag_text(heading.find('span', attrs={'class': span_class}))

    def _article_from_teaser(self, art):
        """Build the article dict for one ``<article>`` teaser.

        Returns ``None`` when the teaser has no usable link, does not point at
        an ``/editie/`` URL, is labelled as a podcast, or has no title text.
        """
        link = art.find('a')
        if link is None:
            return None
        href = link.get('href')
        if not href:
            return None
        url = self._absolute_url(href)
        if '/editie/' not in url:
            return None

        header = link.find('header')
        teaser_label = self._teaser_text(header, 'h4', 'teaser__label')
        teaser_sublabel = self._teaser_text(header, 'h4', 'teaser__sublabel')
        teaser_title = self._teaser_text(
            header, 'h3', 'teaser__title__value--short'
        )
        if teaser_label.lower() == "podcast":
            return None

        # Title is "LABEL • sublabel • title", leaving out whichever are empty.
        parts = [
            part
            for part in (teaser_label.upper(), teaser_sublabel, teaser_title)
            if part
        ]
        if not parts:
            # None of the expected spans were present; fall back to the
            # complete link text rather than emitting an untitled entry.
            fallback = self._tag_text(link)
            if not fallback:
                return None
            parts.append(fallback)

        return dict(
            title=' \u2022 '.join(parts),
            url=url,
            date='',
            description='',
            content='',
        )

    def parse_index(self):
        """Return today's edition as a list of ``(section_title, articles)``.

        Each article is a dict with the keys ``title``, ``url``, ``date``,
        ``description`` and ``content``; the last three are always empty
        strings.
        """
        # A fresh random authId is sent with every request; presumably this is
        # what lets the privacy-wall redirect through to the edition page.
        soup = self.index_to_soup(
            self._SITE_URL
            + '/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId='
            + str(uuid.uuid4())
        )
        sections = []
        for container in soup.findAll(
            'section', attrs={'class': 'section--horizontal'}
        ):
            section_title = self._tag_text(container.find('h2'))
            articles = []
            for art in container.findAll('article'):
                article = self._article_from_teaser(art)
                if article is not None:
                    articles.append(article)
            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        """Rewrite relative ``<img src>`` values to absolute URLs.

        Images without a ``src`` attribute, or with an empty one, are left
        untouched.
        """
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = self._absolute_url(img['src'])
        return soup