#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid


class Volkskrant(BasicNewsRecipe):
    title = 'Volkskrant'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    language = 'nl'
    description = 'Volkskrant - Nieuws, achtergronden en columns'
    needs_subscription = False
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'article-footer__sharing',
                    'artstyle__editorial-tips',
                    'artstyle__advertisement',
                    'artstyle__container__icon',
                    'artstyle__disabled-embed',
                    'container__title__icon',
                ]
            }
        ),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # Accept the privacy wall with a fresh auth id and land on the index
        # page of today's edition.
        soup = self.index_to_soup(
            'https://www.volkskrant.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId='
            + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            # Each horizontal section of the edition page becomes a feed.
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []
            for art in container.findAll('article'):
                a = art.find('a')
                if a is None:
                    continue
                url = a['href']
                if url.startswith('/'):
                    url = 'https://www.volkskrant.nl' + url
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__label'})
                ).strip()
                teaser_sublabel = self.tag_to_string(
                    header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})
                ).strip()
                teaser_title = self.tag_to_string(
                    header.find('h3').find(
                        'span', attrs={'class': 'teaser__title__value--short'}
                    )
                ).strip()
                if teaser_label.lower() == 'podcast':
                    continue
                # Build the article title as "LABEL • sublabel • title".
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                pubdate = ''
                description = ''
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=description,
                        content='',
                    )
                )
            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        # Make relative image URLs absolute so the images can be downloaded.
        for tag in soup.findAll('img', src=True):
            if tag['src'].startswith('/'):
                tag['src'] = 'https://www.volkskrant.nl' + tag['src']
        return soup
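# A minimal sketch for trying this recipe from the command line (assumes the
# Calibre CLI tools are installed and this file is saved as volkskrant.recipe;
# the filename is only an illustrative choice):
#
#   ebook-convert volkskrant.recipe .epub --test -vv
#
# --test limits the fetch to a couple of articles per feed, which keeps the
# run short while still exercising parse_index() and preprocess_html().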