#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic '
'''
www.gva.be
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class GazetvanAntwerpen(BasicNewsRecipe):
    """Calibre news recipe for Gazet van Antwerpen (www.gva.be).

    The class is declarative: the attributes below configure the
    ``BasicNewsRecipe`` base class (see the calibre recipe API for the
    meaning of each setting), and ``preprocess_html`` strips a few
    leftover attributes from every downloaded page.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Gazet van Antwerpen'
    __author__ = 'Darko Miletic'
    description = 'News from Belgium in Dutch'
    publisher = 'Mediahuis'
    category = 'news, politics, Belgium'
    language = 'nl_BE'
    masthead_url = 'http://2.gvacdn.be/extra/assets/img/gazet-van-antwerpen-red.svg'

    # --- Download / conversion behaviour --------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    # (section title, feed URL) pairs, one per section of the site.
    feeds = [
        ('Stad & Regio', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio'),
        ('Economie', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie'),
        ('Binnenland', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland'),
        ('Buitenland', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland'),
        ('Media & Cultuur', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur'),
        ('Sport', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport'),
    ]

    # Tag matchers for the parts of the article page that are kept.
    keep_only_tags = [
        dict(name='header', attrs={'class': 'article__header'}),
        dict(name='footer', attrs={'class': 'article__meta'}),
        dict(name='div', attrs={
             'class': ['article', 'article__body', 'slideshow__intro']}),
        dict(name='figure', attrs={'class': 'article__image'}),
    ]

    # Tag matchers for elements that are dropped: embedded media, notes,
    # share links, slideshow controls, button-like anchors, video figures.
    remove_tags = [
        dict(name=['embed', 'object']),
        dict(name='div', attrs={'class': ['note NotePortrait', 'note']}),
        dict(name='ul', attrs={'class': re.compile('article__share')}),
        dict(name='div', attrs={'class': 'slideshow__controls'}),
        dict(name='a', attrs={'role': 'button'}),
        dict(name='figure', attrs={'class': re.compile('video')}),
    ]

    remove_attributes = ['width', 'height']

    def preprocess_html(self, soup):
        """Strip the body ``onload`` handler and all inline ``style`` attributes.

        :param soup: parsed document for one downloaded page; modified in place.
        :return: the same ``soup`` object.
        """
        body = soup.body
        # A page without a <body> leaves ``soup.body`` as None; deleting an
        # item on None would raise TypeError, so skip the cleanup in that case.
        if body is not None:
            del body['onload']
        for item in soup.findAll(style=True):
            del item['style']
        return soup