diff --git a/recipes/gva_be.recipe b/recipes/gva_be.recipe
index 6e3fd45718..639565447f 100644
--- a/recipes/gva_be.recipe
+++ b/recipes/gva_be.recipe
@@ -1,12 +1,16 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals, division, absolute_import, print_function
 
 __license__ = 'GPL v3'
 __copyright__ = '2009, Darko Miletic '
+
 '''
 www.gva.be
 '''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
 
 class GazetvanAntwerpen(BasicNewsRecipe):
     title = 'Gazet van Antwerpen'
@@ -14,50 +18,45 @@ class GazetvanAntwerpen(BasicNewsRecipe):
     description = 'News from Belgium in Dutch'
     publisher = 'Gazet van Antwerpen'
     category = 'news, politics, Belgium'
+    language = 'nl_BE'
+
     oldest_article = 2
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    encoding = 'utf-8'
-    language = 'nl_BE'
+    remove_javascript = True
 
-    lang = 'nl-BE'
-    direction = 'ltr'
-
-    html2lrf_options = [
-        '--comment', description
-        , '--category', category
-        , '--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'
-
-    keep_only_tags = [dict(name='div', attrs={'id':'article'})]
-    remove_tags = [
-        dict(name=['embed','object'])
-        , dict(name='div', attrs={'class':['note NotePortrait','note']})
-    ]
-    remove_tags_after = dict(name='span', attrs={'class':'author'})
+    masthead_url = 'http://2.gvacdn.be/extra/assets/img/gazet-van-antwerpen-red.svg'
 
     feeds = [
-        (u'Binnenland', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland')
-        ,(u'Buitenland', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland')
-        ,(u'Stad & Regio', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio')
-        ,(u'Economie', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie')
-        ,(u'Media & Cultur', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
-        ,(u'Wetenschap', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/wetenschap')
-        ,(u'Sport', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport')
-    ]
+        ('Stad & Regio', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio'),
+        ('Economie', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie'),
+        ('Binnenland', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland'),
+        ('Buitenland', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland'),
+        ('Media & Cultuur', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur'),
+        ('Sport', 'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport')
+    ]
+
+    keep_only_tags = [
+        dict(name='header', attrs={'class':'article__header'}),
+        dict(name='footer', attrs={'class':'article__meta'}),
+        dict(name='div', attrs={'class':['article', 'article__body', 'slideshow__intro']}),
+        dict(name='figure', attrs={'class':'article__image'})
+    ]
+
+    remove_tags = [
+        dict(name=['embed', 'object']),
+        dict(name='div', attrs={'class':['note NotePortrait', 'note']}),
+        dict(name='ul', attrs={'class':re.compile('article__share')}),
+        dict(name='div', attrs={'class':'slideshow__controls'}),
+        dict(name='a', attrs={'role':'button'}),
+        dict(name='figure', attrs={'class':re.compile('video')})
+    ]
+
+    remove_attributes = ['width', 'height']
 
     def preprocess_html(self, soup):
         del soup.body['onload']
         for item in soup.findAll(style=True):
            del item['style']
-        soup.html['lang'] = self.lang
-        soup.html['dir' ] = self.direction
-        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
-        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
-        soup.head.insert(0,mlang)
-        soup.head.insert(1,mcharset)
         return soup
-
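
Note: a quick way to smoke-test the updated recipe locally is calibre's ebook-convert in test mode (this assumes calibre's command-line tools are installed; the output filename is arbitrary):

    ebook-convert gva_be.recipe gva_be.epub --test -vv

--test limits the download to a couple of feeds and articles, which should be enough to confirm that the new keep_only_tags/remove_tags selectors still match the current www.gva.be article markup.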