# Provenance: added to calibre as recipes/nrc_handelsblad.recipe in commit
# 118632d6302de3aa0c0e430d4ba35afb62e00f5c (Kovid Goyal, Thu, 29 Mar 2012),
# subject "NRC Handelsblad (free) by veezh".
__license__ = 'GPL v3'
__copyright__ = '2012'
'''
nrc.nl
'''
from calibre.web.feeds.recipes import BasicNewsRecipe


class NRC(BasicNewsRecipe):
    """Recipe for NRC Handelsblad, built from the nrc.nl category RSS feeds.

    Articles are taken from the feeds listed in ``feeds``.  Each article
    page is reduced to its ``div.article`` element, everything after the
    element with id ``broodtekst`` is dropped, and text-only hyperlinks are
    flattened to plain text by ``preprocess_html``.
    """

    # --- Metadata -------------------------------------------------------
    title = 'NRC Handelsblad'
    __author__ = 'veezh'
    description = 'Nieuws'
    publisher = 'nrc.nl'
    category = 'news, Netherlands, world'
    language = 'nl'
    timefmt = ''

    # --- Fetch options --------------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    remove_empty_feeds = True

    # NOTE(review): nothing in this recipe reads ``filterDuplicates``.
    # calibre documents ``ignore_duplicate_articles`` for de-duplication;
    # confirm whether this attribute has any effect before relying on it.
    filterDuplicates = True

    # --- Styling --------------------------------------------------------
    # Only ``/* ... */`` is a comment in CSS.  A leading ``#`` turns the
    # rule into an ID selector (or an invalid selector), so a rule has to
    # be deleted, not ``#``-prefixed, to disable it.
    # NOTE(review): the two ID rules below are kept unchanged; confirm that
    # nrc.nl article pages actually contain elements with these ids.
    extra_css = '''
        h1{font-size:130%;}
        .bijschrift{color:#666666; font-size:x-small;}
        #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
        #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
    '''

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    # --- Article clean-up -----------------------------------------------
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
    remove_tags_after = [dict(id='broodtekst')]

    def preprocess_html(self, soup):
        """Replace every text-only hyperlink in *soup* with its plain text.

        Anchors whose ``string`` attribute is ``None`` are left untouched.

        :param soup: parsed article page; must provide ``findAll``.
        :return: the same ``soup`` object, modified in place.
        """
        for anchor in soup.findAll('a'):
            text = anchor.string
            if text is not None:
                anchor.replaceWith(text)
        return soup

    # --- Feeds ----------------------------------------------------------
    feeds = [
        # General feed, left disabled.
        # ('Nieuws', 'http://www.nrc.nl/rss.php'),
        ('Binnenland', 'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php'),
        ('Buitenland', 'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php'),
        ('Economie', 'http://www.nrc.nl/nieuws/categorie/economie/rss.php'),
        ('Wetenschap', 'http://www.nrc.nl/nieuws/categorie/wetenschap/rss.php'),
        ('Cultuur', 'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php'),
        ('Boeken', 'http://www.nrc.nl/boeken/rss.php'),
        ('Tech', 'http://www.nrc.nl/tech/rss.php/'),
        ('Klimaat', 'http://www.nrc.nl/klimaat/rss.php/'),
    ]