From 417cf783a9024ff348ee2c7c9a803390d9ea0fa6 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 25 Apr 2016 07:15:17 +0530 Subject: [PATCH] Update Berliner Zeitung and Tagesspiegel --- recipes/berliner_zeitung.recipe | 64 +++++++++++++++++---------------- recipes/tagesspiegel.recipe | 52 ++++++++++----------------- 2 files changed, 51 insertions(+), 65 deletions(-) diff --git a/recipes/berliner_zeitung.recipe b/recipes/berliner_zeitung.recipe index c4190439c7..1c00a9f7b0 100644 --- a/recipes/berliner_zeitung.recipe +++ b/recipes/berliner_zeitung.recipe @@ -1,44 +1,46 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2016, Kovid Goyal + +from __future__ import (unicode_literals, division, absolute_import, + print_function) from calibre.web.feeds.recipes import BasicNewsRecipe -'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.''' +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={'class':lambda x:x and frozenset(x.split()).intersection(q)}) -class SportsIllustratedRecipe(BasicNewsRecipe) : - __author__ = 'a.peter' - __copyright__ = 'a.peter' - __license__ = 'GPL v3' +class BerlinerZeitung(BasicNewsRecipe) : + title = 'Berliner Zeitung' + __author__ = 'Kovid Goyal' language = 'de' description = 'Berliner Zeitung RSS' - version = 4 - title = u'Berliner Zeitung' timefmt = ' [%d.%m.%Y]' + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True - #oldest_article = 7.0 + # oldest_article = 7.0 no_stylesheets = True remove_javascript = True use_embedded_content = False publication_type = 'newspaper' - remove_tags_before = dict(name='div', attrs={'class':'newstype'}) - remove_tags_after = [dict(id='article_text')] + keep_only_tags = [ + classes('dm_article_body dm_article_header'), + ] + remove_tags = [ + classes('dm_article_share'), + ] - feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'), - (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'), - (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'), - (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'), - (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'), - (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'), - (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'), - (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'), - (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'), - (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'), - (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'), - (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'), - (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'), - (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'), - (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')] - - def get_masthead_url(self): - return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png' - - def print_version(self, url): - return url.replace('.html', ',view,printVersion.html') + feeds = [x.split() for x in [ + 'Berlin http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699382-asYahooFeed.xml', + 'Brandenburg http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699570-asYahooFeed.xml', + 'Politik http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699614-asYahooFeed.xml', + 'Wirtschaft http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699644-asYahooFeed.xml', + 'Sport http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699874-asYahooFeed.xml', + 'Kultur http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700020-asYahooFeed.xml', + 'Panorama http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700178-asYahooFeed.xml', + 'Wissen http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700222-asYahooFeed.xml', + 'Digital http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700594-asYahooFeed.xml', + 'Ratgeber http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700190-asYahooFeed.xml', + ]] diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe index 7c0ccede9c..9bfad37451 100644 --- a/recipes/tagesspiegel.recipe +++ b/recipes/tagesspiegel.recipe @@ -1,34 +1,33 @@ -from calibre.web.feeds.news import BasicNewsRecipe +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2016, Kovid Goyal + +from __future__ import (unicode_literals, division, absolute_import, + print_function) +from calibre.web.feeds.recipes import BasicNewsRecipe + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={'class':lambda x:x and frozenset(x.split()).intersection(q)}) class TagesspiegelRss(BasicNewsRecipe): - title = u'Der Tagesspiegel' + title = 'Der Tagesspiegel' + __author__ = 'Kovid Goyal' oldest_article = 1 max_articles_per_feed = 100 language = 'de' publication_type = 'newspaper' - auto_cleanup = True no_stylesheets = True remove_stylesheets = True remove_javascript = True - remove_empty_feeds = True encoding = 'utf-8' use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + remove_empty_feeds = True - extra_css = ''' - .hcf-overline{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;display:block} - .hcf-teaser{font-family:Verdana,Arial,Helvetica;font-size:x-small;margin-top:0} - h1{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;} - .hcf-caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .hcf-copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;} - .hcf-article{font-family:Arial,Helvetica;font-size:x-small} - .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small} - .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small} - .hcf-inline-left{float:left;margin-right:15px;position:relative;} - .hcf-inline-right{float:right;margin-right:15px;position:relative;} - .hcf-smart-box{font-family: Arial, Helvetica, sans-serif; font-size: xx-small; margin: 0px 15px 8px 0px; width: 300px;} - ''' - - remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}] + keep_only_tags = [ + classes('ts-lead ts-article-body ts-intro ts-title ts-authors') + ] feeds = [ (u'Politik', u'http://www.tagesspiegel.de/contentexport/feed/politik'), @@ -42,20 +41,5 @@ class TagesspiegelRss(BasicNewsRecipe): (u'Wissen', u'http://www.tagesspiegel.de/contentexport/feed/wissen') ] - def print_version(self, url): - # print url - u = url.find('0L0Stagesspiegel0Bde') - u = 'http://www.tagesspiegel.de' + url[u + 20:] - u = u.replace('0C', '/') - u = u.replace('0E', '-') - u = u.replace('A', '') - u = u.replace('0B', '.') - u = u.replace('.html/story01.htm', '.html') - url = u.split('/') - url[-1] = 'v_print,%s?p='%url[-1] - u = '/'.join(url) - # print u - return u - def get_masthead_url(self): return 'http://www.tagesspiegel.de/images/tsp_logo/3114/6.png'