From 417cf783a9024ff348ee2c7c9a803390d9ea0fa6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 25 Apr 2016 07:15:17 +0530
Subject: [PATCH] Update Berliner Zeitung and Tagesspiegel

---
 recipes/berliner_zeitung.recipe | 64 +++++++++++++++++----------------
 recipes/tagesspiegel.recipe     | 52 ++++++++++-----------------
 2 files changed, 51 insertions(+), 65 deletions(-)
diff --git a/recipes/berliner_zeitung.recipe b/recipes/berliner_zeitung.recipe
index c4190439c7..1c00a9f7b0 100644
--- a/recipes/berliner_zeitung.recipe
+++ b/recipes/berliner_zeitung.recipe
@@ -1,44 +1,46 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
-'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
+def classes(classes):  # spec matching tags whose class attribute shares a name with `classes`
+    wanted = frozenset(classes.split(' '))
+    return {'attrs': {'class': lambda value: value and wanted.intersection(value.split())}}
 
-class SportsIllustratedRecipe(BasicNewsRecipe) :
-    __author__    = 'a.peter'
-    __copyright__ = 'a.peter'
-    __license__   = 'GPL v3'
+class BerlinerZeitung(BasicNewsRecipe) :  # calibre recipe for the Berliner Zeitung RSS feeds
+    title         = 'Berliner Zeitung'
+    __author__    = 'Kovid Goyal'
     language      = 'de'
     description   = 'Berliner Zeitung RSS'
-    version       = 4
-    title         = u'Berliner Zeitung'
     timefmt       = ' [%d.%m.%Y]'
+    ignore_duplicate_articles = {'title', 'url'}  # NOTE(review): presumably de-duplicates by title or URL; confirm in calibre docs
+    remove_empty_feeds = True
 
-    #oldest_article = 7.0
+    # oldest_article = 7.0
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
     publication_type = 'newspaper'
 
-    remove_tags_before = dict(name='div', attrs={'class':'newstype'})
-    remove_tags_after = [dict(id='article_text')]
+    keep_only_tags = [  # specs built by classes(): match any of the listed class names
+        classes('dm_article_body dm_article_header'),
+    ]
+    remove_tags = [  # same spec format as keep_only_tags
+        classes('dm_article_share'),
+    ]
 
-    feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
-             (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
-             (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
-             (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
-             (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
-             (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
-             (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
-             (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
-             (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
-             (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
-             (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
-             (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
-             (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
-             (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
-             (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
-
-    def get_masthead_url(self):
-        return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
-
-    def print_version(self, url):
-        return url.replace('.html', ',view,printVersion.html')
+    feeds = [x.split() for x in [  # each 'Title URL' string becomes a [title, url] pair; titles must be one word
+        'Berlin http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699382-asYahooFeed.xml',
+        'Brandenburg http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699570-asYahooFeed.xml',
+        'Politik http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699614-asYahooFeed.xml',
+        'Wirtschaft http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699644-asYahooFeed.xml',
+        'Sport http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23699874-asYahooFeed.xml',
+        'Kultur http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700020-asYahooFeed.xml',
+        'Panorama http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700178-asYahooFeed.xml',
+        'Wissen http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700222-asYahooFeed.xml',
+        'Digital http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700594-asYahooFeed.xml',
+        'Ratgeber http://www.berliner-zeitung.de/blueprint/servlet/xml/berliner-zeitung/23700190-asYahooFeed.xml',
+    ]]
diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe
index 7c0ccede9c..9bfad37451 100644
--- a/recipes/tagesspiegel.recipe
+++ b/recipes/tagesspiegel.recipe
@@ -1,34 +1,33 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+def classes(classes):  # spec matching tags whose class attribute shares a name with `classes`
+    wanted = frozenset(classes.split(' '))
+    return {'attrs': {'class': lambda value: value and wanted.intersection(value.split())}}
 
 class TagesspiegelRss(BasicNewsRecipe):
-    title          = u'Der Tagesspiegel'
+    title          = 'Der Tagesspiegel'
+    __author__     = 'Kovid Goyal'
     oldest_article = 1
     max_articles_per_feed = 100
     language = 'de'
     publication_type = 'newspaper'
-    auto_cleanup = True
     no_stylesheets = True
     remove_stylesheets = True
     remove_javascript = True
-    remove_empty_feeds = True
     encoding = 'utf-8'
     use_embedded_content = False
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
 
-    extra_css = '''
-                .hcf-overline{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;display:block}
-                .hcf-teaser{font-family:Verdana,Arial,Helvetica;font-size:x-small;margin-top:0}
-                h1{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
-                .hcf-caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-                .hcf-copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
-                .hcf-article{font-family:Arial,Helvetica;font-size:x-small}
-                .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
-                .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
-                .hcf-inline-left{float:left;margin-right:15px;position:relative;}
-                .hcf-inline-right{float:right;margin-right:15px;position:relative;}
-                .hcf-smart-box{font-family: Arial, Helvetica, sans-serif; font-size: xx-small; margin: 0px 15px 8px 0px; width: 300px;}
-                '''
-
-    remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
+    keep_only_tags = [
+        classes('ts-lead ts-article-body ts-intro ts-title ts-authors')
+    ]
 
     feeds          = [
         (u'Politik', u'http://www.tagesspiegel.de/contentexport/feed/politik'),
@@ -42,20 +41,5 @@ class TagesspiegelRss(BasicNewsRecipe):
         (u'Wissen', u'http://www.tagesspiegel.de/contentexport/feed/wissen')
     ]
 
-    def print_version(self, url):
-        # print url
-        u = url.find('0L0Stagesspiegel0Bde')
-        u = 'http://www.tagesspiegel.de' + url[u + 20:]
-        u = u.replace('0C', '/')
-        u = u.replace('0E', '-')
-        u = u.replace('A', '')
-        u = u.replace('0B', '.')
-        u = u.replace('.html/story01.htm', '.html')
-        url = u.split('/')
-        url[-1] = 'v_print,%s?p='%url[-1]
-        u = '/'.join(url)
-        # print u
-        return u
-
     def get_masthead_url(self):
         return 'http://www.tagesspiegel.de/images/tsp_logo/3114/6.png'