# calibre/recipes/taz_rss.recipe

__license__ = 'GPL v3'
__copyright__ = '2013, Alexander Schremmer <alex@alexanderweb.de>, Robert Riemann <robert@riemann.cc>'

import re
from calibre.web.feeds.news import BasicNewsRecipe

class TazRSSRecipe(BasicNewsRecipe):
    """Calibre news recipe for the RSS feeds of taz.de (die tageszeitung).

    The class only sets configuration attributes; it defines no methods of
    its own, so all fetching and conversion behaviour comes from
    ``BasicNewsRecipe``.
    """

    # ---- publication metadata -------------------------------------------
    __author__ = 'Alexander Schremmer, Robert Riemann'
    title = u'Taz - die Tageszeitung'
    description = u'Taz.de - die tageszeitung (Anpassung von Robert)'
    publisher = 'taz Entwicklungs GmbH & Co. Medien KG'
    category = 'news, Germany'
    publication_type = 'newspaper'
    language = 'de'
    lang = 'de-DE'
    timefmt = ' [%a, %d %b %Y]'

    # Passed on to the conversion step; reuses the values defined above.
    conversion_options = dict(publisher=publisher, language=lang)

    # ---- artwork ---------------------------------------------------------
    # alternative (disabled) masthead:
    #   http://galeria-autonomica.de/wp-content/uploads/a_taz-logo.gif
    masthead_url = u'http://upload.wikimedia.org/wikipedia/de/thumb/1/15/Die-Tageszeitung-Logo.svg/500px-Die-Tageszeitung-Logo.svg.png'
    # take the cover image that the taz homepage presents
    cover_url = 'http://www.taz.de/digitaz/.s1jpeg320'

    # ---- feed selection --------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    # articles already linked from the Schlagzeilen feed are left out of
    # the later feeds
    ignore_duplicate_articles = {'title', 'url'}

    feeds = [
        (u'Schlagzeilen', u'http://www.taz.de/!p3270;rss/'),
        (u'Politik', u'http://www.taz.de/Politik/!p2;rss/'),
        (u'Zukunft', u'http://www.taz.de/Zukunft/!p4;rss/'),
        (u'Netz', u'http://www.taz.de/Netz/!p5;rss/'),
        (u'Debatte', u'http://www.taz.de/Debatte/!p9;rss/'),
        (u'Leben', u'http://www.taz.de/Leben/!p10;rss/'),
        (u'Sport', u'http://www.taz.de/Sport/!p12;rss/'),
        (u'Wahrheit', u'http://www.taz.de/Wahrheit/!p13;rss/'),
        (u'Berlin', u'http://www.taz.de/Berlin/!p14;rss/'),
        (u'Nord', u'http://www.taz.de/Nord/!p11;rss/'),
    ]

    # ---- article clean-up ------------------------------------------------
    # the default is False, but True makes the process much faster
    no_stylesheets = True

    # keep only <div> elements whose class attribute contains the word
    # "sect_article"
    keep_only_tags = [
        {
            'name': ['div'],
            'attrs': {'class': re.compile(r".*\bsect_article\b.*")},
        },
    ]

    remove_tags = [
        {'name': ['div'], 'attrs': {'class': 'sectfoot'}},
        # the taz paywall box
        {'name': ['div'], 'attrs': {'id': 'tzi_paywall'}},
    ]

# Disabled: with article pictures (TOC thumbnails) this is super-slow on Kindle
#    def populate_article_metadata(self, article, soup, first):
#        if first and hasattr(self, 'add_toc_thumbnail'):
#            picdiv = soup.find('img')
#            if picdiv is not None:
#                self.add_toc_thumbnail(article,picdiv['src'])