calibre/recipes/rmf24_opinie.recipe

#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
rmf24.pl
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class RMF24_opinie(BasicNewsRecipe):
    """Calibre recipe for the opinion feeds (blogs, interviews, commentary) of rmf24.pl.

    Articles are taken from four RSS feeds; feed entries whose link
    contains ``/audio,aId`` are skipped, and the downloaded HTML is cleaned
    with a few regular expressions before it is parsed.
    """

    title = u'Rmf24.pl - Opinie'
    description = u'Blogi, wywiady i komentarze ze strony rmf24.pl'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = u'Tomasz D\u0142ugosz'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True

    # (section title, feed URL) pairs
    feeds = [(u'Blogi', u'http://www.rmf24.pl/opinie/blogi/feed'),
             (u'Kontrwywiad',
              u'http://www.rmf24.pl/opinie/wywiady/kontrwywiad/feed'),
             (u'Przes\u0142uchanie',
              u'http://www.rmf24.pl/opinie/wywiady/przesluchanie/feed'),
             (u'Komentarze', u'http://www.rmf24.pl/opinie/komentarze/feed')]

    # Only the article header and the article body container are kept.
    keep_only_tags = [
        dict(name='header', attrs={'class': 'article-header'}),
        dict(name='div', attrs={'class': 'article-container'})]

    remove_tags = [dict(name='div', attrs={'id': 'ReklamaMobile'})]

    extra_css = '''
        h1 { font-size: 1.2em; }
    '''

    # thanks to Kovid Goyal
    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        :param article: feed entry exposing a dict-style ``get``; only its
            ``'link'`` value is read.
        :return: the entry's link, or ``None`` when the entry has no link
            or the link contains ``/audio,aId``.
        """
        link = article.get('link')
        # A missing link used to raise TypeError in the membership test
        # below; an entry without a link has nothing to fetch, so skip it.
        if not link:
            return None
        # Links containing '/audio,aId' are deliberately skipped
        # (presumably audio-only items with no article text).
        if '/audio,aId' in link:
            return None
        return link

    # (compiled pattern, replacement callable) pairs; every pattern is
    # compiled case-insensitively and with '.' matching newlines.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Drop the "Zdj.cie" heading; '.' stands in for the non-ASCII letter.
            (r'<h2>Zdj.cie</h2>', lambda match: ''),
            # Collapse the tail of the class attribute of embedded media blocks
            # to the single marker 'REMOVE'.
            # NOTE(review): no rule visible in this class removes elements
            # by that marker - confirm whether one is needed.
            (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'),  # noqa
            # Strip the trailing Facebook page link, keeping the closing </div>.
            (r'<a href="http://www.facebook.com/pages/RMF24pl/.*?>RMF24.pl</a> on Facebook</div>',
             lambda match: '</div>')
        ]
    ]