calibre/recipes/sport_pl.recipe

#!/usr/bin/env  python2

__license__ = 'GPL v3'
__copyright__ = 'teepel 2012'

'''
sport.pl
'''

from calibre.web.feeds.news import BasicNewsRecipe


class sport_pl(BasicNewsRecipe):
    """Calibre news recipe for the Polish sports portal Sport.pl.

    The recipe downloads the RSS feeds listed in ``feeds``, removes the
    ``[ZDJĘCIA]`` / ``[WIDEO]`` markers from article titles and rewrites
    every article link to a ``/2029020,<id>,<id>.html`` URL on www.sport.pl.
    """

    title = 'Sport.pl'
    __author__ = 'teepel <teepel44@gmail.com>'
    language = 'pl'
    description = u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.'
    masthead_url = 'http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}

    # Only the article container is kept from each downloaded page.
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

    # Links whose href is exactly 'www.gazeta.pl' are dropped.
    remove_tags = [dict(name='a', attrs={'href': 'www.gazeta.pl'})]

    feeds = [
        (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'),
        (u'Piłka nożna',
         u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'),
        (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'),
        (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'),
        (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'),
        (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'),
        (u'Piłka ręczna',
         u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'),
        (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'),
    ]

    # Markers removed from article titles. They are unicode literals so the
    # non-ASCII marker can be matched against unicode titles without an
    # implicit ASCII decode on Python 2.
    _TITLE_MARKERS = (u'[ZDJĘCIA]', u'[WIDEO]')

    # Ordered (old, new) substitutions applied to the encoded path segment of
    # a feedsportal link. The order matters: later rules run on the output of
    # earlier ones.
    _FEEDSPORTAL_SUBSTITUTIONS = (
        ('0L0Ssport0Bpl0C', ''),
        ('0C10H', '/'),
        ('0H', ','),
        ('0I', '_'),
        ('A', ''),
    )

    def parse_feeds(self):
        """Parse the feeds and clean up the article titles.

        Every marker listed in ``_TITLE_MARKERS`` is removed from each title
        (a title may carry more than one), and whitespace left at either end
        of the title by the removal is stripped.

        Returns:
            The feed list produced by ``BasicNewsRecipe.parse_feeds`` with the
            article titles updated in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles:
                cleaned = article.title
                for marker in self._TITLE_MARKERS:
                    cleaned = cleaned.replace(marker, u'')
                article.title = cleaned.strip()
        return feeds

    def print_version(self, url):
        """Map an article URL from a feed to its ``2029020`` URL on sport.pl.

        Two link shapes are handled:

        * feedsportal links, where the second-to-last path segment holds the
          encoded sport.pl address; it is decoded with
          ``_FEEDSPORTAL_SUBSTITUTIONS`` into ``<section>/<id>,<id>,...``;
        * all other links, which are expected to end with
          ``<section>/<x>,<id>,<id>...``.

        Args:
            url: Article URL as found in the feed.

        Returns:
            ``http://www.sport.pl/<section>/2029020,<id>,<id>.html``, or
            ``url`` unchanged when it does not have one of the expected shapes
            (instead of raising ``IndexError``).
        """
        path_segments = url.split('/')
        if len(path_segments) < 2:
            return url

        if 'feedsportal' in url:
            encoded = path_segments[-2]
            for old, new in self._FEEDSPORTAL_SUBSTITUTIONS:
                encoded = encoded.replace(old, new)
            decoded_segments = encoded.split('/')
            if len(decoded_segments) < 2:
                return url
            section = decoded_segments[0]
            fields = decoded_segments[1].split(',')
            if len(fields) < 2:
                return url
            article_ids = fields[0] + ',' + fields[1]
        else:
            section = path_segments[-2]
            fields = path_segments[-1].split(',')
            if len(fields) < 3:
                return url
            # The first comma-separated field is skipped; the next two
            # identify the article.
            article_ids = fields[1] + ',' + fields[2]

        # NOTE(review): '2029020' presumably selects the print view on
        # sport.pl - confirm against the live site.
        return 'http://www.sport.pl/' + section + '/2029020,' + article_ids + '.html'