#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = u'2010, Tomasz Dlugosz '
'''
sport.interia.pl
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class InteriaSport(BasicNewsRecipe):
    """Calibre news recipe for the sport RSS channels of interia.pl.

    The class only declares configuration attributes; all fetching and
    conversion is done by the ``BasicNewsRecipe`` base class.
    """

    title = u'Interia.pl - Sport'
    description = u'Sport ze strony interia.pl'
    language = 'pl'
    oldest_article = 7
    __author__ = u'Tomasz D\u0142ugosz'
    simultaneous_downloads = 3
    no_stylesheets = True
    remove_javascript = True
    max_articles_per_feed = 100

    # (section title, RSS feed URL) pairs, one per sport category.
    feeds = [
        (u'Wydarzenia sportowe', u'http://kanaly.rss.interia.pl/sport.xml'),
        (u'Pi\u0142ka no\u017cna', u'http://kanaly.rss.interia.pl/pilka_nozna.xml'),
        (u'Siatk\xf3wka', u'http://kanaly.rss.interia.pl/siatkowka.xml'),
        (u'Koszyk\xf3wka', u'http://kanaly.rss.interia.pl/koszykowka.xml'),
        (u'NBA', u'http://kanaly.rss.interia.pl/nba.xml'),
        (u'Kolarstwo', u'http://kanaly.rss.interia.pl/kolarstwo.xml'),
        (u'\u017bu\u017cel', u'http://kanaly.rss.interia.pl/zuzel.xml'),
        (u'Tenis', u'http://kanaly.rss.interia.pl/tenis.xml'),
    ]

    # Tag selector for the article container: <div id="article">.
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

    # Tag selectors for the gallery and font-size switcher boxes.
    remove_tags = [
        dict(name='div', attrs={'class': 'object gallery'}),
        dict(name='div', attrs={'class': 'box fontSizeSwitch'}),
    ]

    # Every declaration is terminated with ';'. Previously "text-indent: 0"
    # and "margin-top: 1em" lacked one, so each merged with the following
    # "color: black" into a single invalid declaration and both were lost.
    extra_css = '''
        .articleDate {
            font-size: 0.5em;
            color: black;
        }

        .articleFoto {
            display: block;
            font-family: sans;
            font-size: 0.5em;
            text-indent: 0;
            color: black;
        }

        .articleText {
            display: block;
            margin-bottom: 1em;
            margin-left: 0;
            margin-right: 0;
            margin-top: 1em;
            color: black;
        }

        .articleLead {
            font-size: 1.2em;
        }
    '''

    # (compiled regex, replacement callable) pairs; every pattern is compiled
    # case-insensitively and with DOTALL so '.' also spans line breaks.
    #
    # NOTE(review): the pattern strings below consist only of newlines around
    # empty groups -- it looks like they once contained HTML markup that was
    # stripped before this file reached its current form. They are kept
    # exactly as found; confirm against the upstream recipe and restore the
    # intended markup before relying on them.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ('\n\n', lambda match: ''),
            # FIXME
            #(r'(\n)(.*?)()(.*?)()', lambda match: '\1\2\4'),
            ('\n\n()?(ZOBACZ|CZYTAJ) T.*?\n\n', lambda match: ''),
        ]
    ]