#!/usr/bin/env python

__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe


class SATKurier(BasicNewsRecipe):
    """Calibre news recipe for SATKurier.pl.

    The site describes itself (see ``description``) as a service devoted to
    digital television. The recipe pulls three FeedBurner feeds and tidies
    each article page in :meth:`preprocess_html`.
    """

    # --- Publication metadata -------------------------------------------
    title = u'SATKurier.pl'
    __author__ = 'Artur Stachecki '
    language = 'pl'
    description = u'Serwis poświęcony telewizji cyfrowej'
    masthead_url = 'http://satkurier.pl/img/header_sk_logo.gif'

    # --- Download limits ------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    simultaneous_downloads = 5

    # --- Page clean-up switches -----------------------------------------
    remove_javascript = True
    no_stylesheets = True

    # Only these containers are kept from each downloaded page.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['leftNewsContainer', 'content']}),
    ]
    # Blocks stripped from inside the kept containers.
    remove_tags = [
        dict(name='div', attrs={'class': ['col-xs-20', 'coverNews', 'btn-group']}),
    ]
    # Everything following the button group is discarded.
    remove_tags_after = [dict(name='div', attrs={'class': 'btn-group'})]

    # (section title, feed URL) pairs.
    feeds = [
        (u'Najnowsze wiadomości',
         u'http://feeds.feedburner.com/satkurierpl?format=xml'),
        (u'Sport w telewizji',
         u'http://feeds.feedburner.com/satkurier/sport?format=xml'),
        (u'Blog',
         u'http://feeds.feedburner.com/satkurier/blog?format=xml'),
    ]

    @staticmethod
    def _child_index(parent, node):
        """Return the position of *node* in ``parent.contents``.

        The lookup is by identity rather than equality so that a
        structurally identical sibling is never mistaken for *node*.
        Falls back to ``len(parent.contents)`` (i.e. "append") when *node*
        is not a direct child.
        """
        for position, child in enumerate(parent.contents):
            if child is node:
                return position
        return len(parent.contents)

    def preprocess_html(self, soup):
        """Move the article thumbnail below the headline and unlink anchors.

        :param soup: parsed article page (a BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.

        The element with id ``news_mini_photo`` is relocated so that it
        directly follows the first ``<h1>``. If the page has no ``<h1>``
        the element is left where it was. Every ``<a>`` whose content is a
        single string is then replaced by that string.
        """
        image = soup.find(attrs={'id': 'news_mini_photo'})
        if image is not None:
            # Remember where the image came from so it can be restored when
            # there is no headline to attach it to.
            origin = image.parent
            origin_slot = self._child_index(origin, image)

            # Detach first: the headline search must not match an <h1>
            # nested inside the image container itself.
            image.extract()
            header = soup.find('h1')

            if header is not None:
                target = header.parent
                slot = self._child_index(target, header) + 1
            else:
                target, slot = origin, origin_slot

            # Re-insert the real node instead of a serialized string, so the
            # markup is not emitted as escaped text.
            target.insert(slot, image)

        for alink in soup.findAll('a'):
            link_text = alink.string
            if link_text is not None:
                alink.replaceWith(link_text)

        return soup