#!/usr/bin/env python2 __license__ = 'GPL v3' __copyright__ = u'2010-2015, Tomasz Dlugosz ' ''' fakty.interia.pl ''' import re from calibre.web.feeds.news import BasicNewsRecipe class InteriaFakty(BasicNewsRecipe): title = u'Interia.pl - Fakty' description = u'Fakty ze strony interia.pl' language = 'pl' oldest_article = 1 __author__ = u'Tomasz D\u0142ugosz' no_stylesheets = True remove_javascript = True remove_empty_feeds = True use_embedded_content = False ignore_duplicate_articles = {'title', 'url'} feeds = [(u'Kraj', u'http://kanaly.rss.interia.pl/kraj.xml'), (u'\u015awiat', u'http://kanaly.rss.interia.pl/swiat.xml'), (u'Wiadomo\u015bci dnia', u'http://kanaly.rss.interia.pl/fakty.xml'), (u'Przegl\u0105d prasy', u'http://kanaly.rss.interia.pl/przeglad_prasy.xml'), (u'Wywiady', u'http://kanaly.rss.interia.pl/wywiady.xml'), (u'Ciekawostki', u'http://kanaly.rss.interia.pl/ciekawostki.xml')] keep_only_tags = [ dict(name='h1'), dict(name='div', attrs={'class': ['lead textContent fontSize-medium', 'text textContent fontSize-medium', 'source']})] remove_tags = [ dict(name='div', attrs={'class': ['embed embedAd', 'REMOVE', 'boxHeader']})] preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (r'embed embed(Left|Right|Center) articleEmbed(Audio|Wideo articleEmbedVideo|ArticleFull|ArticleTitle|ArticleListTitle|AlbumHorizontal)">', lambda match: 'REMOVE">'), # noqa (r'
', lambda match: ''), (r'