# calibre/recipes/tvn24.recipe

from calibre.web.feeds.news import BasicNewsRecipe
class tvn24(BasicNewsRecipe):
    """Calibre news recipe for the Polish news site TVN24 (tvn24.pl).

    Articles are pulled from the RSS feeds listed in ``feeds``; each page is
    reduced to the elements named in ``keep_only_tags`` and then cleaned up
    by ``remove_tags`` and ``preprocess_html``.
    """

    # --- Recipe metadata ---
    title = u'TVN24'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
    category = 'news'
    language = 'pl'
    #masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
    cover_url = 'http://www.userlogos.org/files/logos/Struna/TVN24.jpg'

    # --- Styling ---
    # NOTE: the backslash continuations live inside the string literal, so the
    # leading whitespace of the continuation lines is part of the CSS text.
    extra_css = 'ul {list-style:none;} \
                 li {list-style:none; float: left; margin: 0 0.15em;} \
                 h2 {font-size: medium} \
                 .date60m {float: left; margin: 0 10px 0 5px;}'

    # --- Download / clean-up switches ---
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}

    # Only these elements of an article page are kept.
    keep_only_tags = [
        dict(name='h1', attrs={'class': ['size30 mt10 pb10', 'size38 mt10 pb15']}),
        dict(name='figure', attrs={'class': 'articleMainPhoto articleMainPhotoWide'}),
        dict(name='article', attrs={'class': ['mb20', 'mb20 textArticleDefault']}),
        dict(name='ul', attrs={'class': 'newsItem'}),
    ]

    # Elements stripped out of whatever was kept above.
    remove_tags = [
        dict(name='aside', attrs={'class': ['innerArticleModule onRight cols externalContent', 'innerArticleModule center']}),
        dict(name='div', attrs={'class': ['thumbsGallery', 'articleTools', 'article right rd7', 'heading', 'quizContent']}),
        dict(name='a', attrs={'class': 'watchMaterial text'}),
        dict(name='section', attrs={'class': ['quiz toCenter', 'quiz toRight']}),
    ]

    # (section title, RSS URL) pairs. Every URL must be absolute, scheme
    # included; the 'Polska' entry previously lacked the 'http://' prefix.
    feeds = [
        (u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'),
        (u'Polska', u'http://www.tvn24.pl/polska.xml'),
        (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'),
        (u'Sport', u'http://www.tvn24.pl/sport.xml'),
        (u'Biznes', u'http://www.tvn24.pl/biznes.xml'),
        (u'Meteo', u'http://www.tvn24.pl/meteo.xml'),
        (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'),
        (u'Kultura', u'http://www.tvn24.pl/kultura.xml'),
    ]

    def preprocess_html(self, soup):
        """Strip inline styles and turn the first ``ul.newsItem`` into divs.

        Every ``style`` attribute in the document is removed. The first
        ``<ul class="newsItem">`` found, and its first ``<li>`` if it has
        one, are renamed to ``<div>``.

        :param soup: parsed article page (BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        for styled in soup.findAll(style=True):
            del styled['style']

        news_list = soup.find(name='ul', attrs={'class': 'newsItem'})
        if news_list is not None:
            news_list.name = 'div'
            # The list may have no <li> child; guard so that a missing item
            # does not raise AttributeError and abort the article.
            first_entry = news_list.li
            if first_entry is not None:
                first_entry.name = 'div'
        return soup