# calibre/recipes/pravo.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe

class pravo(BasicNewsRecipe):
    """Calibre recipe that collects articles from pravo.novinky.cz.

    The recipe does not use RSS: ``parse_index`` scrapes each section page
    listed in ``feeds`` and builds the article list from the links marked
    with the ``nadpis`` CSS class.
    """

    __author__ = 'bubak'
    title = 'Právo'
    language = 'cs'

    # Tag selectors handed to calibre's article clean-up (see the
    # BasicNewsRecipe documentation for the exact trimming semantics).
    remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'})
    remove_tags_after = dict(name='td', attrs={'class':'rubrika'})
    remove_tags = [
        dict(name='td', attrs={'width':'273'}),
        dict(name='td', attrs={'class':'rubrika'}),
        dict(name='div', attrs={'class':'rubrika-ostat'}),
    ]
    extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
    cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
    cover_margins = (0, 100, '#ffffff')
    conversion_options = {'linearize_tables' : True}

    no_stylesheets = True

    # our variables
    # Titles already emitted; shared by all section pages so an article that
    # is linked from several sections is listed only once.
    seen_titles = set()
    # only yesterday's articles are online
    parent_url = 'http://pravo.novinky.cz/minule/'
    # (section title, section page URL) pairs scraped by parse_index().
    feeds = [
            ('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
            ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
            ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
            ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php')
        ]

    def parse_index(self):
        """Build the index by scraping every section page in ``feeds``.

        Returns a list with one ``(feed_title, articles)`` tuple per entry
        of ``self.feeds``, in the same order.
        """
        return [self.parse_page(feed) for feed in self.feeds]

    def parse_page(self, feed):
        """Collect the not-yet-seen articles linked from one section page.

        ``feed`` is a ``(feed_title, url)`` pair as stored in ``self.feeds``.
        It is unpacked in the body rather than in the signature because
        tuple parameters were removed in Python 3 (PEP 3113).

        Returns ``(feed_title, articles)`` where ``articles`` is a list of
        dicts with the keys ``title``, ``url``, ``description`` and ``date``.
        The list may be empty when the page has no new article links.

        Raises ``ValueError`` if the link lookup yields ``None``.
        """
        feed_title, url = feed

        soup = self.index_to_soup(url)
        titles = soup.findAll('a', attrs={'class':'nadpis'})
        # NOTE(review): findAll normally yields an empty list rather than
        # None, so this guard is not expected to fire; an empty page simply
        # produces an empty feed.
        if titles is None:
            raise ValueError('Could not find any articles on page ' + url)

        articles = []
        for article in titles:
            title = article.string
            if title is None:
                # .string is None when the link wraps nested markup; fall
                # back to the flattened text of the tag.
                title = self.tag_to_string(article)
            if title in self.seen_titles:
                continue
            self.seen_titles.add(title)
            article_url = article['href']
            if not article_url.startswith('http'):
                # Relative link: resolve it against the archive directory.
                article_url = self.parent_url + article_url
            self.log('\tFound article:', title, 'at', article_url)
            articles.append({'title':title, 'url':article_url,
                             'description':'', 'date':''})
        # Return only after every link has been examined.
        return (feed_title, articles)