# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


class pravo(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the pravo.novinky.cz archive.

    The section pages listed in ``feeds`` are HTML pages, not RSS feeds, so
    ``parse_index`` is overridden to collect article links from each page.
    """

    __author__ = 'bubak'
    title = 'Právo'
    language = 'cs'

    remove_tags_before = dict(name='div', attrs={'class': 'rubrika-ostat'})
    remove_tags_after = dict(name='td', attrs={'class': 'rubrika'})
    remove_tags = [
        dict(name='td', attrs={'width': '273'}),
        dict(name='td', attrs={'class': 'rubrika'}),
        dict(name='div', attrs={'class': 'rubrika-ostat'}),
    ]
    extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
    cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
    cover_margins = (0, 100, '#ffffff')
    conversion_options = {'linearize_tables': True}

    no_stylesheets = True

    # our variables
    # Titles already emitted; used to drop articles repeated across sections.
    # Kept as a class attribute for backward compatibility, but parse_index()
    # rebinds a fresh set on the instance at the start of every run.
    seen_titles = set()
    # only yesterday's articles are online
    parent_url = 'http://pravo.novinky.cz/minule/'
    feeds = [
        ('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
        ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
        ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
        ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php'),
    ]

    def parse_index(self):
        """Return one ``(feed_title, articles)`` tuple per entry in ``feeds``."""
        # Start every run with an empty de-duplication set so titles seen by
        # another instance (or an earlier run) do not suppress articles here.
        self.seen_titles = set()
        return [self.parse_page(feed) for feed in self.feeds]

    def parse_page(self, title_and_url):
        """Collect the article links found on one section page.

        :param title_and_url: ``(feed_title, url)`` pair taken from ``feeds``.
        :return: ``(feed_title, articles)`` where ``articles`` is a list of
            dicts with the keys ``title``, ``url``, ``description`` and
            ``date``. The list is empty when the page has no usable links.
        """
        feed_title, page_url = title_and_url
        soup = self.index_to_soup(page_url)
        links = soup.findAll('a', attrs={'class': 'nadpis'})
        if not links:
            # findAll returns an empty list (never None) when nothing matches.
            # Report it, but do not let one empty section abort the download.
            self.log.warn('Could not find any articles on page ' + page_url)
            return (feed_title, [])

        articles = []
        for link in links:
            # .string is None when the anchor contains nested markup; fall
            # back to the concatenated text of the whole tag in that case.
            title = link.string or self.tag_to_string(link)
            if not title or title in self.seen_titles:
                continue
            url = link.get('href')
            if not url:
                # An anchor without a target cannot be downloaded.
                continue
            self.seen_titles.add(title)
            if not url.startswith('http'):
                url = self.parent_url + url
            self.log('\tFound article:', title, 'at', url)
            articles.append({
                'title': title,
                'url': url,
                'description': '',
                'date': '',
            })
        return (feed_title, articles)