calibre/recipes/pravo.recipe
2012-11-18 23:58:09 +05:30

65 lines
2.3 KiB
Plaintext

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
class pravo(BasicNewsRecipe):
__author__ = 'bubak'
title = 'Právo'
language = 'cs'
remove_tags_before = dict(name='div', attrs={'class':'rubrika-ostat'})
remove_tags_after = dict(name='td', attrs={'class':'rubrika'})
remove_tags = [dict(name='td', attrs={'width':'273'})
,dict(name='td', attrs={'class':'rubrika'})
,dict(name='div', attrs={'class':'rubrika-ostat'})
]
extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
cover_margins = (0, 100, '#ffffff')
conversion_options = {'linearize_tables' : True}
no_stylesheets = True
# our variables
seen_titles = set([])
# only yesterday's articles are online
parent_url = 'http://pravo.novinky.cz/minule/'
feeds = [
('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php')
]
def parse_index(self):
articles = []
for feed in self.feeds:
articles.append(self.parse_page(feed))
return articles
def parse_page(self, (feed_title, url)):
articles = []
soup = self.index_to_soup(url)
titles = soup.findAll('a', attrs={'class':'nadpis'})
if titles is None:
raise ValueError('Could not find any articles on page ' + url)
articles = []
for article in titles:
title = article.string
if title in self.seen_titles:
continue
self.seen_titles.add(title)
url = article['href']
if not url.startswith('http'):
url = self.parent_url + url
self.log('\tFound article:', title, 'at', url)
articles.append({'title':title.string, 'url':url, 'description':'',
'date':''})
return (feed_title, articles)