mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
63 lines
2.3 KiB
Plaintext
63 lines
2.3 KiB
Plaintext
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import unicode_literals
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class pravo(BasicNewsRecipe):
    """calibre news recipe for Právo (pravo.novinky.cz, language ``cs``).

    The entries in ``feeds`` are HTML section pages. ``parse_index``
    downloads each of them and treats every ``<a class="nadpis">`` element
    found there as a link to one article.
    """

    __author__ = 'bubak'
    title = 'Právo'
    language = 'cs'

    # Clean-up rules for the downloaded article HTML.
    remove_tags_before = dict(name='div', attrs={'class': 'rubrika-ostat'})
    remove_tags_after = dict(name='td', attrs={'class': 'rubrika'})
    remove_tags = [
        dict(name='td', attrs={'width': '273'}),
        dict(name='td', attrs={'class': 'rubrika'}),
        dict(name='div', attrs={'class': 'rubrika-ostat'}),
    ]
    extra_css = '.nadpis {font-weight: bold; font-size: 130%;} .medium {text-align: justify;}'
    cover_url = 'http://pravo.novinky.cz/images/horni_6_logo.gif'
    cover_margins = (0, 100, '#ffffff')
    conversion_options = {'linearize_tables': True}

    no_stylesheets = True

    # our variables
    # Titles already collected; parse_page() uses it to skip an article that
    # is linked from more than one section page. Defined on the class, so it
    # is shared by every instance of the recipe.
    seen_titles = set()
    # only yesterday's articles are online
    parent_url = 'http://pravo.novinky.cz/minule/'
    feeds = [
        ('Hlavní stránka', 'http://pravo.novinky.cz/minule/index.php'),
        ('Zpravodajství', 'http://pravo.novinky.cz/minule/zpravodajstvi.php'),
        ('Komentáře', 'http://pravo.novinky.cz/minule/komentare.php'),
        ('Praha a střední Čechy', 'http://pravo.novinky.cz/minule/praha_stredni_cechy.php'),
    ]

    def parse_index(self):
        """Return one ``(feed_title, articles)`` tuple per entry in ``feeds``.

        The entries are processed in order, so an article that appears on
        several section pages is listed only under the first one.
        """
        return [self.parse_page(feed) for feed in self.feeds]

    def parse_page(self, title_and_url):
        """Scrape one section page into a ``(feed_title, articles)`` tuple.

        ``title_and_url`` is a ``(feed_title, url)`` pair as stored in
        ``feeds``. ``articles`` is a list of dicts with the keys ``title``,
        ``url``, ``description`` and ``date``; the last two are always empty
        strings. Articles whose title is already in ``seen_titles`` are
        left out.
        """
        (feed_title, page_url) = title_and_url

        soup = self.index_to_soup(page_url)
        links = soup.findAll('a', attrs={'class': 'nadpis'})
        if not links:
            # findAll() yields an empty result, never None, when nothing
            # matches. Warn and return an empty feed instead of raising so
            # that one empty section page does not abort the other feeds.
            self.log.warn('Could not find any articles on page ' + page_url)

        articles = []
        for link in links:
            # .string is None when the anchor wraps nested markup; fall back
            # to the concatenated text of the tag in that case.
            title = link.string or self.tag_to_string(link)
            url = link.get('href')
            if not title or not url:
                # Nothing usable to list or to download.
                continue
            if title in self.seen_titles:
                continue
            self.seen_titles.add(title)
            if not url.startswith('http'):
                # Relative links are resolved against the section directory.
                url = self.parent_url + url
            self.log('\tFound article:', title, 'at', url)
            articles.append({'title': title, 'url': url, 'description': '',
                             'date': ''})
        return (feed_title, articles)
|