# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals

import re

from calibre.web.feeds.news import BasicNewsRecipe


class insider(BasicNewsRecipe):
    """Calibre recipe that downloads the front-page articles of denikinsider.cz.

    The site requires a login; calibre supplies ``self.username`` and
    ``self.password`` because ``needs_subscription`` is set.
    """

    __author__ = 'bubak'
    title = 'Insider'
    language = 'cs'

    # Site root without a trailing slash, so that site-relative links
    # (which start with '/') can be appended directly.
    BASE_URL = 'http://www.denikinsider.cz'

    # Page furniture removed from every downloaded article.
    remove_tags = [
        dict(name='div', attrs={'class': 'article-related-content'}),
        dict(name='div', attrs={'class': 'calendar'}),
        dict(name='span', attrs={'id': 'labelHolder'}),
    ]
    no_stylesheets = True
    # Only the main text container of an article page is kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['doubleBlock textContentFormat']}),
    ]
    # Drop everything from the "T.mata:" marker to the end of the page.
    preprocess_regexps = [
        (re.compile(r'T.mata:.*', re.DOTALL | re.IGNORECASE), lambda m: ''),
    ]
    needs_subscription = True

    def get_browser(self):
        """Return a browser that is logged in to the site.

        The first form on the home page is filled in with the configured
        credentials and submitted.

        Raises:
            ValueError: if the response does not contain the logout link
                text, i.e. the login was not accepted.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.BASE_URL + '/')
        br.select_form(nr=0)
        br['login-name'] = self.username
        br['login-password'] = self.password
        raw = br.submit().read()
        # The "log out" link text is only rendered for authenticated users.
        if 'Odhlásit se'.encode('utf-8') not in raw:
            raise ValueError('Failed to login to insider.cz. '
                             'Check your username and password.')
        return br

    def parse_index(self):
        """Build the feed list from the article titles on the home page.

        Returns:
            A single-element list ``[(feed_title, articles)]`` where each
            article is a dict with ``title``, ``url``, ``description`` and
            ``date`` keys.

        Raises:
            ValueError: if no article title is found on the home page.
        """
        soup = self.index_to_soup(self.BASE_URL)
        titles = soup.findAll('span', attrs={'class': 'homepageArticleTitle'})
        # findAll returns an empty list (never None) when nothing matches.
        if not titles:
            raise ValueError('Could not find category content')

        articles = []
        seen_titles = set()
        for title_tag in titles:
            # tag_to_string also copes with titles that contain nested
            # markup, where ``.string`` would be None.
            text = self.tag_to_string(title_tag).strip()
            if not text or text in seen_titles:
                continue
            seen_titles.add(text)

            # The title span is expected to sit directly inside the link.
            url = title_tag.parent.get('href')
            if not url:
                self.log('\tSkipping article without link:', text)
                continue
            if url.startswith('/'):
                url = self.BASE_URL + url
            self.log('\tFound article:', text, 'at', url)
            articles.append({
                'title': text,
                'url': url,
                'description': '',
                'date': '',
            })
        return [(self.title, articles)]