#!/usr/bin/env python2
# vim:fileencoding=utf-8
#
# Copyright 2014 - 2015 Martin Račák
# Copyright 2011 Miroslav Vasko
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__license__ = 'GPL v3'
__copyright__ = '2014 - 2015 Martin Račák, 2011 Miroslav Vasko'

'''
.týždeň - a different view of society
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Tyzden(BasicNewsRecipe):
    title = u'týždeň'
    __author__ = u'Martin Račák, zemiak'
    description = 'A conservative weekly magazine.'
    publisher = 'www.tyzden.sk'
    publication_type = 'magazine'
    language = 'sk'
    needs_subscription = 'optional'
    use_embedded_content = False
    no_stylesheets = True

    base_url = 'http://www.tyzden.sk/'
    issue_url = base_url + 'casopis.html'

    keep_only_tags = [
        dict(name='div', attrs={'class': 'text_area top_nofoto'}),
        dict(name='div', attrs={'class': 'text_block'}),
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # Log in to gain access to subscriber-only content.
            br.open('http://www.tyzden.sk/prihlasenie.html')
            br.select_form(nr=1)
            br['user'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def find_sections(self):
        soup = self.index_to_soup(self.issue_url)
        # Use only the important part of the page.
        content = soup.find('div', 'top')
        content.extract()

        # Find the cover picture.
        img = content.find('div', 'foto').img
        if img is not None:
            self.cover_url = self.base_url + img['src']

        for section in content.findAll('a', {'href': re.compile(r'rubrika/.*')}):
            yield (self.tag_to_string(section), section)

    def find_articles(self, section):
        # The articles of a section are the links that follow its heading,
        # up to the first link that does not point into the current issue.
        for article in section.findAllNext('a'):
            if not article['href'].startswith('casopis/'):
                break
            yield {
                'title': self.tag_to_string(article),
                'url': self.base_url + article['href'],
                'date': strftime(' %a, %d %b'),
            }

    def parse_index(self):
        feeds = []
        for title, section in self.find_sections():
            feeds.append((title, list(self.find_articles(section))))
        return feeds
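
# A quick way to exercise this recipe locally (a sketch, assuming calibre's
# command-line tools are installed; 'tyzden.recipe' stands for whatever name
# this file is saved under):
#
#   ebook-convert tyzden.recipe .epub --test -vv --debug-pipeline debug
#
# --test fetches only a few articles per section, and appending
# --username and --password exercises the subscription login in
# get_browser().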