diff --git a/recipes/tyzden.recipe b/recipes/tyzden.recipe
index 9a609765ed..87cd0f1f39 100644
--- a/recipes/tyzden.recipe
+++ b/recipes/tyzden.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 #
-# Copyright 2014 Martin Račák
+# Copyright 2014 - 2015 Martin Račák
 # Copyright 2011 Miroslav Vasko
 #
 # This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 __license__ = 'GPL v3'
-__copyright__ = '2014 Martin Račák , 2011 Miroslav Vasko '
+__copyright__ = '2014 - 2015 Martin Račák , 2011 Miroslav Vasko '
 
 '''
 .týždeň - iný pohľad na spoločnosť
@@ -27,7 +27,6 @@ __copyright__ = '2014 Martin Račák , 2011 Miroslav Va
 import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from datetime import date
 
 class Tyzden(BasicNewsRecipe):
     title = u'týždeň'
@@ -50,50 +49,40 @@ class Tyzden(BasicNewsRecipe):
         br.submit()
         return br
 
-    today = date.today()
-    iso = today.isocalendar()
-    year = iso[0]
-    weeknum = iso[1]
-
-    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
-    base_url = base_url_path + '.html'
+    base_url = 'http://www.tyzden.sk/casopis.html'
 
     keep_only_tags = []
     keep_only_tags.append(dict(name='div', attrs={'class': 'text_area top_nofoto'}))
     keep_only_tags.append(dict(name='div', attrs={'class': 'text_block'}))
 
-    remove_tags_after = [dict(name='div', attrs={'class': 'text_block'})]
-
-    extra_css = '.top_nofoto h1 { text-align: left; font-size: 1.7em; }'
-
     def find_sections(self):
         soup = self.index_to_soup(self.base_url)
-        # find cover pic
-        imgdiv = soup.find('div', attrs={'class': 'foto'})
-        if imgdiv is not None:
-            img = imgdiv.find('img')
-            if img is not None:
-                self.cover_url = 'http://www.tyzden.sk/' + img['src']
-        # end find cover pic
+        # Use only the important part of the page
+        content = soup.find('div', 'top')
+        content.extract()
 
-        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
-            yield (self.tag_to_string(s), s)
+        # Find cover pic
+        img = content.find('div', 'foto').img
+        if img is not None:
+            self.cover_url = 'http://www.tyzden.sk/' + img['src']
+
+        for section in content.findAll('a', {'href': re.compile(r'rubrika/.*')}):
+            yield (self.tag_to_string(section), section)
 
     def find_articles(self, soup):
-        for art in soup.findAllNext('a'):
-            if (not art['href'].startswith('casopis/' + str(self.year) + '/' + str(self.weeknum) + '/')):
+        for article in soup.findAllNext('a'):
+            if (not article['href'].startswith('casopis/')):
                 break
-            url = art['href']
-            title = self.tag_to_string(art)
             yield {
-                'title': title, 'url':self.base_url_path + '/' + url,
-                'date' : strftime(' %a, %d %b'),
+                'title': self.tag_to_string(article),
+                'url': self.base_url + '/' + article['href'],
+                'date': strftime(' %a, %d %b'),
             }
 
     def parse_index(self):
         feeds = []
-        for title, soup in self.find_sections():
-            feeds.append((title, list(self.find_articles(soup))))
+        for title, section in self.find_sections():
+            feeds.append((title, list(self.find_articles(section))))
         return feeds