From 17014f461ba957c1f9e907d4f7494baebe6c5085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Ra=C4=8D=C3=A1k?=
Date: Wed, 7 Jan 2015 16:44:21 +0100
Subject: [PATCH] =?UTF-8?q?Updated=20and=20simplified=20.t=C3=BD=C5=BEde?=
 =?UTF-8?q?=C5=88=20recipe?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 recipes/tyzden.recipe | 57 +++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/recipes/tyzden.recipe b/recipes/tyzden.recipe
index 9a609765ed..f0953e6098 100644
--- a/recipes/tyzden.recipe
+++ b/recipes/tyzden.recipe
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 #
-# Copyright 2014 Martin Račák
+# Copyright 2014 - 2015 Martin Račák
 # Copyright 2011 Miroslav Vasko
 #
 # This program is free software: you can redistribute it and/or modify
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 __license__ = 'GPL v3'
-__copyright__ = '2014 Martin Račák , 2011 Miroslav Vasko '
+__copyright__ = '2014 - 2015 Martin Račák , 2011 Miroslav Vasko '
 
 '''
 .týždeň - iný pohľad na spoločnosť
@@ -27,10 +27,9 @@ __copyright__ = '2014 Martin Račák , 2011 Miroslav Va
 import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from datetime import date
 
 class Tyzden(BasicNewsRecipe):
-    title = u'týždeň'
+    title = u'.týždeň'
     __author__ = u'Martin Račák, zemiak'
     description = 'A conservative weekly magazine.'
     publisher = 'www.tyzden.sk'
@@ -50,50 +49,40 @@ class Tyzden(BasicNewsRecipe):
         br.submit()
         return br
 
-    today = date.today()
-    iso = today.isocalendar()
-    year = iso[0]
-    weeknum = iso[1]
-
-    base_url_path = 'http://www.tyzden.sk/casopis/' + str(year) + '/' + str(weeknum)
-    base_url = base_url_path + '.html'
+    base_url = 'http://www.tyzden.sk/casopis.html'
 
     keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'class': 'text_area top_nofoto'}))
-    keep_only_tags.append(dict(name='div', attrs={'class': 'text_block'}))
-
-    remove_tags_after = [dict(name='div', attrs={'class': 'text_block'})]
-
-    extra_css = '.top_nofoto h1 { text-align: left; font-size: 1.7em; }'
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_area top_nofoto'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'text_block'}))
 
     def find_sections(self):
         soup = self.index_to_soup(self.base_url)
-        # find cover pic
-        imgdiv = soup.find('div', attrs={'class': 'foto'})
-        if imgdiv is not None:
-            img = imgdiv.find('img')
-            if img is not None:
-                self.cover_url = 'http://www.tyzden.sk/' + img['src']
-        # end find cover pic
+        # Use only the important part of the page
+        content = soup.find('div', 'top')
+        content.extract()
+
+        # Find cover pic
+        img = content.find('div', 'foto').img
+        if img is not None:
+            self.cover_url = 'http://www.tyzden.sk/' + img['src']
 
-        for s in soup.findAll('a', attrs={'href': re.compile(r'rubrika/.*')}):
-            yield (self.tag_to_string(s), s)
+        for section in content.findAll('a', {'href': re.compile(r'rubrika/.*')}):
+            yield (self.tag_to_string(section), section)
 
     def find_articles(self, soup):
-        for art in soup.findAllNext('a'):
-            if (not art['href'].startswith('casopis/' + str(self.year) + '/' + str(self.weeknum) + '/')):
+        for article in soup.findAllNext('a'):
+            if (not article['href'].startswith('casopis/')):
                 break
-            url = art['href']
-            title = self.tag_to_string(art)
             yield {
-                'title': title, 'url':self.base_url_path + '/' + url,
-                'date' : strftime(' %a, %d %b'),
+                'title': self.tag_to_string(article),
+                'url': self.base_url + '/' + article['href'],
+                'date': strftime(' %a, %d %b'),
             }
 
     def parse_index(self):
         feeds = []
-        for title, soup in self.find_sections():
-            feeds.append((title, list(self.find_articles(soup))))
+        for title, section in self.find_sections():
+            feeds.append((title, list(self.find_articles(section))))
        return feeds
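
Testing note: the change can be sanity-checked locally with calibre's usual recipe
test run (command as in calibre's recipe development docs; the ".epub" output name
is just an example):

    ebook-convert recipes/tyzden.recipe .epub --test -vv --debug-pipeline debug

The --test switch downloads only a couple of articles from the first couple of
sections, which is enough to confirm that find_sections() still picks up the cover
image and the 'rubrika' section links, and that find_articles() builds article URLs
against the new base_url.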