#!/usr/bin/env python
"""Calibre news recipe that downloads an issue from the Polityka archive."""

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe


class Polityka(BasicNewsRecipe):
    """Recipe for the archive site of the weekly magazine *Polityka*.

    The index is built by ``parse_index``: it starts from the last
    ``box_img3`` element of the archive front page, walks the issue's
    listing pages and groups the linked articles by section.
    """

    title = u'Polityka'
    __author__ = 'matek09'
    description = 'Weekly magazine. Last archive issue'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True

    # Root of the archive site; every scraped href/src is appended to it.
    ARCHIVE_URL = 'http://archiwum.polityka.pl'

    # Keep only the part of each article page between the section heading
    # and the footer box ...
    remove_tags_before = dict(name='h2', attrs={'class': 'box_nag'})
    remove_tags_after = dict(name='div', attrs={'class': 'box_footer'})

    # ... and drop those two boundary elements themselves.
    remove_tags = [
        dict(name='h2', attrs={'class': 'box_nag'}),
        dict(name='div', attrs={'class': 'box_footer'}),
    ]

    extra_css = ''' h1 {font-size: x-large; font-weight: bold} '''

    def parse_index(self):
        """Build the list of feeds for the issue linked from the archive.

        Side effect: sets ``self.cover_url`` from the ``<img>`` inside the
        last ``box_img3`` element of the archive front page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``date`` and ``description``.

        Raises:
            IndexError: if the front page has no ``box_img3`` element.
        """
        base = self.ARCHIVE_URL
        front_page = self.index_to_soup(base + '/')
        issue_boxes = front_page.findAll(attrs={'class': 'box_img3'})
        if not issue_boxes:
            # Same exception type the bare ``[-1]`` lookup used to raise,
            # but with a message that says what is actually missing.
            raise IndexError(
                'No "box_img3" element found on ' + base + '/')

        last_box = issue_boxes[-1]
        self.cover_url = base + last_box.find('img')['src']
        edition_url = base + last_box.find('a')['href']

        counter_prefix = base + '/wydanie/'
        feeds = []
        # Number that follows "/wydanie/" in the listing URL; it is bumped
        # by one per iteration (presumably a page index - TODO confirm).
        counter = 0

        while True:
            listing = self.index_to_soup(edition_url)
            box_list = listing.findAll('div', attrs={'class': 'box_list'})
            if not box_list:
                # A listing page without article boxes ends the walk.
                break

            feeds.extend(self._articles_by_section(box_list).items())

            next_url = edition_url.replace(
                counter_prefix + str(counter),
                counter_prefix + str(counter + 1))
            if next_url == edition_url:
                # The URL has no "/wydanie/<counter>" part to advance, so
                # looping again would re-download this page forever.
                break
            edition_url = next_url
            counter += 1

        return feeds

    def _articles_by_section(self, box_list):
        """Group the articles linked from *box_list* by section title.

        Every article page is downloaded once here, because the section
        title is read from the ``h2.box_nag`` heading of the article page
        itself (the text before the first ``/``, stripped).

        Returns:
            A dict mapping section title to a list of article dicts.
        """
        sections = {}
        for box in box_list:
            for entry in box.findAll('div', attrs={'class': 'list_tresc'}):
                link = entry.a
                if link is None:
                    # Nothing to download for an entry without a link.
                    continue
                url = self.ARCHIVE_URL + link['href']
                heading = self.index_to_soup(url).find(
                    'h2', attrs={'class': 'box_nag'})
                section = self.tag_to_string(heading).split('/')[0].strip()
                sections.setdefault(section, []).append({
                    'title': self.tag_to_string(link),
                    'url': url,
                    'date': '',
                    'description': '',
                })
        return sections