#!/usr/bin/env python
"""Calibre news recipe for Histmag (histmag.org)."""

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

import re

from calibre.web.feeds.news import BasicNewsRecipe


class Histmag(BasicNewsRecipe):
    """Recipe that builds three feeds from histmag.org archive listing pages.

    ``parse_index`` downloads one listing page per feed and hands each page
    to ``find_articles``, which turns every ``<div class="text">`` entry into
    an article dict.
    """

    title = u'Histmag'
    __author__ = 'matek09'
    description = u"Artykuly historyczne i publicystyczne"
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True

    # Article content is delimited by the "article" div and the comments
    # ("komentarze") heading.
    remove_tags_before = dict(name='div', attrs={'id': 'article'})
    remove_tags_after = dict(name='h2', attrs={'class': 'komentarze'})

    # Elements dropped from the article body: signature paragraph, the
    # comments heading itself and the "support us" button image.
    remove_tags = [
        dict(name='p', attrs={'class': 'podpis'}),
        dict(name='h2', attrs={'class': 'komentarze'}),
        dict(name='img', attrs={'src': 'style/buttons/wesprzyjnas-1.jpg'}),
    ]

    # NOTE(review): both patterns are empty strings in this copy of the
    # recipe, and an empty pattern matches at every position of the input.
    # The original patterns/replacements appear to have been lost (possibly
    # HTML tags stripped from the file) -- restore them from the upstream
    # recipe. The replacement text is kept as the two newlines found here,
    # written as an escape because a single-quoted literal cannot span lines.
    preprocess_regexps = [
        (re.compile(r''), lambda match: '\n\n'),
        (re.compile(r''), lambda match: '\n\n'),
    ]

    extra_css = ''' .left {font-size: x-small} .right {font-size: x-small} '''

    def find_articles(self, soup):
        """Return the article dicts found on one archive listing page.

        :param soup: parsed listing page, as returned by ``index_to_soup``.
        :return: list with one dict per ``<div class="text">`` element, each
            holding the keys ``title``, ``url``, ``date`` and
            ``description``.
        """
        # NOTE(review): ``div.next(...)`` relies on the soup library making
        # ``next`` callable as a search; confirm against the BeautifulSoup
        # version bundled with calibre.
        return [
            {
                'title': self.tag_to_string(div.h3.a),
                # Listing hrefs are joined onto the site root.
                'url': 'http://www.histmag.org/' + div.h3.a['href'],
                # Only the text before the first '|' is used as the date.
                'date': self.tag_to_string(div.next('p')).split('|')[0],
                'description': self.tag_to_string(
                    div.next('p', podpis=False)),
            }
            for div in soup.findAll('div', attrs={'class': 'text'})
        ]

    def parse_index(self):
        """Fetch every archive listing and return the feeds.

        :return: list of ``(feed_title, articles)`` tuples, in the order
            historical articles, opinion pieces, events.
        """
        sections = (
            (u"Artykuly historyczne", 'http://histmag.org/?arc=4&dx=0'),
            (u"Artykuly publicystyczne", 'http://histmag.org/?arc=5&dx=0'),
            (u"Wydarzenia", 'http://histmag.org/?arc=1&dx=0'),
        )
        feeds = []
        for feed_title, listing_url in sections:
            listing = self.index_to_soup(listing_url)
            feeds.append((feed_title, self.find_articles(listing)))
        return feeds