#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe


class KellogInsight(BasicNewsRecipe):
    """Recipe that builds a periodical from the Kellogg Insight RSS feed.

    Only feed entries whose URL contains ``/article/`` are kept (see
    :meth:`get_article_url`); everything else in the feed is skipped.
    """

    title = 'Kellog Insight'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Articles from the Kellog School of Management'
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'

    oldest_article = 60

    # Keep only the printable article container and drop its third column.
    keep_only_tags = [dict(name='div', attrs={'id': ['print_no_comments']})]
    remove_tags = [dict(name='div', attrs={'class': 'col-three'})]

    extra_css = '''
        h1{font-family:arial; font-size:medium; color:#333333;}
        .col-one{font-family:arial; font-size:xx-small;}
        .col-two{font-family:arial; font-size:x-small; }
        h2{font-family:arial; font-size:small; color:#666666;}
        h3{font-family:arial; font-size:small; color:#333333;text-transform: uppercase; font-weight:normal;}
        h4{color:#660000;font-family:arial; font-size:x-small;}
        .col-two-text{font-family:arial; font-size:x-small; color:#333333;}
    '''

    feeds = [('Articles', 'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')]

    def get_article_url(self, article):
        """Return the URL of ``article`` if it is a real article, else None.

        The feed mixes articles with other links (e.g. blog posts); only
        URLs containing ``/article/`` are accepted. Rejected links are
        logged and ``None`` is returned.
        """
        # Get only article not blog links
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and '/article/' in link:
            return link
        self.log('Skipping non-article', link)
        return None

    def preprocess_html(self, soup):
        """Rename the node following each ``<span>`` to ``h4``.

        The ``h4`` rule in ``extra_css`` then styles those nodes. Returns
        the same ``soup`` object, modified in place.
        """
        for tag in soup.findAll(name=['span']):
            sibling = tag.nextSibling
            # A <span> that is the last child of its parent has no next
            # sibling; skip it instead of raising AttributeError on None.
            if sibling is not None:
                sibling.name = 'h4'
        return soup