calibre/recipes/kellog_insight.recipe

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__   = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


from calibre.web.feeds.news import BasicNewsRecipe

class KellogInsight(BasicNewsRecipe):
    '''
    Recipe that downloads articles from the Kellogg Insight RSS feed.

    Only feed entries whose URL contains ``/article/`` are downloaded; every
    other entry is logged and skipped. After download, the element that
    follows each ``<span>`` is turned into an ``<h4>`` so that the ``h4`` rule
    in :attr:`extra_css` applies to it.
    '''

    title          = 'Kellog Insight'
    __author__     = 'Kovid Goyal and Sujata Raman'
    description    = 'Articles from the Kellog School of Management'
    no_stylesheets = True
    encoding       = 'utf-8'
    language = 'en'

    oldest_article = 60

    # Keep just the container with id "print_no_comments" ...
    keep_only_tags = [dict(name='div', attrs={'id':['print_no_comments']})]

    # ... and drop any div with class "col-three" found inside it.
    remove_tags = [dict(name='div', attrs={'class':'col-three'})]

    extra_css = '''
                h1{font-family:arial; font-size:medium; color:#333333;}
                .col-one{font-family:arial; font-size:xx-small;}
                .col-two{font-family:arial; font-size:x-small; }
                h2{font-family:arial; font-size:small; color:#666666;}
                h3{font-family:arial; font-size:small; color:#333333;text-transform: uppercase; font-weight:normal;}
                h4{color:#660000;font-family:arial; font-size:x-small;}
                .col-two-text{font-family:arial; font-size:x-small; color:#333333;}
                '''

    feeds = [('Articles', 'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')]

    def get_article_url(self, article):
        '''
        Return the URL of ``article`` if it points at an article page.

        The URL is obtained from the base class implementation. Links that
        are empty or do not contain ``/article/`` (e.g. blog links) are
        logged and ``None`` is returned instead.
        '''
        # Get only article not blog links
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and '/article/' in link:
            return link
        self.log('Skipping non-article', link)
        return None

    def preprocess_html(self, soup):
        '''
        Rename the element following every ``<span>`` to ``<h4>``.

        :param soup: parsed document of the downloaded page; modified in place.
        :return: the same ``soup`` object.
        '''
        for span in soup.findAll(name=['span']):
            sibling = span.nextSibling
            # A span that is the last child of its parent has no next
            # sibling (None), and a text node has no tag name to change.
            # Only real tags are renamed; blindly assigning ``.name`` used to
            # raise AttributeError on None and abort the article.
            if sibling is None or getattr(sibling, 'name', None) is None:
                continue
            sibling.name = 'h4'

        return soup