# calibre/recipes/toronto_sun.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.torontosun.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TorontoSun(BasicNewsRecipe):
    """Calibre news recipe for the Toronto Sun (www.torontosun.com).

    The class is almost entirely declarative: the attributes below configure
    the ``BasicNewsRecipe`` base class (metadata, feed list, and which parts
    of each article page are kept or dropped).  The only custom logic is
    :meth:`preprocess_html`, which rewrites ``<img>`` tags whose ``src``
    carries the real image URL inside a query string.
    """

    # --- Recipe metadata -------------------------------------------------
    title                 = 'Toronto SUN'
    __author__            = 'Darko Miletic and Sujata Raman'
    description           = 'News from Canada'
    publisher             = 'Toronto Sun'
    category              = 'news, politics, Canada'

    # --- Download behaviour (interpreted by BasicNewsRecipe; see the
    #     calibre recipe API documentation for exact semantics) -----------
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'cp1252'
    language              = 'en_CA'

    # Reuses the metadata defined above so the two cannot drift apart.
    conversion_options  = {
                              'comment'   : description
                            , 'tags'      : category
                            , 'publisher' : publisher
                            , 'language'  : language
                          }

    # --- Page clean-up selectors ----------------------------------------
    # Elements of the article page to keep.
    keep_only_tags      = [
                               dict(name='div', attrs={'class':['articleHead','leftBox']})
                              ,dict(name='div', attrs={'id':'channelContent'})
                              ,dict(name='div', attrs={'id':'rotateBox'})
                              ,dict(name='img')
                          ]

    # Elements to strip: navigation, article controls, comments, embedded
    # objects and similar non-article markup.
    remove_tags         = [
                              dict(name='div',attrs={'class':['bottomBox clear','bottomBox','breadCrumb','articleControls thin','articleControls thin short','extraVideoList']})
                             ,dict(name='h2',attrs={'class':'microhead'})
                             ,dict(name='div',attrs={'id':'commentsBottom'})
                             ,dict(name=['link','iframe','object'])
                             ,dict(name='a',attrs={'rel':'swap'})
                             ,dict(name='a',attrs={'href':'/news/haiti/'})
                             ,dict(name='ul',attrs={'class':['tabs dl contentSwap','micrositeNav clearIt hList','galleryNav rotateNav']})
                          ]

    # Markers after which the rest of the page is discarded.
    remove_tags_after   = [
                            dict(name='div',attrs={'class':'bottomBox clear'})
                           ,dict(name='div',attrs={'class':'rotateBox'})
                           ,dict(name='div',attrs={'id':'contentSwap'})
                          ]


    # Stylesheet applied to the generated book (site stylesheets are
    # disabled via ``no_stylesheets`` above).
    extra_css = '''
                h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
                h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
                h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
                p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
                .bold{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;color:#444444;margin-left: 0px;}
                .subheading{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000; font-weight: bold;}
                .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                .byline span{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small; text-transform: uppercase;}
                .updated{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
                .galleryCaption{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
                .galleryUpdated{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
                '''


    # (section title, RSS feed URL) pairs.
    feeds = [
              (u'News'       , u'http://www.torontosun.com/news/rss.xml'           )
             ,(u'Canada'     , u'http://www.torontosun.com/news/canada/rss.xml'    )
             ,(u'Columnists' , u'http://www.torontosun.com/news/columnists/rss.xml')
             ,(u'World'      , u'http://www.torontosun.com/news/world/rss.xml'     )
             ,(u'Money'      , u'http://www.torontosun.com/money/rss.xml'          )
            ]

    def preprocess_html(self, soup):
        """Point ``<img>`` tags at the image URL embedded in their query string.

        For every ``<img>`` that has a ``src`` attribute, the text after the
        last ``?`` is taken and everything following its first ``=`` is
        treated as ``<image-url>[&...&<name>=<width>x<height>]``:

        * ``src`` is replaced with the leading ``<image-url>`` part;
        * ``width`` and ``height`` are set from the final parameter, but only
          when its value really has the form ``<digits>x<digits>``.

        Images whose ``src`` has no ``=`` after the last ``?`` are left
        untouched.

        :param soup: parsed article document; modified in place.
        :return: the same ``soup`` object.
        """
        for img in soup.findAll('img', src=True):
            # Illustrative shape (NOTE(review): confirm against live pages):
            #   '...?src=http://host/a.jpg&size=248x186'
            #     -> 'http://host/a.jpg&size=248x186'
            embedded = img.get('src').split('?')[-1].partition('=')[-1]
            if not embedded:
                continue

            params = embedded.split('&')
            img['src'] = params[0].partition('=')[0]

            # The size, when present, is the value of the last parameter.
            # Indexing the split result unconditionally raised IndexError for
            # URLs without a '<w>x<h>' value (after already writing an empty
            # ``width``), so validate before touching the tag.
            dims = params[-1].partition('=')[-1].split('x')
            if len(dims) >= 2 and dims[0].isdigit() and dims[1].isdigit():
                img['width'] = dims[0]
                img['height'] = dims[1]
        return soup