calibre/recipes/msdnmag_en.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
msdn.microsoft.com/en-us/magazine
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

class MSDNMagazine_en(BasicNewsRecipe):
    '''
    Recipe for the English edition of MSDN Magazine.

    The issue name and the cover image are scraped from the landing page
    (``base_url``); the list of articles is read from the RSS feed
    (``rss_url``). Everything is delivered as a single feed named after
    the issue.
    '''

    title                 = 'MSDN Magazine'
    __author__            = 'Darko Miletic'
    description           = 'The Microsoft Journal for Developers'
    masthead_url          = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png'
    publisher             = 'Microsoft Press'
    category              = 'news, IT, Microsoft, programming, windows'
    oldest_article        = 31
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
    language              = 'en'

    # Landing page: source of the issue heading and the cover image.
    base_url              = 'http://msdn.microsoft.com/en-us/magazine/default.aspx'
    # RSS feed: source of the article list (see find_articles).
    rss_url               = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1'


    # Only the div with id "MainContent" is kept from each article page.
    keep_only_tags = [dict(name='div', attrs={'id':'MainContent'})]

    # Divs removed from the kept content (ratings / share widgets, judging
    # by their class names).
    remove_tags = [
                    dict(name='div', attrs={'class':'DivRatingsOnly'})
                    ,dict(name='div', attrs={'class':'ShareThisButton4'})
                  ]

    def find_articles(self):
        '''
        Yield one article dict for every ``<item>`` in the feed at ``rss_url``.

        Each dict has the keys ``title``, ``url``, ``description`` and
        ``date``; every value is the text extracted from the matching child
        element of the item.
        '''
        idx_contents = self.browser.open(self.rss_url).read()
        idx = BeautifulStoneSoup(idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)

        for article in idx.findAll('item'):
            # The description text is parsed a second time as HTML so that
            # any markup embedded in it is reduced to plain text.
            desc_html = self.tag_to_string(article.find('description'))
            description = self.tag_to_string(BeautifulSoup(desc_html))

            a = {
                    'title':  self.tag_to_string(article.find('title')),
                    'url': self.tag_to_string(article.find('link')),
                    'description': description,
                    'date' : self.tag_to_string(article.find('pubdate')),
                    }
            yield a


    def parse_index(self):
        '''
        Build the index for the issue shown on the landing page.

        Returns a list holding a single ``(issue_name, articles)`` tuple,
        where ``articles`` is the list produced by :meth:`find_articles`.
        As a side effect ``self.cover_url`` is set when a cover image with
        a usable ``src`` is found.
        '''
        soup = self.index_to_soup(self.base_url)

        #find issue name, eg "August 2011"
        heading = soup.find('h1')
        issue_name = self.tag_to_string(heading) if heading is not None else ''

        # find cover pic: the image whose alt text equals the issue name.
        # Skip the lookup when the issue name is blank, otherwise any image
        # carrying an empty alt attribute would be picked up as the cover.
        if issue_name.strip():
            img = soup.find('img',attrs ={'alt':issue_name})
            # .get() instead of [] so an <img> without src cannot abort
            # the whole download over a missing cover.
            src = img.get('src') if img is not None else None
            if src:
                self.cover_url = src

        return [(issue_name, list(self.find_articles()))]