calibre/recipes/msdnmag_en.recipe

#!/usr/bin/env  python2

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
msdn.microsoft.com/en-us/magazine
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup


class MSDNMagazine_en(BasicNewsRecipe):
    '''
    Recipe for the English edition of MSDN Magazine.

    The issue name and cover image are read from the magazine landing page
    (``base_url``); the list of articles is read from the RSS feed
    (``rss_url``). Everything is delivered as a single feed named after the
    issue.
    '''

    # --- Publication metadata ---
    title = 'MSDN Magazine'
    __author__ = 'Darko Miletic'
    description = 'The Microsoft Journal for Developers'
    publisher = 'Microsoft Press'
    category = 'news, IT, Microsoft, programming, windows'
    language = 'en'
    masthead_url = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png'

    # --- Download settings ---
    oldest_article = 31
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'

    # --- Source locations ---
    # Landing page: provides the issue name and the cover image.
    base_url = 'http://msdn.microsoft.com/en-us/magazine/default.aspx'
    # RSS feed: provides the article list.
    rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1'

    # --- Article clean-up rules ---
    # Keep only the div with id "MainContent" ...
    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'MainContent'}}]

    # ... and drop the divs carrying these two classes from it.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'DivRatingsOnly'}},
        {'name': 'div', 'attrs': {'class': 'ShareThisButton4'}},
    ]

    def find_articles(self):
        '''
        Generate one article dict per ``item`` element of the RSS feed.

        The feed at ``rss_url`` is downloaded with the recipe's browser and
        parsed as XML. Each yielded dict has the keys ``title``, ``url``,
        ``description`` and ``date``, all produced with ``tag_to_string``.

        This is a generator: the feed is not fetched until iteration starts.
        '''
        to_text = self.tag_to_string

        raw_feed = self.browser.open(self.rss_url).read()
        feed = BeautifulStoneSoup(
            raw_feed, convertEntities=BeautifulStoneSoup.XML_ENTITIES)

        for item in feed.findAll('item'):
            # The description text is itself parsed as HTML further down and
            # reduced to text a second time (presumably the feed embeds
            # markup in it -- verify against the live feed).
            summary_markup = to_text(item.find('description'))

            yield {
                'title': to_text(item.find('title')),
                'url': to_text(item.find('link')),
                'description': to_text(BeautifulSoup(summary_markup)),
                'date': to_text(item.find('pubdate')),
            }

    def parse_index(self):
        '''
        Build the index for the current issue.

        Returns a one-element list holding a ``(issue_name, articles)``
        tuple, where ``articles`` is the fully materialised output of
        ``find_articles``.

        Side effect: sets ``self.cover_url`` when the landing page contains
        an ``img`` whose ``alt`` attribute equals the issue name.
        '''
        landing_page = self.index_to_soup(self.base_url)

        # The text of the first h1 is used as the issue name,
        # eg "August 2011".
        issue_name = self.tag_to_string(landing_page.find('h1'))

        # The cover picture is looked up by its alt text, which is expected
        # to match the issue name.
        cover_img = landing_page.find('img', attrs={'alt': issue_name})
        if cover_img is not None:
            self.cover_url = cover_img['src']

        articles = list(self.find_articles())
        return [(issue_name, articles)]