#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic '
'''
msdn.microsoft.com/en-us/magazine
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup


class MSDNMagazine_en(BasicNewsRecipe):
    """Recipe for the English edition of MSDN Magazine.

    The issue title and cover image are read from the magazine landing
    page (``base_url``); the article list is read from the RSS feed
    (``rss_url``).
    """

    # Recipe metadata.
    title = 'MSDN Magazine'
    __author__ = 'Darko Miletic'
    description = 'The Microsoft Journal for Developers'
    masthead_url = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png'
    publisher = 'Microsoft Press'
    category = 'news, IT, Microsoft, programming, windows'
    language = 'en'

    # Download settings.
    oldest_article = 31
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'

    # Pages this recipe reads from.
    base_url = 'http://msdn.microsoft.com/en-us/magazine/default.aspx'
    rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1'

    # Keep only the main content container and drop the rating/share widgets.
    keep_only_tags = [dict(name='div', attrs={'id':'MainContent'})]
    remove_tags = [
        dict(name='div', attrs={'class':'DivRatingsOnly'}),
        dict(name='div', attrs={'class':'ShareThisButton4'}),
    ]

    def find_articles(self):
        """Yield one article dict per ``<item>`` of the RSS feed.

        Each dict has the keys ``title``, ``url``, ``description`` and
        ``date``, all taken from the text of the matching child element
        of the item.
        """
        raw_feed = self.browser.open(self.rss_url).read()
        feed = BeautifulStoneSoup(
            raw_feed, convertEntities=BeautifulStoneSoup.XML_ENTITIES)

        for item in feed.findAll('item'):
            def text_of(tag_name, item=item):
                # Text content of the named child element of this item.
                return self.tag_to_string(item.find(tag_name))

            # The description text is itself HTML markup, so it is parsed
            # a second time and reduced to plain text.
            summary = self.tag_to_string(BeautifulSoup(text_of('description')))

            yield {
                'title': text_of('title'),
                'url': text_of('link'),
                'description': summary,
                'date' : text_of('pubdate'),
            }

    def parse_index(self):
        """Return a single feed: ``[(issue name, list of article dicts)]``.

        As a side effect, sets ``self.cover_url`` when the landing page
        has an image whose ``alt`` text equals the issue name.
        """
        landing_page = self.index_to_soup(self.base_url)

        # Issue name, e.g. "August 2011", is the text of the first <h1>.
        issue = self.tag_to_string(landing_page.find('h1'))

        # The cover picture is looked up by alt text matching the issue name.
        cover = landing_page.find('img', attrs={'alt': issue})
        if cover is not None:
            self.cover_url = cover['src']

        articles = list(self.find_articles())
        return [(issue, articles)]