__copyright__ = '2009, Darko Miletic '
'''
msdn.microsoft.com/en-us/magazine
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag


class MSDNMagazine_en(BasicNewsRecipe):
    """Recipe that builds one feed for the current MSDN Magazine issue.

    The issue name and cover image are read from the magazine landing page
    (``base_url``); the article list is read from the RSS feed (``rss_url``).
    """

    title = 'MSDN Magazine'
    __author__ = 'Darko Miletic'
    description = 'The Microsoft Journal for Developers'
    masthead_url = 'http://i3.msdn.microsoft.com/Platform/MasterPages/MsdnMagazine/smalllogo.png'
    publisher = 'Microsoft Press'
    category = 'news, IT, Microsoft, programming, windows'
    oldest_article = 31
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en'

    # Landing page: source of the issue name and the cover image.
    base_url = 'http://msdn.microsoft.com/en-us/magazine/default.aspx'
    # RSS feed: source of the article list.
    rss_url = 'http://msdn.microsoft.com/en-us/magazine/rss/default.aspx?z=z&iss=1'

    keep_only_tags = [dict(name='div', attrs={'id': 'MainContent'})]

    remove_tags = [
        dict(name='div', attrs={'class': 'DivRatingsOnly'}),
        dict(name='div', attrs={'class': 'ShareThisButton4'}),
    ]

    def find_articles(self):
        """Yield one article dict per usable ``<item>`` of the RSS feed.

        Each dict has the keys ``title``, ``url``, ``description`` and
        ``date``. Items without a link are skipped, since there is nothing
        to download for them.
        """
        response = self.browser.open(self.rss_url)
        try:
            idx_contents = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        idx = BeautifulStoneSoup(
            idx_contents, convertEntities=BeautifulStoneSoup.XML_ENTITIES)

        for article in idx.findAll('item'):
            url = self.tag_to_string(article.find('link')).strip()
            if not url:
                continue

            # The description is HTML carried inside the XML feed: extract
            # the markup first, then parse it again to reduce it to text.
            desc_html = self.tag_to_string(article.find('description'))
            description = self.tag_to_string(BeautifulSoup(desc_html))

            yield {
                'title': self.tag_to_string(article.find('title')),
                'url': url,
                'description': description,
                'date': self.tag_to_string(article.find('pubdate')),
            }

    def parse_index(self):
        """Return a single feed named after the current issue.

        As a side effect, ``self.cover_url`` is set when the landing page
        has an image whose ``alt`` text equals the issue name and that image
        has a ``src`` attribute.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where
            ``articles`` is the list produced by ``find_articles``.
        """
        soup = self.index_to_soup(self.base_url)

        # find issue name, eg "August 2011"
        issue_name = self.tag_to_string(soup.find('h1'))

        if issue_name.strip():
            # find cover pic: the image whose alt text is the issue name
            img = soup.find('img', attrs={'alt': issue_name})
            cover = img.get('src') if img is not None else None
            if cover:
                self.cover_url = cover
        else:
            # No usable heading: do not look for a cover with an empty alt
            # text (it could match an unrelated image), and fall back to the
            # recipe title so the feed is not left unnamed.
            issue_name = self.title

        return [(issue_name, list(self.find_articles()))]