__license__ = 'GPL v3'
__copyright__ = '2010-2011, Darko Miletic '
'''
msnbc.msn.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


def _class_matcher(*names):
    '''
    Build a predicate for matching an element's ``class`` attribute.

    The returned callable takes the raw attribute value and yields a truthy
    result when at least one of its whitespace-separated tokens is in
    ``names``. A missing/empty attribute value yields a falsy result.
    '''
    wanted = frozenset(names)
    return lambda x: x and set(x.split()).intersection(wanted)


class MsNBC(BasicNewsRecipe):
    '''Recipe that downloads recent articles from the msnbc.com RSS feeds.'''

    title = 'msnbc.com'
    __author__ = 'Darko Miletic'
    description = 'A Fuller Spectrum of News'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    publisher = 'msnbc.com'
    language = 'en'
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}

    # Only the headline plus the byline/body containers are kept.
    keep_only_tags = [
        dict(itemprop='headline'),
        dict(attrs={'class': _class_matcher('byline_article', 'article_main')}),
        dict(attrs={'class': _class_matcher('authors-names', 'pane-node-body')}),
    ]

    # Drop embedded frames, buttons, head-style tags and video/ad containers.
    remove_tags = [
        dict(name=['iframe', 'button', 'meta', 'link']),
        dict(attrs={'class': _class_matcher('widget_video', 'ad-container')}),
    ]

    feeds = [
        ('Latest', 'http://www.msnbc.com/feeds/latest'),
        (u'US News', u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml'),
        (u'Politics', u'http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml'),
        (u'Business', u'http://rss.msnbc.msn.com/id/3032071/device/rss/rss.xml'),
        (u'Health', u'http://rss.msnbc.msn.com/id/3088327/device/rss/rss.xml'),
        (u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml'),
    ]

    def get_article_url(self, article):
        '''
        Return the URL for a feed entry, taken from its ``guid``.

        :param article: feed entry supporting ``.get('guid')``.
        :return: the guid, or ``None`` when the entry has no guid or the
                 guid contains ``/video/``.
        '''
        guid = article.get('guid')
        # A missing guid used to raise TypeError on the ``in`` test below;
        # treat it like a video entry and report no URL instead.
        if not guid or '/video/' in guid:
            return None
        return guid

    def preprocess_html(self, soup):
        '''
        Copy each image's ``data-original`` attribute into ``src``.

        Only ``img`` tags that carry a ``data-original`` attribute are
        touched (presumably lazy-loaded images -- confirm against the site
        markup).

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        '''
        for img in soup.findAll('img', attrs={'data-original': True}):
            img['src'] = img['data-original']
        return soup