From cae761166f203c220077afc3d7be6b348131332d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 22 Mar 2016 20:28:16 +0530 Subject: [PATCH] Update MSNBC --- recipes/msnbc.recipe | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/recipes/msnbc.recipe b/recipes/msnbc.recipe index 22c73f9f20..c47d59a04a 100644 --- a/recipes/msnbc.recipe +++ b/recipes/msnbc.recipe @@ -17,9 +17,12 @@ class MsNBC(BasicNewsRecipe): encoding = 'utf8' publisher = 'msnbc.com' language = 'en' + remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} keep_only_tags=[ dict(itemprop='headline'), dict(attrs={'class':lambda x: x and set(x.split()).intersection({'byline_article', 'article_main'})}), + dict(attrs={'class':lambda x: x and set(x.split()).intersection({'authors-names', 'pane-node-body'})}), ] remove_tags = [ dict(name=['iframe', 'button', 'meta', 'link']), @@ -27,17 +30,18 @@ class MsNBC(BasicNewsRecipe): ] feeds = [ + ('Latest', 'http://www.msnbc.com/feeds/latest'), (u'US News' , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml') ,(u'Politics' , u'http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml') ,(u'Business' , u'http://rss.msnbc.msn.com/id/3032071/device/rss/rss.xml') - ,(u'Sports' , u'http://rss.nbcsports.msnbc.com/id/3032112/device/rss/rss.xml') - ,(u'Entertainment' , u'http://rss.msnbc.msn.com/id/3032083/device/rss/rss.xml') ,(u'Health' , u'http://rss.msnbc.msn.com/id/3088327/device/rss/rss.xml') ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml') ] def get_article_url(self, article): - return article.get('guid') + ans = article.get('guid') + if '/video/' not in ans: + return ans def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'data-original':True}):