From d882c28144e28d7cbe78addb95e3c3402e1c7ada Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 18 May 2011 09:15:17 -0600
Subject: [PATCH] Updated Newsweek

---
 recipes/newsweek.recipe | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/recipes/newsweek.recipe b/recipes/newsweek.recipe
index 97abd69aac..a31706e257 100644
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@@ -11,6 +11,20 @@ class Newsweek(BasicNewsRecipe):
 
     BASE_URL = 'http://www.newsweek.com'
 
+    topics = {
+        'Culture' : '/tag/culture.html',
+        'Business' : '/tag/business.html',
+        'Society' : '/tag/society.html',
+        'Science' : '/tag/science.html',
+        'Education' : '/tag/education.html',
+        'Politics' : '/tag/politics.html',
+        'Health' : '/tag/health.html',
+        'World' : '/tag/world.html',
+        'Nation' : '/tag/nation.html',
+        'Technology' : '/tag/technology.html',
+        'Game Changers' : '/tag/game-changers.html',
+    }
+
     keep_only_tags = dict(name='article', attrs={'class':'article-text'})
     remove_tags = [dict(attrs={'data-dartad':True})]
     remove_attributes = ['property']
@@ -21,14 +35,10 @@ class Newsweek(BasicNewsRecipe):
         return soup
 
     def newsweek_sections(self):
-        return [
-            ('Nation', 'http://www.newsweek.com/tag/nation.html'),
-            ('Society', 'http://www.newsweek.com/tag/society.html'),
-            ('Culture', 'http://www.newsweek.com/tag/culture.html'),
-            ('World', 'http://www.newsweek.com/tag/world.html'),
-            ('Politics', 'http://www.newsweek.com/tag/politics.html'),
-            ('Business', 'http://www.newsweek.com/tag/business.html'),
-        ]
+        for topic_name, topic_url in self.topics.iteritems():
+            yield (topic_name,
+                    self.BASE_URL+topic_url)
+
 
     def newsweek_parse_section_page(self, soup):
         for article in soup.findAll('article', about=True,