Update WSJ

This commit is contained in:
Kovid Goyal 2013-05-18 09:34:51 +05:30
parent 16c5f8b1c1
commit b45e97134e

View File

@ -9,8 +9,9 @@ import copy
# http://online.wsj.com/page/us_in_todays_paper.html # http://online.wsj.com/page/us_in_todays_paper.html
def filter_classes(x): def filter_classes(x):
if not x: return False if not x:
bad_classes = {'sTools', 'printSummary', 'mostPopular', 'relatedCollection'} return False
bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools', 'printSummary', 'mostPopular', 'relatedCollection'}
classes = frozenset(x.split()) classes = frozenset(x.split())
return len(bad_classes.intersection(classes)) > 0 return len(bad_classes.intersection(classes)) > 0
@ -42,14 +43,15 @@ class WallStreetJournal(BasicNewsRecipe):
remove_tags_before = dict(name='h1') remove_tags_before = dict(name='h1')
remove_tags = [ remove_tags = [
dict(id=["articleTabs_tab_article", dict(id=["articleTabs_tab_article",
"articleTabs_tab_comments", "articleTabs_tab_comments", 'msnLinkback', 'yahooLinkback',
'articleTabs_panel_comments', 'footer', 'articleTabs_panel_comments', 'footer', 'emailThisScrim', 'emailConfScrim', 'emailErrorScrim',
"articleTabs_tab_interactive", "articleTabs_tab_video", "articleTabs_tab_interactive", "articleTabs_tab_video",
"articleTabs_tab_map", "articleTabs_tab_slideshow", "articleTabs_tab_map", "articleTabs_tab_slideshow",
"articleTabs_tab_quotes", "articleTabs_tab_document", "articleTabs_tab_quotes", "articleTabs_tab_document",
"printModeAd", "aFbLikeAuth", "videoModule", "printModeAd", "aFbLikeAuth", "videoModule",
"mostRecommendations", "topDiscussions"]), "mostRecommendations", "topDiscussions"]),
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]}, {'class':['footer_columns','hidden', 'network','insetCol3wide','interactive','video','slideshow','map','insettip',
'insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
dict(rel='shortcut icon'), dict(rel='shortcut icon'),
{'class':filter_classes}, {'class':filter_classes},
] ]
@ -74,7 +76,10 @@ class WallStreetJournal(BasicNewsRecipe):
for tag in soup.findAll(name=['table', 'tr', 'td']): for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div' tag.name = 'div'
for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])): for tag in soup.findAll('div', dict(id=[
"articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3",
"articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6",
"articleThumbnail_7"])):
tag.extract() tag.extract()
return soup return soup
@ -172,7 +177,7 @@ class WallStreetJournal(BasicNewsRecipe):
flavorstory['class'] = 'mjLinkItem' flavorstory['class'] = 'mjLinkItem'
metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x}) metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
if metapage is not None: if metapage is not None:
flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page flavorstory.append(copy.copy(metapage)) # metapage should always be A1 because that should be first on the page
for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True): for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
container = a.findParent(['li', 'div']) container = a.findParent(['li', 'div'])
@ -199,7 +204,6 @@ class WallStreetJournal(BasicNewsRecipe):
return articles return articles
def cleanup(self): def cleanup(self):
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com') self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')