Update Wall Street Journal

See #1239477 (Private bug)
This commit is contained in:
Kovid Goyal 2013-10-15 22:03:43 +05:30
parent 4768f29bf2
commit bec396158a
2 changed files with 61 additions and 86 deletions

View File

@ -8,17 +8,10 @@ import copy
# http://online.wsj.com/page/us_in_todays_paper.html
def filter_classes(x):
    """Report whether a ``class`` attribute value names an unwanted element.

    Intended for use as a ``{'class': filter_classes}`` matcher.

    x is the attribute value to test: a whitespace-separated string of
    class names. Any falsy value (e.g. ``None`` or ``''``) yields False.

    Returns True when at least one of the class names appears in the
    blocklist below, otherwise False.
    """
    if not x:
        return False
    bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools', 'printSummary', 'mostPopular', 'relatedCollection'}
    # isdisjoint() answers "is there any overlap?" directly, without first
    # building a frozenset and an intersection set just to measure their size.
    return not bad_classes.isdisjoint(x.split())
class WallStreetJournal(BasicNewsRecipe):
title = 'The Wall Street Journal'
__author__ = 'Kovid Goyal, Sujata Raman, and Joshua Oster-Morris'
__author__ = 'Kovid Goyal and Joshua Oster-Morris'
description = 'News and current affairs'
needs_subscription = True
language = 'en'
@ -39,23 +32,16 @@ class WallStreetJournal(BasicNewsRecipe):
.byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
.paperLocation{color:#666666; font-size:xx-small}'''
remove_tags_before = dict(name='h1')
remove_tags = [
dict(id=["articleTabs_tab_article",
"articleTabs_tab_comments", 'msnLinkback', 'yahooLinkback',
'articleTabs_panel_comments', 'footer', 'emailThisScrim', 'emailConfScrim', 'emailErrorScrim',
"articleTabs_tab_interactive", "articleTabs_tab_video",
"articleTabs_tab_map", "articleTabs_tab_slideshow",
"articleTabs_tab_quotes", "articleTabs_tab_document",
"printModeAd", "aFbLikeAuth", "videoModule",
"mostRecommendations", "topDiscussions"]),
{'class':['footer_columns','hidden', 'network','insetCol3wide','interactive','video','slideshow','map','insettip',
'insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
dict(rel='shortcut icon'),
{'class':filter_classes},
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
dict(name='span', itemprop='author', rel='author'),
dict(name='article', id='articleBody'),
dict(name='div', id='article_story_body'),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox']}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
use_javascript_to_login = True
@ -72,15 +58,12 @@ class WallStreetJournal(BasicNewsRecipe):
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def postprocess_html(self, soup, first):
# Flattens table markup and removes the numbered article-thumbnail divs
# from the parsed page held in `soup`.
# NOTE(review): `first` is not referenced in the visible lines, and no
# return statement is visible in this hunk -- confirm against the full file.
# Rename every <table>, <tr> and <td> element to <div>.
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
# Remove the <div> elements whose id is articleThumbnail_1 .. articleThumbnail_7.
for tag in soup.findAll('div', dict(id=[
"articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3",
"articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6",
"articleThumbnail_7"])):
tag.extract()
def preprocess_html(self, soup):
    """Strip the thumbnail from zoomable-image boxes and return the soup.

    For every <div> whose class attribute contains the word
    'insetZoomTargetBox', the first <img> found inside it (if any) is
    extracted from the tree. The same soup object is then returned.
    """
    def is_zoom_box(class_value):
        # Falsy attribute values are handed straight back, so they never match.
        return class_value and 'insetZoomTargetBox' in class_value.split()

    zoom_boxes = soup.findAll('div', attrs={'class': is_zoom_box})
    for box in zoom_boxes:
        thumbnail = box.find('img')
        if thumbnail is None:
            continue
        thumbnail.extract()
    return soup

View File

@ -33,21 +33,16 @@ class WallStreetJournal(BasicNewsRecipe):
h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
.paperLocation{color:#666666; font-size:xx-small}'''
remove_tags_before = dict(name='h1')
remove_tags = [
dict(id=["articleTabs_tab_article",
"articleTabs_tab_comments",
"articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow",
"articleTabs_tab_quotes"]),
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
dict(name='div', attrs={'data-flash-settings':True}),
{'class':['insetContent embedType-interactive insetCol3wide','insetCol6wide','insettipUnit']},
dict(rel='shortcut icon'),
{'class':lambda x: x and 'sTools' in x},
{'class':lambda x: x and 'printSummary' in x},
{'class':lambda x: x and 'mostPopular' in x},
keep_only_tags = [
dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
dict(name='span', itemprop='author', rel='author'),
dict(name='article', id='articleBody'),
dict(name='div', id='article_story_body'),
]
remove_tags = [
dict(attrs={'class':['insetButton', 'insettipBox']}),
dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
@ -55,12 +50,12 @@ class WallStreetJournal(BasicNewsRecipe):
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def postprocess_html(self, soup, first):
# Flattens table markup and removes the numbered article-thumbnail divs
# from the parsed page held in `soup`.
# NOTE(review): `first` is not referenced in the visible lines, and no
# return statement is visible in this hunk -- confirm against the full file.
# Rename every <table>, <tr> and <td> element to <div>.
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
# Remove the <div> elements whose id is articleThumbnail_1 .. articleThumbnail_7.
for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
tag.extract()
def preprocess_html(self, soup):
    """Strip the thumbnail from zoomable-image boxes and return the soup.

    For every <div> whose class attribute contains the word
    'insetZoomTargetBox', the first <img> found inside it (if any) is
    extracted from the tree. The same soup object is then returned.
    """
    def is_zoom_box(class_value):
        # Falsy attribute values are handed straight back, so they never match.
        return class_value and 'insetZoomTargetBox' in class_value.split()

    zoom_boxes = soup.findAll('div', attrs={'class': is_zoom_box})
    for box in zoom_boxes:
        thumbnail = box.find('img')
        if thumbnail is None:
            continue
        thumbnail.extract()
    return soup
@ -69,7 +64,6 @@ class WallStreetJournal(BasicNewsRecipe):
href = 'http://online.wsj.com' + href
return href
def wsj_get_index(self):
    # Hand the WSJ '/itp' index URL to index_to_soup and return its result.
    index_url = 'http://online.wsj.com/itp'
    return self.index_to_soup(index_url)
@ -176,5 +170,3 @@ class WallStreetJournal(BasicNewsRecipe):
self.log('\tFound article:', title)
return articles