From bec396158afb520b487adf6091df42bdbf3eb18c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 15 Oct 2013 22:03:43 +0530 Subject: [PATCH] Update Wall Street Journal See #1239477 (Private bug) --- recipes/wsj.recipe | 49 +++++++-------------- recipes/wsj_free.recipe | 98 +++++++++++++++++++---------------------- 2 files changed, 61 insertions(+), 86 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index 8c68668745..c138fb2a04 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -8,17 +8,10 @@ import copy # http://online.wsj.com/page/us_in_todays_paper.html -def filter_classes(x): - if not x: - return False - bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools', 'printSummary', 'mostPopular', 'relatedCollection'} - classes = frozenset(x.split()) - return len(bad_classes.intersection(classes)) > 0 - class WallStreetJournal(BasicNewsRecipe): title = 'The Wall Street Journal' - __author__ = 'Kovid Goyal, Sujata Raman, and Joshua Oster-Morris' + __author__ = 'Kovid Goyal and Joshua Oster-Morris' description = 'News and current affairs' needs_subscription = True language = 'en' @@ -39,23 +32,16 @@ class WallStreetJournal(BasicNewsRecipe): .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small} h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; } .paperLocation{color:#666666; font-size:xx-small}''' - - remove_tags_before = dict(name='h1') + keep_only_tags = [ + dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}), + dict(name='span', itemprop='author', rel='author'), + dict(name='article', id='articleBody'), + dict(name='div', id='article_story_body'), + ] remove_tags = [ - dict(id=["articleTabs_tab_article", - "articleTabs_tab_comments", 'msnLinkback', 'yahooLinkback', - 'articleTabs_panel_comments', 'footer', 'emailThisScrim', 'emailConfScrim', 'emailErrorScrim', - "articleTabs_tab_interactive", "articleTabs_tab_video", - "articleTabs_tab_map", "articleTabs_tab_slideshow", - "articleTabs_tab_quotes", "articleTabs_tab_document", - "printModeAd", "aFbLikeAuth", "videoModule", - "mostRecommendations", "topDiscussions"]), - {'class':['footer_columns','hidden', 'network','insetCol3wide','interactive','video','slideshow','map','insettip', - 'insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]}, - dict(rel='shortcut icon'), - {'class':filter_classes}, - ] - remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},] + dict(attrs={'class':['insetButton', 'insettipBox']}), + dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), + ] use_javascript_to_login = True @@ -72,15 +58,12 @@ class WallStreetJournal(BasicNewsRecipe): if picdiv is not None: self.add_toc_thumbnail(article,picdiv['src']) - def postprocess_html(self, soup, first): - for tag in soup.findAll(name=['table', 'tr', 'td']): - tag.name = 'div' - - for tag in soup.findAll('div', dict(id=[ - "articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", - "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", - "articleThumbnail_7"])): - tag.extract() + def preprocess_html(self, soup): + # Remove thumbnail for zoomable images + for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}): + img = div.find('img') + if img is not None: + img.extract() return soup diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 5f3cf476c7..eea9789f79 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -33,21 +33,16 @@ class WallStreetJournal(BasicNewsRecipe): h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; } .paperLocation{color:#666666; font-size:xx-small}''' - remove_tags_before = dict(name='h1') + keep_only_tags = [ + dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}), + dict(name='span', itemprop='author', rel='author'), + dict(name='article', id='articleBody'), + dict(name='div', id='article_story_body'), + ] remove_tags = [ - dict(id=["articleTabs_tab_article", - "articleTabs_tab_comments", - "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow", - "articleTabs_tab_quotes"]), - {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]}, - dict(name='div', attrs={'data-flash-settings':True}), - {'class':['insetContent embedType-interactive insetCol3wide','insetCol6wide','insettipUnit']}, - dict(rel='shortcut icon'), - {'class':lambda x: x and 'sTools' in x}, - {'class':lambda x: x and 'printSummary' in x}, - {'class':lambda x: x and 'mostPopular' in x}, - ] - remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},] + dict(attrs={'class':['insetButton', 'insettipBox']}), + dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), + ] def populate_article_metadata(self, article, soup, first): if first and hasattr(self, 'add_toc_thumbnail'): @@ -55,12 +50,12 @@ class WallStreetJournal(BasicNewsRecipe): if picdiv is not None: self.add_toc_thumbnail(article,picdiv['src']) - def postprocess_html(self, soup, first): - for tag in soup.findAll(name=['table', 'tr', 'td']): - tag.name = 'div' - - for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])): - tag.extract() + def preprocess_html(self, soup): + # Remove thumbnail for zoomable images + for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}): + img = div.find('img') + if img is not None: + img.extract() return soup @@ -69,7 +64,6 @@ class WallStreetJournal(BasicNewsRecipe): href = 'http://online.wsj.com' + href return href - def wsj_get_index(self): return self.index_to_soup('http://online.wsj.com/itp') @@ -83,7 +77,7 @@ class WallStreetJournal(BasicNewsRecipe): except: articles = [] if articles: - feeds.append((title, articles)) + feeds.append((title, articles)) return feeds def parse_index(self): @@ -99,16 +93,16 @@ class WallStreetJournal(BasicNewsRecipe): for a in div.findAll('a', href=lambda x: x and '/itp/' in x): pageone = a['href'].endswith('pageone') if pageone: - title = 'Front Section' - url = self.abs_wsj_url(a['href']) - feeds = self.wsj_add_feed(feeds,title,url) - title = 'What''s News' - url = url.replace('pageone','whatsnews') - feeds = self.wsj_add_feed(feeds,title,url) + title = 'Front Section' + url = self.abs_wsj_url(a['href']) + feeds = self.wsj_add_feed(feeds,title,url) + title = 'What''s News' + url = url.replace('pageone','whatsnews') + feeds = self.wsj_add_feed(feeds,title,url) else: - title = self.tag_to_string(a) - url = self.abs_wsj_url(a['href']) - feeds = self.wsj_add_feed(feeds,title,url) + title = self.tag_to_string(a) + url = self.abs_wsj_url(a['href']) + feeds = self.wsj_add_feed(feeds,title,url) return feeds def wsj_find_wn_articles(self, url): @@ -117,21 +111,21 @@ class WallStreetJournal(BasicNewsRecipe): whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x}) if whats_news is not None: - for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x): - container = a.findParent(['p']) - meta = a.find(attrs={'class':'meta_sectionName'}) - if meta is not None: - meta.extract() - title = self.tag_to_string(a).strip() - url = a['href'] - desc = '' - if container is not None: - desc = self.tag_to_string(container) + for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x): + container = a.findParent(['p']) + meta = a.find(attrs={'class':'meta_sectionName'}) + if meta is not None: + meta.extract() + title = self.tag_to_string(a).strip() + url = a['href'] + desc = '' + if container is not None: + desc = self.tag_to_string(container) - articles.append({'title':title, 'url':url, - 'description':desc, 'date':''}) + articles.append({'title':title, 'url':url, + 'description':desc, 'date':''}) - self.log('\tFound WN article:', title) + self.log('\tFound WN article:', title) return articles @@ -140,18 +134,18 @@ class WallStreetJournal(BasicNewsRecipe): whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x}) if whats_news is not None: - whats_news.extract() + whats_news.extract() articles = [] flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x}) if flavorarea is not None: - flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article')) - if flavorstory is not None: - flavorstory['class'] = 'mjLinkItem' - metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x}) - if metapage is not None: - flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page + flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article')) + if flavorstory is not None: + flavorstory['class'] = 'mjLinkItem' + metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x}) + if metapage is not None: + flavorstory.append(copy.copy(metapage)) # metapage should always be A1 because that should be first on the page for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True): container = a.findParent(['li', 'div']) @@ -176,5 +170,3 @@ class WallStreetJournal(BasicNewsRecipe): self.log('\tFound article:', title) return articles - -