From bec396158afb520b487adf6091df42bdbf3eb18c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Oct 2013 22:03:43 +0530
Subject: [PATCH] Update Wall Street Journal

See #1239477 (Private bug)
---
 recipes/wsj.recipe      | 49 +++++++--------------
 recipes/wsj_free.recipe | 98 +++++++++++++++++++----------------------
 2 files changed, 61 insertions(+), 86 deletions(-)

diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index 8c68668745..c138fb2a04 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -8,17 +8,10 @@ import copy
 
 # http://online.wsj.com/page/us_in_todays_paper.html
 
-def filter_classes(x):
-    if not x:
-        return False
-    bad_classes = {'articleInsetPoll', 'trendingNow', 'sTools', 'printSummary', 'mostPopular', 'relatedCollection'}
-    classes = frozenset(x.split())
-    return len(bad_classes.intersection(classes)) > 0
-
 class WallStreetJournal(BasicNewsRecipe):
 
     title = 'The Wall Street Journal'
-    __author__ = 'Kovid Goyal, Sujata Raman, and Joshua Oster-Morris'
+    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
     description = 'News and current affairs'
     needs_subscription = True
     language = 'en'
@@ -39,23 +32,16 @@ class WallStreetJournal(BasicNewsRecipe):
                         .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                         h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
                     .paperLocation{color:#666666; font-size:xx-small}'''
-
-    remove_tags_before = dict(name='h1')
+    keep_only_tags = [
+        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='articleBody'),
+        dict(name='div', id='article_story_body'),
+    ]
     remove_tags = [
-                    dict(id=["articleTabs_tab_article",
-                             "articleTabs_tab_comments", 'msnLinkback', 'yahooLinkback',
-                        'articleTabs_panel_comments', 'footer', 'emailThisScrim', 'emailConfScrim', 'emailErrorScrim',
-                        "articleTabs_tab_interactive", "articleTabs_tab_video",
-                        "articleTabs_tab_map", "articleTabs_tab_slideshow",
-                        "articleTabs_tab_quotes", "articleTabs_tab_document",
-                        "printModeAd", "aFbLikeAuth", "videoModule",
-                        "mostRecommendations", "topDiscussions"]),
-                    {'class':['footer_columns','hidden', 'network','insetCol3wide','interactive','video','slideshow','map','insettip',
-                        'insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
-                    dict(rel='shortcut icon'),
-                    {'class':filter_classes},
-                    ]
-    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
+        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
+    ]
 
     use_javascript_to_login = True
 
@@ -72,15 +58,12 @@ class WallStreetJournal(BasicNewsRecipe):
             if picdiv is not None:
                 self.add_toc_thumbnail(article,picdiv['src'])
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['table', 'tr', 'td']):
-            tag.name = 'div'
-
-        for tag in soup.findAll('div', dict(id=[
-            "articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3",
-            "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6",
-            "articleThumbnail_7"])):
-            tag.extract()
+    def preprocess_html(self, soup):
+        # Remove the thumbnail <img> (first img in each insetZoomTargetBox div) for zoomable images
+        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
+            img = div.find('img')
+            if img is not None:
+                img.extract()
 
         return soup
 
diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe
index 5f3cf476c7..eea9789f79 100644
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -33,21 +33,16 @@ class WallStreetJournal(BasicNewsRecipe):
                         h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
                     .paperLocation{color:#666666; font-size:xx-small}'''
 
-    remove_tags_before = dict(name='h1')
+    keep_only_tags = [
+        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='articleBody'),
+        dict(name='div', id='article_story_body'),
+    ]
     remove_tags = [
-                    dict(id=["articleTabs_tab_article",
-                        "articleTabs_tab_comments",
-                        "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow",
-                        "articleTabs_tab_quotes"]),
-                    {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
-                    dict(name='div', attrs={'data-flash-settings':True}),
-                    {'class':['insetContent embedType-interactive insetCol3wide','insetCol6wide','insettipUnit']},
-                    dict(rel='shortcut icon'),
-                    {'class':lambda x: x and 'sTools' in x},
-                    {'class':lambda x: x and 'printSummary' in x},
-                    {'class':lambda x: x and 'mostPopular' in x},
-                    ]
-    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
+        dict(attrs={'class':['insetButton', 'insettipBox']}),
+        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
+    ]
 
     def populate_article_metadata(self, article, soup, first):
         if first and hasattr(self, 'add_toc_thumbnail'):
@@ -55,12 +50,12 @@ class WallStreetJournal(BasicNewsRecipe):
             if picdiv is not None:
                 self.add_toc_thumbnail(article,picdiv['src'])
 
-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['table', 'tr', 'td']):
-            tag.name = 'div'
-
-        for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
-            tag.extract()
+    def preprocess_html(self, soup):
+        # Remove the thumbnail <img> (first img in each insetZoomTargetBox div) for zoomable images
+        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
+            img = div.find('img')
+            if img is not None:
+                img.extract()
 
         return soup
 
@@ -69,7 +64,6 @@ class WallStreetJournal(BasicNewsRecipe):
             href = 'http://online.wsj.com' + href
         return href
 
-
     def wsj_get_index(self):
         return self.index_to_soup('http://online.wsj.com/itp')
 
@@ -83,7 +77,7 @@ class WallStreetJournal(BasicNewsRecipe):
         except:
             articles = []
         if articles:
-           feeds.append((title, articles))
+            feeds.append((title, articles))
         return feeds
 
     def parse_index(self):
@@ -99,16 +93,16 @@ class WallStreetJournal(BasicNewsRecipe):
         for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
             pageone = a['href'].endswith('pageone')
             if pageone:
-               title = 'Front Section'
-               url = self.abs_wsj_url(a['href'])
-               feeds = self.wsj_add_feed(feeds,title,url)
-               title = 'What''s News'
-               url = url.replace('pageone','whatsnews')
-               feeds = self.wsj_add_feed(feeds,title,url)
+                title = 'Front Section'
+                url = self.abs_wsj_url(a['href'])
+                feeds = self.wsj_add_feed(feeds,title,url)
+                title = "What's News"  # was 'What''s News': adjacent literals concatenate to "Whats News"
+                url = url.replace('pageone','whatsnews')
+                feeds = self.wsj_add_feed(feeds,title,url)
             else:
-               title = self.tag_to_string(a)
-               url = self.abs_wsj_url(a['href'])
-               feeds = self.wsj_add_feed(feeds,title,url)
+                title = self.tag_to_string(a)
+                url = self.abs_wsj_url(a['href'])
+                feeds = self.wsj_add_feed(feeds,title,url)
         return feeds
 
     def wsj_find_wn_articles(self, url):
@@ -117,21 +111,21 @@ class WallStreetJournal(BasicNewsRecipe):
 
         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
         if whats_news is not None:
-          for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
-            container = a.findParent(['p'])
-            meta = a.find(attrs={'class':'meta_sectionName'})
-            if meta is not None:
-                meta.extract()
-            title = self.tag_to_string(a).strip()
-            url = a['href']
-            desc = ''
-            if container is not None:
-                desc = self.tag_to_string(container)
+            for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
+                container = a.findParent(['p'])
+                meta = a.find(attrs={'class':'meta_sectionName'})
+                if meta is not None:
+                    meta.extract()
+                title = self.tag_to_string(a).strip()
+                url = a['href']
+                desc = ''
+                if container is not None:
+                    desc = self.tag_to_string(container)
 
-            articles.append({'title':title, 'url':url,
-                'description':desc, 'date':''})
+                articles.append({'title':title, 'url':url,
+                    'description':desc, 'date':''})
 
-            self.log('\tFound WN article:', title)
+                self.log('\tFound WN article:', title)
 
         return articles
 
@@ -140,18 +134,18 @@ class WallStreetJournal(BasicNewsRecipe):
 
         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
         if whats_news is not None:
-           whats_news.extract()
+            whats_news.extract()
 
         articles = []
 
         flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
         if flavorarea is not None:
-           flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
-           if flavorstory is not None:
-              flavorstory['class'] = 'mjLinkItem'
-              metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
-              if metapage is not None:
-                 flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page
+            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
+            if flavorstory is not None:
+                flavorstory['class'] = 'mjLinkItem'
+                metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
+                if metapage is not None:
+                    flavorstory.append(copy.copy(metapage))  # metapage should always be A1 because that should be first on the page
 
         for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
             container = a.findParent(['li', 'div'])
@@ -176,5 +170,3 @@ class WallStreetJournal(BasicNewsRecipe):
             self.log('\tFound article:', title)
 
         return articles
-
-