From 9a6671c3ce0669590b0b658d23928bd9aa21cb5b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 5 Sep 2022 07:59:35 +0530
Subject: [PATCH] Update The Wall Street Journal

---
 recipes/wsj.recipe      | 68 ++++++++++++++++-------------------
 recipes/wsj_free.recipe | 66 +++++++++++++++------------------
 2 files changed, 51 insertions(+), 83 deletions(-)

diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index 7b3db46b3c..762585b9c5 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -10,16 +10,9 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request
 
-from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 from css_selectors import Select
 
-# WSJ has started delivering the paywalled content encrypted even for logged in subscribers.
-# The content is then decrypted via javascript and displayed.
-# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
-# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
-# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
-# You can get the encrypted data from any wsj paywalled article page by searching for encryptedDataHash in the HTML.
-#
 try:
     import urllib.parse as urlparse
 except ImportError:
@@ -62,46 +55,37 @@ class WSJ(BasicNewsRecipe):
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
-    remove_attributes = ['style', 'data-scrim']
+    remove_attributes = ['style','height','width']
     needs_subscription = needs_subscription
     WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 
+    extra_css = '''
+        .imageCaption{font-size:small; text-align:center;}
+        .sub-head{font-style:italic; color:#404040;}
+        .bylineWrap{font-size:small; text-align:left;}
+    '''
+
     keep_only_tags = [
-        dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
-        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id='article-contents articleBody'.split()),
-        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
-        classes('nc-exp-artbody errorNotFound'),
-        dict(attrs={'data-module-zone': 'article_snippet'}),
-        prefixed_classes(
-            'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
-            ' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
+        classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero'),
+        dict(name='section', attrs={'subscriptions-section':'content'})
     ]
 
     remove_tags = [
-        dict(id='right-rail'),
-        dict(id='narrator-nav'),
-        dict(name='div', id='ad_and_popular'),
-        classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
-                ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
-        dict(name='span', attrs={
-            'data-country-code': True, 'data-ticker-code': True}),
-        dict(name='meta link button'.split()),
+        classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'),
+        dict(name='amp-iframe') # interactive graphics
     ]
 
-    def preprocess_soup(self, soup):
-        # Slideshow and expandable images need to be processed here to
-        # set the src attribute correctly
-        found = 0
-        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
-            img['src'] = img['data-in-base-data-lazy']
-            found += 1
-        for img in soup.findAll('img', attrs={'data-enlarge': True}):
-            img['src'] = img['data-enlarge']
-            found += 1
-        if found:
-            self.log.debug('Found %d dynamic images in:' % found)
+    def preprocess_html(self, soup):
+        for by in soup.findAll(**classes('bylineWrap')):
+            for p in by.findAll('p'):
+                p.name = 'span'
+        for img in soup.findAll('amp-img'):
+            img.name = 'img'
+            if img['src'] == 'https://s.wsj.net/img/meta/wsj-social-share.png':
+                img.extract()
+        h2 = soup.find('h2', attrs={'class':'sub-head'})
+        if h2:
+            h2.name = 'p'
         return soup
 
     def get_cover_url(self):
@@ -215,9 +199,9 @@ class WSJ(BasicNewsRecipe):
 
     def abs_wsj_url(self, href, modify_query=True):
         if not href.startswith('http'):
-            href = 'https://www.wsj.com' + href
+            href = 'https://www.wsj.com' + href.replace('/articles/', '/amp/articles/')
         if modify_query:
-            href = href.split('?')[0] + '?mod=djemalertNEWS'
+            href = href.replace('/articles/', '/amp/articles/')
         return href
 
     def wsj_find_articles(self, url, ahed=False):
@@ -289,7 +273,7 @@ class WSJ(BasicNewsRecipe):
             self.log.warn('No articles found in', url)
 
     def parse_index(self):
-        # return self.test_wsj_index()
+        return self.test_wsj_index()
         root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
         CSSSelect = Select(root)
         # from calibre.utils.ipython import ipython
diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe
index a841051f62..324c04807c 100644
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -10,16 +10,9 @@ from base64 import standard_b64encode
 from datetime import date, timedelta
 from mechanize import Request
 
-from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 from css_selectors import Select
 
-# WSJ has started delivering the paywalled content encrypted even for logged in subscribers.
-# The content is then decrypted via javascript and displayed.
-# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
-# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
-# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
-# You can get the encrypted data from any wsj paywalled article page by searching for encryptedDataHash in the HTML.
-#
 try:
     import urllib.parse as urlparse
 except ImportError:
@@ -62,46 +55,37 @@ class WSJ(BasicNewsRecipe):
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
-    remove_attributes = ['style', 'data-scrim']
+    remove_attributes = ['style','height','width']
     needs_subscription = needs_subscription
     WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 
+    extra_css = '''
+        .imageCaption{font-size:small; text-align:center;}
+        .sub-head{font-style:italic; color:#404040;}
+        .bylineWrap{font-size:small; text-align:left;}
+    '''
+
     keep_only_tags = [
-        dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
-        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
-        dict(name='span', itemprop='author', rel='author'),
-        dict(name='article', id='article-contents articleBody'.split()),
-        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
-        classes('nc-exp-artbody errorNotFound'),
-        dict(attrs={'data-module-zone': 'article_snippet'}),
-        prefixed_classes(
-            'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
-            ' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
+        classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero'),
+        dict(name='section', attrs={'subscriptions-section':'content'})
     ]
 
     remove_tags = [
-        dict(id='right-rail'),
-        dict(id='narrator-nav'),
-        dict(name='div', id='ad_and_popular'),
-        classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
-                ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
-        dict(name='span', attrs={
-            'data-country-code': True, 'data-ticker-code': True}),
-        dict(name='meta link button'.split()),
+        classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'),
+        dict(name='amp-iframe') # interactive graphics
     ]
 
-    def preprocess_soup(self, soup):
-        # Slideshow and expandable images need to be processed here to
-        # set the src attribute correctly
-        found = 0
-        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
-            img['src'] = img['data-in-base-data-lazy']
-            found += 1
-        for img in soup.findAll('img', attrs={'data-enlarge': True}):
-            img['src'] = img['data-enlarge']
-            found += 1
-        if found:
-            self.log.debug('Found %d dynamic images in:' % found)
+    def preprocess_html(self, soup):
+        for by in soup.findAll(**classes('bylineWrap')):
+            for p in by.findAll('p'):
+                p.name = 'span'
+        for img in soup.findAll('amp-img'):
+            img.name = 'img'
+            if img['src'] == 'https://s.wsj.net/img/meta/wsj-social-share.png':
+                img.extract()
+        h2 = soup.find('h2', attrs={'class':'sub-head'})
+        if h2:
+            h2.name = 'p'
         return soup
 
     def get_cover_url(self):
@@ -215,9 +199,9 @@ class WSJ(BasicNewsRecipe):
 
     def abs_wsj_url(self, href, modify_query=True):
         if not href.startswith('http'):
-            href = 'https://www.wsj.com' + href
+            href = 'https://www.wsj.com' + href.replace('/articles/', '/amp/articles/')
         if modify_query:
-            href = href.split('?')[0] + '?mod=djemalertNEWS'
+            href = href.replace('/articles/', '/amp/articles/')
         return href
 
     def wsj_find_articles(self, url, ahed=False):