From c1b908825ea98e66bad011fdefb13a653c4666cd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 24 May 2022 10:59:53 +0530
Subject: [PATCH] Update WSJ login procedure to match current practice

---
 recipes/wsj.recipe      | 63 +++++++++++++++++++++--------------------
 recipes/wsj_free.recipe | 63 +++++++++++++++++++++--------------------
 2 files changed, 66 insertions(+), 60 deletions(-)
diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index 04762ec80b..cc0e84d99c 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -3,16 +3,23 @@
 # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
 
 from __future__ import absolute_import, division, print_function, unicode_literals
-
-import json, time, random
+import json
+import random
+import time
 from base64 import standard_b64encode
 from datetime import date, timedelta
-
 from mechanize import Request
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
 from css_selectors import Select
 
+# WSJ has started delivering paywalled content encrypted, even for logged-in subscribers.
+# The content is then decrypted via JavaScript and displayed.
+# I could in theory reverse engineer their JavaScript and decrypt the content in the recipe,
+# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free:
+# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
+# You can get the encrypted data from any WSJ paywalled article page by searching for encryptedDataHash in the HTML.
+#
 try:
     import urllib.parse as urlparse
 except ImportError:
@@ -26,25 +33,6 @@ except ImportError:
 needs_subscription = True
 
 
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-def prefixed_classes(classes):
-    q = frozenset(classes.split(' '))
-
-    def matcher(x):
-        if x:
-            for candidate in frozenset(x.split()):
-                for x in q:
-                    if candidate.startswith(x):
-                        return True
-        return False
-    return {'attrs': {'class': matcher}}
-
-
 class WSJ(BasicNewsRecipe):
 
     if needs_subscription:
@@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
         dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
         classes('nc-exp-artbody errorNotFound'),
         dict(attrs={'data-module-zone': 'article_snippet'}),
-        prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'),
+        prefixed_classes(
+            'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
+            ' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
     ]
 
     remove_tags = [
@@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
 
     if needs_subscription:
         def get_browser(self, *a, **kw):
+            from pprint import pprint
+            pprint
             # To understand the login logic read app-min.js from
             # https://sso.accounts.dowjones.com/login
             itp = quote(self.WSJ_ITP, safe='')
             start_url = 'https://accounts.wsj.com/login?target=' + itp
-            self.log('Starting login process...')
+            self.log('Starting login process at', start_url)
             br = self.get_browser_for_wsj(*a, **kw)
+            # br.set_debug_http(True)
             res = br.open(start_url)
             sso_url = res.geturl()
             query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
             query = {k:v[0] for k, v in query.items()}
+            # pprint(query)
             request_query = {
                 'username': self.username,
                 'password': self.password,
                 'client_id': query['client'],
-                'sso': 'true',
                 'tenant': 'sso',
                 '_intstate': 'deprecated',
                 'connection': 'DJldap',
+                'headers': {
+                    'X-REMOTE-USER': self.username,
+                    'x-_dj-_client__id': query['client'],
+                },
             }
-            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
+            for cookie in br.cookiejar:
+                if cookie.name == '_csrf':
+                    request_query[cookie.name] = cookie.value
+            for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
                 if k in query:
                     request_query[k] = query[k]
+            # pprint(request_query)
             login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
             # you can get the version below from lib-min.js
-            # search for: str: "x.x.x"
+            # search for: "\d+\.\d+\.\d+"
             # This might need to be updated in the future
-            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
+            auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
             if not isinstance(auth0_client, bytes):
                 auth0_client = auth0_client.encode('utf-8')
             auth0_client = standard_b64encode(auth0_client)
@@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
             rq = Request(login_url, headers={
                 'Accept': 'text/html',
                 'Accept-Language': 'en-US,en;q=0.8',
+                'Origin': 'https://sso.accounts.dowjones.com',
                 'Auth0-Client': auth0_client.rstrip('='),
                 'X-HTTP-Method-Override': 'POST',
                 'X-Requested-With': 'XMLHttpRequest',
-                'X-Remote-User': self.username
+                'X-Remote-User': self.username,
+                'x-dj-client_id': request_query['client_id'],
             }, data=request_query)
             self.log('Sending login request...')
             try:
diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe
index a9f87bbcec..0228c875ca 100644
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -3,16 +3,23 @@
 # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
 
 from __future__ import absolute_import, division, print_function, unicode_literals
-
-import json, time, random
+import json
+import random
+import time
 from base64 import standard_b64encode
 from datetime import date, timedelta
-
 from mechanize import Request
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
 from css_selectors import Select
 
+# WSJ has started delivering paywalled content encrypted, even for logged-in subscribers.
+# The content is then decrypted via JavaScript and displayed.
+# I could in theory reverse engineer their JavaScript and decrypt the content in the recipe,
+# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free:
+# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
+# You can get the encrypted data from any WSJ paywalled article page by searching for encryptedDataHash in the HTML.
+#
 try:
     import urllib.parse as urlparse
 except ImportError:
@@ -26,25 +33,6 @@ except ImportError:
 needs_subscription = False
 
 
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-def prefixed_classes(classes):
-    q = frozenset(classes.split(' '))
-
-    def matcher(x):
-        if x:
-            for candidate in frozenset(x.split()):
-                for x in q:
-                    if candidate.startswith(x):
-                        return True
-        return False
-    return {'attrs': {'class': matcher}}
-
-
 class WSJ(BasicNewsRecipe):
 
     if needs_subscription:
@@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
         dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
         classes('nc-exp-artbody errorNotFound'),
         dict(attrs={'data-module-zone': 'article_snippet'}),
-        prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'),
+        prefixed_classes(
+            'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
+            ' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
     ]
 
     remove_tags = [
@@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
 
     if needs_subscription:
         def get_browser(self, *a, **kw):
+            from pprint import pprint
+            pprint
             # To understand the login logic read app-min.js from
             # https://sso.accounts.dowjones.com/login
             itp = quote(self.WSJ_ITP, safe='')
             start_url = 'https://accounts.wsj.com/login?target=' + itp
-            self.log('Starting login process...')
+            self.log('Starting login process at', start_url)
             br = self.get_browser_for_wsj(*a, **kw)
+            # br.set_debug_http(True)
             res = br.open(start_url)
             sso_url = res.geturl()
             query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
             query = {k:v[0] for k, v in query.items()}
+            # pprint(query)
             request_query = {
                 'username': self.username,
                 'password': self.password,
                 'client_id': query['client'],
-                'sso': 'true',
                 'tenant': 'sso',
                 '_intstate': 'deprecated',
                 'connection': 'DJldap',
+                'headers': {
+                    'X-REMOTE-USER': self.username,
+                    'x-_dj-_client__id': query['client'],
+                },
             }
-            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
+            for cookie in br.cookiejar:
+                if cookie.name == '_csrf':
+                    request_query[cookie.name] = cookie.value
+            for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
                 if k in query:
                     request_query[k] = query[k]
+            # pprint(request_query)
             login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
             # you can get the version below from lib-min.js
-            # search for: str: "x.x.x"
+            # search for: "\d+\.\d+\.\d+"
             # This might need to be updated in the future
-            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
+            auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
             if not isinstance(auth0_client, bytes):
                 auth0_client = auth0_client.encode('utf-8')
             auth0_client = standard_b64encode(auth0_client)
@@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
             rq = Request(login_url, headers={
                 'Accept': 'text/html',
                 'Accept-Language': 'en-US,en;q=0.8',
+                'Origin': 'https://sso.accounts.dowjones.com',
                 'Auth0-Client': auth0_client.rstrip('='),
                 'X-HTTP-Method-Override': 'POST',
                 'X-Requested-With': 'XMLHttpRequest',
-                'X-Remote-User': self.username
+                'X-Remote-User': self.username,
+                'x-dj-client_id': request_query['client_id'],
             }, data=request_query)
             self.log('Sending login request...')
             try: