Update WSJ login procedure to match current practice

This commit is contained in:
Kovid Goyal 2022-05-24 10:59:53 +05:30
parent 225c58393a
commit c1b908825e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 66 additions and 60 deletions

View File

@ -3,16 +3,23 @@
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json
import json, time, random import random
import time
from base64 import standard_b64encode from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from css_selectors import Select from css_selectors import Select
# WSJ has started delivering the paywalled content encrypted, even for logged-in subscribers.
# The content is then decrypted via JavaScript and displayed.
# I could, in theory, reverse engineer their JavaScript and decrypt the content in the recipe,
# but this is too much effort, at least for me. If anybody wants to take a stab at it, feel free:
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
# You can get the encrypted data from any WSJ paywalled article page by searching for encryptedDataHash in the HTML.
#
try: try:
import urllib.parse as urlparse import urllib.parse as urlparse
except ImportError: except ImportError:
@ -26,25 +33,6 @@ except ImportError:
needs_subscription = True needs_subscription = True
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def prefixed_classes(classes):
q = frozenset(classes.split(' '))
def matcher(x):
if x:
for candidate in frozenset(x.split()):
for x in q:
if candidate.startswith(x):
return True
return False
return {'attrs': {'class': matcher}}
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()), dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
classes('nc-exp-artbody errorNotFound'), classes('nc-exp-artbody errorNotFound'),
dict(attrs={'data-module-zone': 'article_snippet'}), dict(attrs={'data-module-zone': 'article_snippet'}),
prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'), prefixed_classes(
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
] ]
remove_tags = [ remove_tags = [
@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint
pprint
# To understand the login logic read app-min.js from # To understand the login logic read app-min.js from
# https://sso.accounts.dowjones.com/login # https://sso.accounts.dowjones.com/login
itp = quote(self.WSJ_ITP, safe='') itp = quote(self.WSJ_ITP, safe='')
start_url = 'https://accounts.wsj.com/login?target=' + itp start_url = 'https://accounts.wsj.com/login?target=' + itp
self.log('Starting login process...') self.log('Starting login process at', start_url)
br = self.get_browser_for_wsj(*a, **kw) br = self.get_browser_for_wsj(*a, **kw)
# br.set_debug_http(True)
res = br.open(start_url) res = br.open(start_url)
sso_url = res.geturl() sso_url = res.geturl()
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query) query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
query = {k:v[0] for k, v in query.items()} query = {k:v[0] for k, v in query.items()}
# pprint(query)
request_query = { request_query = {
'username': self.username, 'username': self.username,
'password': self.password, 'password': self.password,
'client_id': query['client'], 'client_id': query['client'],
'sso': 'true',
'tenant': 'sso', 'tenant': 'sso',
'_intstate': 'deprecated', '_intstate': 'deprecated',
'connection': 'DJldap', 'connection': 'DJldap',
'headers': {
'X-REMOTE-USER': self.username,
'x-_dj-_client__id': query['client'],
},
} }
for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split(): for cookie in br.cookiejar:
if cookie.name == '_csrf':
request_query[cookie.name] = cookie.value
for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
if k in query: if k in query:
request_query[k] = query[k] request_query[k] = query[k]
# pprint(request_query)
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login' login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
# you can get the version below from lib-min.js # you can get the version below from lib-min.js
# search for: str: "x.x.x" # search for: "\d+\.\d+\.\d+"
# This might need to be updated in the future # This might need to be updated in the future
auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"}) auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
if not isinstance(auth0_client, bytes): if not isinstance(auth0_client, bytes):
auth0_client = auth0_client.encode('utf-8') auth0_client = auth0_client.encode('utf-8')
auth0_client = standard_b64encode(auth0_client) auth0_client = standard_b64encode(auth0_client)
@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
rq = Request(login_url, headers={ rq = Request(login_url, headers={
'Accept': 'text/html', 'Accept': 'text/html',
'Accept-Language': 'en-US,en;q=0.8', 'Accept-Language': 'en-US,en;q=0.8',
'Origin': 'https://sso.accounts.dowjones.com',
'Auth0-Client': auth0_client.rstrip('='), 'Auth0-Client': auth0_client.rstrip('='),
'X-HTTP-Method-Override': 'POST', 'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
'X-Remote-User': self.username 'X-Remote-User': self.username,
'x-dj-client_id': request_query['client_id'],
}, data=request_query) }, data=request_query)
self.log('Sending login request...') self.log('Sending login request...')
try: try:

View File

@ -3,16 +3,23 @@
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json
import json, time, random import random
import time
from base64 import standard_b64encode from base64 import standard_b64encode
from datetime import date, timedelta from datetime import date, timedelta
from mechanize import Request from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from css_selectors import Select from css_selectors import Select
# WSJ has started delivering the paywalled content encrypted, even for logged-in subscribers.
# The content is then decrypted via JavaScript and displayed.
# I could, in theory, reverse engineer their JavaScript and decrypt the content in the recipe,
# but this is too much effort, at least for me. If anybody wants to take a stab at it, feel free:
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
# You can get the encrypted data from any WSJ paywalled article page by searching for encryptedDataHash in the HTML.
#
try: try:
import urllib.parse as urlparse import urllib.parse as urlparse
except ImportError: except ImportError:
@ -26,25 +33,6 @@ except ImportError:
needs_subscription = False needs_subscription = False
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def prefixed_classes(classes):
q = frozenset(classes.split(' '))
def matcher(x):
if x:
for candidate in frozenset(x.split()):
for x in q:
if candidate.startswith(x):
return True
return False
return {'attrs': {'class': matcher}}
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()), dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
classes('nc-exp-artbody errorNotFound'), classes('nc-exp-artbody errorNotFound'),
dict(attrs={'data-module-zone': 'article_snippet'}), dict(attrs={'data-module-zone': 'article_snippet'}),
prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'), prefixed_classes(
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
] ]
remove_tags = [ remove_tags = [
@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
if needs_subscription: if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
from pprint import pprint
pprint
# To understand the login logic read app-min.js from # To understand the login logic read app-min.js from
# https://sso.accounts.dowjones.com/login # https://sso.accounts.dowjones.com/login
itp = quote(self.WSJ_ITP, safe='') itp = quote(self.WSJ_ITP, safe='')
start_url = 'https://accounts.wsj.com/login?target=' + itp start_url = 'https://accounts.wsj.com/login?target=' + itp
self.log('Starting login process...') self.log('Starting login process at', start_url)
br = self.get_browser_for_wsj(*a, **kw) br = self.get_browser_for_wsj(*a, **kw)
# br.set_debug_http(True)
res = br.open(start_url) res = br.open(start_url)
sso_url = res.geturl() sso_url = res.geturl()
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query) query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
query = {k:v[0] for k, v in query.items()} query = {k:v[0] for k, v in query.items()}
# pprint(query)
request_query = { request_query = {
'username': self.username, 'username': self.username,
'password': self.password, 'password': self.password,
'client_id': query['client'], 'client_id': query['client'],
'sso': 'true',
'tenant': 'sso', 'tenant': 'sso',
'_intstate': 'deprecated', '_intstate': 'deprecated',
'connection': 'DJldap', 'connection': 'DJldap',
'headers': {
'X-REMOTE-USER': self.username,
'x-_dj-_client__id': query['client'],
},
} }
for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split(): for cookie in br.cookiejar:
if cookie.name == '_csrf':
request_query[cookie.name] = cookie.value
for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
if k in query: if k in query:
request_query[k] = query[k] request_query[k] = query[k]
# pprint(request_query)
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login' login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
# you can get the version below from lib-min.js # you can get the version below from lib-min.js
# search for: str: "x.x.x" # search for: "\d+\.\d+\.\d+"
# This might need to be updated in the future # This might need to be updated in the future
auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"}) auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
if not isinstance(auth0_client, bytes): if not isinstance(auth0_client, bytes):
auth0_client = auth0_client.encode('utf-8') auth0_client = auth0_client.encode('utf-8')
auth0_client = standard_b64encode(auth0_client) auth0_client = standard_b64encode(auth0_client)
@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
rq = Request(login_url, headers={ rq = Request(login_url, headers={
'Accept': 'text/html', 'Accept': 'text/html',
'Accept-Language': 'en-US,en;q=0.8', 'Accept-Language': 'en-US,en;q=0.8',
'Origin': 'https://sso.accounts.dowjones.com',
'Auth0-Client': auth0_client.rstrip('='), 'Auth0-Client': auth0_client.rstrip('='),
'X-HTTP-Method-Override': 'POST', 'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
'X-Remote-User': self.username 'X-Remote-User': self.username,
'x-dj-client_id': request_query['client_id'],
}, data=request_query) }, data=request_query)
self.log('Sending login request...') self.log('Sending login request...')
try: try: