mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Update WSJ login procedure to match current practice
This commit is contained in:
parent
225c58393a
commit
c1b908825e
@ -3,16 +3,23 @@
|
|||||||
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
import json
|
||||||
import json, time, random
|
import random
|
||||||
|
import time
|
||||||
from base64 import standard_b64encode
|
from base64 import standard_b64encode
|
||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
|
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
|
# WSJ has started delivering the paywalled content encrypted, even for logged-in subscribers.
|
||||||
|
# The content is then decrypted via javascript and displayed.
|
||||||
|
# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
|
||||||
|
# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
|
||||||
|
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
|
||||||
|
# You can get the encrypted data from any paywalled WSJ article page by searching for encryptedDataHash in the HTML.
|
||||||
|
#
|
||||||
try:
|
try:
|
||||||
import urllib.parse as urlparse
|
import urllib.parse as urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -26,25 +33,6 @@ except ImportError:
|
|||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
return dict(attrs={
|
|
||||||
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
|
||||||
|
|
||||||
|
|
||||||
def prefixed_classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
|
|
||||||
def matcher(x):
|
|
||||||
if x:
|
|
||||||
for candidate in frozenset(x.split()):
|
|
||||||
for x in q:
|
|
||||||
if candidate.startswith(x):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
return {'attrs': {'class': matcher}}
|
|
||||||
|
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
|
|||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
classes('nc-exp-artbody errorNotFound'),
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
dict(attrs={'data-module-zone': 'article_snippet'}),
|
||||||
prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'),
|
prefixed_classes(
|
||||||
|
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
|
||||||
|
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
|
from pprint import pprint
|
||||||
|
pprint
|
||||||
# To understand the login logic read app-min.js from
|
# To understand the login logic read app-min.js from
|
||||||
# https://sso.accounts.dowjones.com/login
|
# https://sso.accounts.dowjones.com/login
|
||||||
itp = quote(self.WSJ_ITP, safe='')
|
itp = quote(self.WSJ_ITP, safe='')
|
||||||
start_url = 'https://accounts.wsj.com/login?target=' + itp
|
start_url = 'https://accounts.wsj.com/login?target=' + itp
|
||||||
self.log('Starting login process...')
|
self.log('Starting login process at', start_url)
|
||||||
br = self.get_browser_for_wsj(*a, **kw)
|
br = self.get_browser_for_wsj(*a, **kw)
|
||||||
|
# br.set_debug_http(True)
|
||||||
res = br.open(start_url)
|
res = br.open(start_url)
|
||||||
sso_url = res.geturl()
|
sso_url = res.geturl()
|
||||||
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
|
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
|
||||||
query = {k:v[0] for k, v in query.items()}
|
query = {k:v[0] for k, v in query.items()}
|
||||||
|
# pprint(query)
|
||||||
request_query = {
|
request_query = {
|
||||||
'username': self.username,
|
'username': self.username,
|
||||||
'password': self.password,
|
'password': self.password,
|
||||||
'client_id': query['client'],
|
'client_id': query['client'],
|
||||||
'sso': 'true',
|
|
||||||
'tenant': 'sso',
|
'tenant': 'sso',
|
||||||
'_intstate': 'deprecated',
|
'_intstate': 'deprecated',
|
||||||
'connection': 'DJldap',
|
'connection': 'DJldap',
|
||||||
|
'headers': {
|
||||||
|
'X-REMOTE-USER': self.username,
|
||||||
|
'x-_dj-_client__id': query['client'],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
|
for cookie in br.cookiejar:
|
||||||
|
if cookie.name == '_csrf':
|
||||||
|
request_query[cookie.name] = cookie.value
|
||||||
|
for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
|
||||||
if k in query:
|
if k in query:
|
||||||
request_query[k] = query[k]
|
request_query[k] = query[k]
|
||||||
|
# pprint(request_query)
|
||||||
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
|
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
|
||||||
# you can get the version below from lib-min.js
|
# you can get the version below from lib-min.js
|
||||||
# search for: str: "x.x.x"
|
# search for: "\d+\.\d+\.\d+"
|
||||||
# This might need to be updated in the future
|
# This might need to be updated in the future
|
||||||
auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
|
auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
|
||||||
if not isinstance(auth0_client, bytes):
|
if not isinstance(auth0_client, bytes):
|
||||||
auth0_client = auth0_client.encode('utf-8')
|
auth0_client = auth0_client.encode('utf-8')
|
||||||
auth0_client = standard_b64encode(auth0_client)
|
auth0_client = standard_b64encode(auth0_client)
|
||||||
@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
|
|||||||
rq = Request(login_url, headers={
|
rq = Request(login_url, headers={
|
||||||
'Accept': 'text/html',
|
'Accept': 'text/html',
|
||||||
'Accept-Language': 'en-US,en;q=0.8',
|
'Accept-Language': 'en-US,en;q=0.8',
|
||||||
|
'Origin': 'https://sso.accounts.dowjones.com',
|
||||||
'Auth0-Client': auth0_client.rstrip('='),
|
'Auth0-Client': auth0_client.rstrip('='),
|
||||||
'X-HTTP-Method-Override': 'POST',
|
'X-HTTP-Method-Override': 'POST',
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
'X-Remote-User': self.username
|
'X-Remote-User': self.username,
|
||||||
|
'x-dj-client_id': request_query['client_id'],
|
||||||
}, data=request_query)
|
}, data=request_query)
|
||||||
self.log('Sending login request...')
|
self.log('Sending login request...')
|
||||||
try:
|
try:
|
||||||
|
@ -3,16 +3,23 @@
|
|||||||
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
import json
|
||||||
import json, time, random
|
import random
|
||||||
|
import time
|
||||||
from base64 import standard_b64encode
|
from base64 import standard_b64encode
|
||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
|
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
|
# WSJ has started delivering the paywalled content encrypted, even for logged-in subscribers.
|
||||||
|
# The content is then decrypted via javascript and displayed.
|
||||||
|
# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
|
||||||
|
# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
|
||||||
|
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
|
||||||
|
# You can get the encrypted data from any paywalled WSJ article page by searching for encryptedDataHash in the HTML.
|
||||||
|
#
|
||||||
try:
|
try:
|
||||||
import urllib.parse as urlparse
|
import urllib.parse as urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -26,25 +33,6 @@ except ImportError:
|
|||||||
needs_subscription = False
|
needs_subscription = False
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
return dict(attrs={
|
|
||||||
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
|
||||||
|
|
||||||
|
|
||||||
def prefixed_classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
|
|
||||||
def matcher(x):
|
|
||||||
if x:
|
|
||||||
for candidate in frozenset(x.split()):
|
|
||||||
for x in q:
|
|
||||||
if candidate.startswith(x):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
return {'attrs': {'class': matcher}}
|
|
||||||
|
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
@ -72,7 +60,9 @@ class WSJ(BasicNewsRecipe):
|
|||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
classes('nc-exp-artbody errorNotFound'),
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
dict(attrs={'data-module-zone': 'article_snippet'}),
|
||||||
prefixed_classes('Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container- ArticleTimestamp__Timestamp- ArticleBody__Container-'),
|
prefixed_classes(
|
||||||
|
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
|
||||||
|
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
@ -119,33 +109,44 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
if needs_subscription:
|
if needs_subscription:
|
||||||
def get_browser(self, *a, **kw):
|
def get_browser(self, *a, **kw):
|
||||||
|
from pprint import pprint
|
||||||
|
pprint
|
||||||
# To understand the login logic read app-min.js from
|
# To understand the login logic read app-min.js from
|
||||||
# https://sso.accounts.dowjones.com/login
|
# https://sso.accounts.dowjones.com/login
|
||||||
itp = quote(self.WSJ_ITP, safe='')
|
itp = quote(self.WSJ_ITP, safe='')
|
||||||
start_url = 'https://accounts.wsj.com/login?target=' + itp
|
start_url = 'https://accounts.wsj.com/login?target=' + itp
|
||||||
self.log('Starting login process...')
|
self.log('Starting login process at', start_url)
|
||||||
br = self.get_browser_for_wsj(*a, **kw)
|
br = self.get_browser_for_wsj(*a, **kw)
|
||||||
|
# br.set_debug_http(True)
|
||||||
res = br.open(start_url)
|
res = br.open(start_url)
|
||||||
sso_url = res.geturl()
|
sso_url = res.geturl()
|
||||||
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
|
query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
|
||||||
query = {k:v[0] for k, v in query.items()}
|
query = {k:v[0] for k, v in query.items()}
|
||||||
|
# pprint(query)
|
||||||
request_query = {
|
request_query = {
|
||||||
'username': self.username,
|
'username': self.username,
|
||||||
'password': self.password,
|
'password': self.password,
|
||||||
'client_id': query['client'],
|
'client_id': query['client'],
|
||||||
'sso': 'true',
|
|
||||||
'tenant': 'sso',
|
'tenant': 'sso',
|
||||||
'_intstate': 'deprecated',
|
'_intstate': 'deprecated',
|
||||||
'connection': 'DJldap',
|
'connection': 'DJldap',
|
||||||
|
'headers': {
|
||||||
|
'X-REMOTE-USER': self.username,
|
||||||
|
'x-_dj-_client__id': query['client'],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
|
for cookie in br.cookiejar:
|
||||||
|
if cookie.name == '_csrf':
|
||||||
|
request_query[cookie.name] = cookie.value
|
||||||
|
for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
|
||||||
if k in query:
|
if k in query:
|
||||||
request_query[k] = query[k]
|
request_query[k] = query[k]
|
||||||
|
# pprint(request_query)
|
||||||
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
|
login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
|
||||||
# you can get the version below from lib-min.js
|
# you can get the version below from lib-min.js
|
||||||
# search for: str: "x.x.x"
|
# search for: "\d+\.\d+\.\d+"
|
||||||
# This might need to be updated in the future
|
# This might need to be updated in the future
|
||||||
auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
|
auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
|
||||||
if not isinstance(auth0_client, bytes):
|
if not isinstance(auth0_client, bytes):
|
||||||
auth0_client = auth0_client.encode('utf-8')
|
auth0_client = auth0_client.encode('utf-8')
|
||||||
auth0_client = standard_b64encode(auth0_client)
|
auth0_client = standard_b64encode(auth0_client)
|
||||||
@ -154,10 +155,12 @@ class WSJ(BasicNewsRecipe):
|
|||||||
rq = Request(login_url, headers={
|
rq = Request(login_url, headers={
|
||||||
'Accept': 'text/html',
|
'Accept': 'text/html',
|
||||||
'Accept-Language': 'en-US,en;q=0.8',
|
'Accept-Language': 'en-US,en;q=0.8',
|
||||||
|
'Origin': 'https://sso.accounts.dowjones.com',
|
||||||
'Auth0-Client': auth0_client.rstrip('='),
|
'Auth0-Client': auth0_client.rstrip('='),
|
||||||
'X-HTTP-Method-Override': 'POST',
|
'X-HTTP-Method-Override': 'POST',
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
'X-Requested-With': 'XMLHttpRequest',
|
||||||
'X-Remote-User': self.username
|
'X-Remote-User': self.username,
|
||||||
|
'x-dj-client_id': request_query['client_id'],
|
||||||
}, data=request_query)
|
}, data=request_query)
|
||||||
self.log('Sending login request...')
|
self.log('Sending login request...')
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user