Update WSJ

Fixes #1837213 [Private bug](https://bugs.launchpad.net/calibre/+bug/1837213)
This commit is contained in:
Kovid Goyal 2019-07-20 12:40:26 +05:30
parent 39a4fb8d26
commit a78bf9f21d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 112 additions and 88 deletions

View File

@ -5,10 +5,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
try: from base64 import standard_b64encode
from urllib.parse import quote
except ImportError:
from urllib import quote
from mechanize import Request from mechanize import Request
@ -16,6 +13,16 @@ from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try:
import urllib.parse as urlparse
except ImportError:
import urlparse
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
needs_subscription = True needs_subscription = True
@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim'] remove_attributes = ['style', 'data-scrim']
needs_subscription = needs_subscription needs_subscription = needs_subscription
WSJ_ITP = 'https://online.wsj.com/itp/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
keep_only_tags = [ keep_only_tags = [
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')), dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
@ -87,51 +94,56 @@ class WSJ(BasicNewsRecipe):
# login {{{ # login {{{
if needs_subscription: if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
# To understand the signin logic read signin.js from # To understand the login logic read app-min.js from
# https://id.wsj.com/access/pages/wsj/us/signin.html # https://sso.accounts.dowjones.com/login
# This is the same login service as used by Barrons itp = quote(self.WSJ_ITP, safe='')
start_url = 'https://accounts.wsj.com/login?target=' + itp
kw['user_agent'] = random_user_agent(allow_ie=False) kw['user_agent'] = random_user_agent(allow_ie=False)
br = BasicNewsRecipe.get_browser(self, *a, **kw) br = BasicNewsRecipe.get_browser(self, *a, **kw)
# self.wsj_itp_page = open('/t/raw.html').read() self.log('Starting login process...')
# return br res = br.open(start_url)
url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj' sso_url = res.geturl()
# br.set_debug_http(True) query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
br.open(url).read() query = {k:v[0] for k, v in query.items()}
rurl = 'https://id.wsj.com/auth/submitlogin.json' request_query = {
rq = Request(rurl, headers={
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.8',
'Content-Type': 'application/json',
'Referer': url,
'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest',
}, data=json.dumps({
'username': self.username, 'username': self.username,
'password': self.password, 'password': self.password,
'realm': 'default', 'client_id': query['client'],
'savelogin': 'true', 'sso': 'true',
'template': 'default', 'tenant': 'sso',
'url': quote(self.WSJ_ITP), '_intstate': 'deprecated',
})) }
r = br.open(rq) for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
if r.code != 200: request_query[k] = query[k]
raise ValueError('Failed to login, check username and password') login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
data = json.loads(r.read()) # you can get the version below from lib-min.js
# print(data) # search for: str: "x.x.x"
if data.get('result') != 'success': # This might need to be updated in the future
raise ValueError( auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
'Failed to login (XHR failed), check username and password') if not isinstance(auth0_client, bytes):
br.set_cookie('m', data['username'], '.wsj.com') auth0_client = auth0_client.encode('utf-8')
try: auth0_client = standard_b64encode(auth0_client)
r = br.open(data['url']) if isinstance(auth0_client, bytes):
except Exception: auth0_client = auth0_client.decode('ascii')
self.log.error('Failed to open login url: {}'.format(data['url'])) rq = Request(login_url, headers={
raise 'Accept': 'text/html',
self.wsj_itp_page = raw = r.read() 'Accept-Language': 'en-US,en;q=0.8',
'Auth0-Client': auth0_client.rstrip('='),
'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest',
'X-Remote-User': self.username
}, data=request_query)
self.log('Sending login request...')
res = br.open(rq)
if res.code != 200:
raise ValueError('Failed to login, check your username and password')
br.select_form(nr=0)
self.log('Performing login callback...')
res = br.submit()
self.wsj_itp_page = raw = res.read()
if b'>Sign Out<' not in raw: if b'>Sign Out<' not in raw:
raise ValueError( raise ValueError(
'Failed to login (auth URL failed), check username and password') 'Failed to login (callback URL failed), check username and password')
# open('/t/raw.html', 'w').write(raw)
return br return br
else: else:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):

View File

@ -5,10 +5,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
try: from base64 import standard_b64encode
from urllib.parse import quote
except ImportError:
from urllib import quote
from mechanize import Request from mechanize import Request
@ -16,6 +13,16 @@ from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select from css_selectors import Select
try:
import urllib.parse as urlparse
except ImportError:
import urlparse
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
needs_subscription = False needs_subscription = False
@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
remove_attributes = ['style', 'data-scrim'] remove_attributes = ['style', 'data-scrim']
needs_subscription = needs_subscription needs_subscription = needs_subscription
WSJ_ITP = 'https://online.wsj.com/itp/today' WSJ_ITP = 'https://www.wsj.com/print-edition/today'
keep_only_tags = [ keep_only_tags = [
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')), dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
@ -87,51 +94,56 @@ class WSJ(BasicNewsRecipe):
# login {{{ # login {{{
if needs_subscription: if needs_subscription:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):
# To understand the signin logic read signin.js from # To understand the login logic read app-min.js from
# https://id.wsj.com/access/pages/wsj/us/signin.html # https://sso.accounts.dowjones.com/login
# This is the same login service as used by Barrons itp = quote(self.WSJ_ITP, safe='')
start_url = 'https://accounts.wsj.com/login?target=' + itp
kw['user_agent'] = random_user_agent(allow_ie=False) kw['user_agent'] = random_user_agent(allow_ie=False)
br = BasicNewsRecipe.get_browser(self, *a, **kw) br = BasicNewsRecipe.get_browser(self, *a, **kw)
# self.wsj_itp_page = open('/t/raw.html').read() self.log('Starting login process...')
# return br res = br.open(start_url)
url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj' sso_url = res.geturl()
# br.set_debug_http(True) query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
br.open(url).read() query = {k:v[0] for k, v in query.items()}
rurl = 'https://id.wsj.com/auth/submitlogin.json' request_query = {
rq = Request(rurl, headers={
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.8',
'Content-Type': 'application/json',
'Referer': url,
'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest',
}, data=json.dumps({
'username': self.username, 'username': self.username,
'password': self.password, 'password': self.password,
'realm': 'default', 'client_id': query['client'],
'savelogin': 'true', 'sso': 'true',
'template': 'default', 'tenant': 'sso',
'url': quote(self.WSJ_ITP), '_intstate': 'deprecated',
})) }
r = br.open(rq) for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
if r.code != 200: request_query[k] = query[k]
raise ValueError('Failed to login, check username and password') login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
data = json.loads(r.read()) # you can get the version below from lib-min.js
# print(data) # search for: str: "x.x.x"
if data.get('result') != 'success': # This might need to be updated in the future
raise ValueError( auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
'Failed to login (XHR failed), check username and password') if not isinstance(auth0_client, bytes):
br.set_cookie('m', data['username'], '.wsj.com') auth0_client = auth0_client.encode('utf-8')
try: auth0_client = standard_b64encode(auth0_client)
r = br.open(data['url']) if isinstance(auth0_client, bytes):
except Exception: auth0_client = auth0_client.decode('ascii')
self.log.error('Failed to open login url: {}'.format(data['url'])) rq = Request(login_url, headers={
raise 'Accept': 'text/html',
self.wsj_itp_page = raw = r.read() 'Accept-Language': 'en-US,en;q=0.8',
'Auth0-Client': auth0_client.rstrip('='),
'X-HTTP-Method-Override': 'POST',
'X-Requested-With': 'XMLHttpRequest',
'X-Remote-User': self.username
}, data=request_query)
self.log('Sending login request...')
res = br.open(rq)
if res.code != 200:
raise ValueError('Failed to login, check your username and password')
br.select_form(nr=0)
self.log('Performing login callback...')
res = br.submit()
self.wsj_itp_page = raw = res.read()
if b'>Sign Out<' not in raw: if b'>Sign Out<' not in raw:
raise ValueError( raise ValueError(
'Failed to login (auth URL failed), check username and password') 'Failed to login (callback URL failed), check username and password')
# open('/t/raw.html', 'w').write(raw)
return br return br
else: else:
def get_browser(self, *a, **kw): def get_browser(self, *a, **kw):