WSJ recipe: make it more robust against bot protection

This is mostly only needed for the free version; with a login, the
bot protection doesn't kick in.
Kovid Goyal 2021-08-01 10:48:36 +05:30
parent 2ec75e07cf
commit c3848a989d
GPG Key ID: 06BC317B515ACE7C
2 changed files with 56 additions and 22 deletions

recipes/wsj.recipe

@@ -4,12 +4,11 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import json
+import json, time, random
 from base64 import standard_b64encode
 
 from mechanize import Request
 
-from calibre import random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select
@@ -34,7 +33,10 @@ def classes(classes):
 
 class WSJ(BasicNewsRecipe):
 
-    title = 'The Wall Street Journal'
+    if needs_subscription:
+        title = 'The Wall Street Journal'
+    else:
+        title = 'The Wall Street Journal (free)'
     __author__ = 'Kovid Goyal'
     description = 'News and current affairs'
     language = 'en'
@@ -92,15 +94,22 @@ class WSJ(BasicNewsRecipe):
             self.log("\nCover unavailable")
 
     # login {{{
+    def get_browser_for_wsj(self, *a, **kw):
+        br = BasicNewsRecipe.get_browser(self, *a, **kw)
+        br.set_cookie('wsjregion', 'na,us', '.wsj.com')
+        br.set_cookie('gdprApplies', 'false', '.wsj.com')
+        br.set_cookie('ccpaApplies', 'false', '.wsj.com')
+        return br
+
     if needs_subscription:
         def get_browser(self, *a, **kw):
             # To understand the login logic read app-min.js from
             # https://sso.accounts.dowjones.com/login
             itp = quote(self.WSJ_ITP, safe='')
             start_url = 'https://accounts.wsj.com/login?target=' + itp
-            kw['user_agent'] = random_user_agent(allow_ie=False)
-            br = BasicNewsRecipe.get_browser(self, *a, **kw)
             self.log('Starting login process...')
+            br = self.get_browser_for_wsj(*a, **kw)
             res = br.open(start_url)
             sso_url = res.geturl()
             query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
@@ -147,6 +156,7 @@ class WSJ(BasicNewsRecipe):
             br.select_form(nr=0)
             self.log('Performing login callback...')
             res = br.submit()
+            self.log('Print edition resolved url:', res.geturl())
             self.wsj_itp_page = raw = res.read()
             if b'/logout' not in raw:
                 raise ValueError(
@@ -154,9 +164,10 @@ class WSJ(BasicNewsRecipe):
             return br
     else:
         def get_browser(self, *a, **kw):
-            kw['user_agent'] = random_user_agent(allow_ie=False)
-            br = BasicNewsRecipe.get_browser(self, *a, **kw)
-            self.wsj_itp_page = br.open(self.WSJ_ITP).read()
+            br = self.get_browser_for_wsj(*a, **kw)
+            res = br.open(self.WSJ_ITP)
+            self.log('Print edition resolved url:', res.geturl())
+            self.wsj_itp_page = res.read()
             return br
 
     # }}}
@@ -217,15 +228,21 @@ class WSJ(BasicNewsRecipe):
     def wsj_add_feed(self, feeds, title, url):
         try:
-            articles = self.wsj_find_articles(url)
-            if not articles:
-                # retry once, sometimes these pages come up empty
-                articles = self.wsj_find_articles(url)
+            for i in range(5):
+                articles = self.wsj_find_articles(url)
+                if articles:
+                    break
+                else:
+                    pause = random.choice((1, 1.5, 2, 2.5))
+                    self.log.warn('No articles found in', url, 'retrying after', pause, 'seconds')
+                    time.sleep(pause)
         except Exception:
             self.log.exception('Failed to parse section:', title)
             articles = []
         if articles:
             feeds.append((title, articles))
+        else:
+            self.log.warn('No articles found in', url)
 
     def parse_index(self):
         # return self.test_wsj_index()
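
The diff above applies two hardening techniques: pre-seeding region and
consent cookies so wsj.com serves pages without geo/consent redirects,
and retrying empty section pages a bounded number of times with a
randomized pause. Pulled out of the recipe for clarity, a minimal
standalone sketch; requests is used here as a stand-in for calibre's
mechanize-based browser, and fetch_with_retry and the parse callback are
illustrative names, not recipe code:

import random
import time

import requests


def get_browser_for_wsj():
    # Pre-seed the cookies the recipe now sets, so wsj.com serves the
    # US edition without consent interstitials that break scraping.
    br = requests.Session()
    for name, value in (('wsjregion', 'na,us'),
                        ('gdprApplies', 'false'),
                        ('ccpaApplies', 'false')):
        br.cookies.set(name, value, domain='.wsj.com')
    return br


def fetch_with_retry(br, url, parse, attempts=5):
    # Mirrors the new wsj_add_feed loop: section pages sometimes come up
    # empty, so retry with a small random pause instead of a fixed
    # cadence that bot protection could key on.
    for _ in range(attempts):
        articles = parse(br.get(url).text)
        if articles:
            return articles
        pause = random.choice((1, 1.5, 2, 2.5))
        time.sleep(pause)
    return []

Worst case, a persistently empty section costs five fetches and up to
12.5 seconds of sleeping before the loop gives up and the warning is
logged.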

recipes/wsj_free.recipe

@@ -4,12 +4,11 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import json
+import json, time, random
 from base64 import standard_b64encode
 
 from mechanize import Request
 
-from calibre import random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select
@@ -34,7 +33,10 @@ def classes(classes):
 
 class WSJ(BasicNewsRecipe):
 
-    title = 'The Wall Street Journal (free)'
+    if needs_subscription:
+        title = 'The Wall Street Journal'
+    else:
+        title = 'The Wall Street Journal (free)'
     __author__ = 'Kovid Goyal'
     description = 'News and current affairs'
     language = 'en'
@@ -92,15 +94,22 @@ class WSJ(BasicNewsRecipe):
             self.log("\nCover unavailable")
 
     # login {{{
+    def get_browser_for_wsj(self, *a, **kw):
+        br = BasicNewsRecipe.get_browser(self, *a, **kw)
+        br.set_cookie('wsjregion', 'na,us', '.wsj.com')
+        br.set_cookie('gdprApplies', 'false', '.wsj.com')
+        br.set_cookie('ccpaApplies', 'false', '.wsj.com')
+        return br
+
     if needs_subscription:
         def get_browser(self, *a, **kw):
             # To understand the login logic read app-min.js from
             # https://sso.accounts.dowjones.com/login
             itp = quote(self.WSJ_ITP, safe='')
             start_url = 'https://accounts.wsj.com/login?target=' + itp
-            kw['user_agent'] = random_user_agent(allow_ie=False)
-            br = BasicNewsRecipe.get_browser(self, *a, **kw)
             self.log('Starting login process...')
+            br = self.get_browser_for_wsj(*a, **kw)
             res = br.open(start_url)
             sso_url = res.geturl()
             query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
@@ -147,6 +156,7 @@ class WSJ(BasicNewsRecipe):
             br.select_form(nr=0)
             self.log('Performing login callback...')
             res = br.submit()
+            self.log('Print edition resolved url:', res.geturl())
             self.wsj_itp_page = raw = res.read()
             if b'/logout' not in raw:
                 raise ValueError(
@@ -154,9 +164,10 @@ class WSJ(BasicNewsRecipe):
             return br
     else:
         def get_browser(self, *a, **kw):
-            kw['user_agent'] = random_user_agent(allow_ie=False)
-            br = BasicNewsRecipe.get_browser(self, *a, **kw)
-            self.wsj_itp_page = br.open(self.WSJ_ITP).read()
+            br = self.get_browser_for_wsj(*a, **kw)
+            res = br.open(self.WSJ_ITP)
+            self.log('Print edition resolved url:', res.geturl())
+            self.wsj_itp_page = res.read()
             return br
 
     # }}}
@@ -217,15 +228,21 @@ class WSJ(BasicNewsRecipe):
     def wsj_add_feed(self, feeds, title, url):
         try:
-            articles = self.wsj_find_articles(url)
-            if not articles:
-                # retry once, sometimes these pages come up empty
-                articles = self.wsj_find_articles(url)
+            for i in range(5):
+                articles = self.wsj_find_articles(url)
+                if articles:
+                    break
+                else:
+                    pause = random.choice((1, 1.5, 2, 2.5))
+                    self.log.warn('No articles found in', url, 'retrying after', pause, 'seconds')
+                    time.sleep(pause)
         except Exception:
             self.log.exception('Failed to parse section:', title)
             articles = []
         if articles:
             feeds.append((title, articles))
+        else:
+            self.log.warn('No articles found in', url)
 
     def parse_index(self):
         # return self.test_wsj_index()
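
To try a change like this locally, calibre recipes are usually run in
test mode; assuming ebook-convert from a calibre install is on PATH,
something like

ebook-convert 'The Wall Street Journal (free).recipe' wsj_free.epub --test -vv

downloads only a couple of articles per section, which is enough to see
the new 'Print edition resolved url:' line and any retry warnings in the
log.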