#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import random
import time
from base64 import standard_b64encode
from datetime import date, timedelta
from mechanize import Request
from calibre.web.feeds.news import BasicNewsRecipe, classes
from css_selectors import Select

try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote

needs_subscription = True

def substring_classes(classes):
    q = frozenset(classes.split(' '))

    def matcher(x):
        if x:
            for candidate in frozenset(x.split()):
                for sub in q:
                    if sub in candidate:
                        return True
        return False
    return {'attrs': {'class': matcher}}
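
# Illustrative usage (class names hypothetical): substring_classes('bigTop strap')
# returns {'attrs': {'class': <matcher>}}, which can be splatted into a
# BeautifulSoup query, e.g. soup.findAll(**substring_classes('bigTop strap')),
# matching any element with a class that merely *contains* 'bigTop' or 'strap'.
# Substring matching matters because WSJ class names carry generated suffixes
# (see the WSJTheme--...-- selectors further down).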


class WSJ(BasicNewsRecipe):

    if needs_subscription:
        title = 'The Wall Street Journal'
    else:
        title = 'The Wall Street Journal (free)'
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 7
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'height', 'width']
    needs_subscription = needs_subscription
    WSJ_ITP = 'https://www.wsj.com/print-edition/today'

    extra_css = '''
        .imageCaption{font-size:small; text-align:center;}
        .sub-head{font-style:italic; color:#404040;}
        .bylineWrap{font-size:small; text-align:left;}
    '''

    keep_only_tags = [
        classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero'),
        dict(name='section', attrs={'subscriptions-section': 'content'}),
    ]
    remove_tags = [
        classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'),
        dict(name='amp-iframe'),  # interactive graphics
    ]
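    # classes() (imported from calibre.web.feeds.news above) builds a similar
    # {'attrs': {'class': matcher}} dict, but matches whole class names
    # exactly, unlike substring_classes() which matches substrings.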

    def preprocess_html(self, soup):
        for by in soup.findAll(**classes('bylineWrap')):
            for p in by.findAll('p'):
                p.name = 'span'
        for img in soup.findAll('amp-img'):
            img.name = 'img'
            if img['src'] == 'https://s.wsj.net/img/meta/wsj-social-share.png':
                img.extract()
        h2 = soup.find('h2', attrs={'class': 'sub-head'})
        if h2:
            h2.name = 'p'
        return soup
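    # AMP markup uses <amp-img> rather than <img>; renaming the tag lets
    # calibre's image pipeline download the pictures, while the generic
    # social-share banner image is removed outright.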

    def get_cover_url(self):
        from datetime import date
        cover = 'https://img.kiosko.net/' + date.today().strftime('%Y/%m/%d') + '/us/wsj.750.jpg'
        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
        try:
            br.open(cover)
        except Exception:
            index = 'https://en.kiosko.net/us/np/wsj.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', attrs={'src': lambda x: x and x.endswith('750.jpg')}):
                if image['src'].startswith('/'):
                    return 'https:' + image['src']
                return image['src']
            self.log("\nCover unavailable")
            cover = None
        return cover
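    # The direct cover URL follows the pattern
    # https://img.kiosko.net/YYYY/MM/DD/us/wsj.750.jpg (e.g. .../2023/05/25/...);
    # if that is not up yet, the kiosko.net index page is scraped for any
    # 750px cover image instead.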

    # login {{{

    def get_browser_for_wsj(self, *a, **kw):
        br = BasicNewsRecipe.get_browser(self, *a, **kw)
        br.set_cookie('wsjregion', 'na,us', '.wsj.com')
        br.set_cookie('gdprApplies', 'false', '.wsj.com')
        br.set_cookie('ccpaApplies', 'false', '.wsj.com')
        return br
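    # The cookies above pin the North America/US edition and pre-answer the
    # GDPR/CCPA consent checks, presumably so that consent interstitials do
    # not get in the way of fetching pages.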

    if needs_subscription:
        def get_browser(self, *a, **kw):
            from pprint import pprint
            pprint  # referenced so the import survives linting; handy when debugging
            # To understand the login logic read app-min.js from
            # https://sso.accounts.dowjones.com/login
            itp = quote(self.WSJ_ITP, safe='')
            start_url = 'https://accounts.wsj.com/login?target=' + itp
            self.log('Starting login process at', start_url)
            br = self.get_browser_for_wsj(*a, **kw)
            # br.set_debug_http(True)
            res = br.open(start_url)
            sso_url = res.geturl()
            query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
            query = {k: v[0] for k, v in query.items()}
            # pprint(query)
            request_query = {
                'username': self.username,
                'password': self.password,
                'client_id': query['client'],
                'tenant': 'sso',
                '_intstate': 'deprecated',
                'connection': 'DJldap',
                'headers': {
                    'X-REMOTE-USER': self.username,
                    'x-_dj-_client__id': query['client'],
                },
            }
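            # The fields above mirror what the SSO page's own JavaScript
            # submits; 'client' is recovered from the query string of the URL
            # the login page redirected us to.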
            for cookie in br.cookiejar:
                if cookie.name in ('_csrf', 'csrf'):
                    request_query['_csrf'] = cookie.value
            for k in 'scope connection nonce state ui_locales ns mars protocol redirect_uri'.split():
                if k in query:
                    request_query[k] = query[k]
            # pprint(request_query)
            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
            # you can get the version below from lib-min.js
            # search for: "\d+\.\d+\.\d+"
            # This might need to be updated in the future
            auth0_client = json.dumps({"name": "auth0.js-ulp", "version": "9.11.3"})
            if not isinstance(auth0_client, bytes):
                auth0_client = auth0_client.encode('utf-8')
            auth0_client = standard_b64encode(auth0_client)
            if isinstance(auth0_client, bytes):
                auth0_client = auth0_client.decode('ascii')
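            # The isinstance() checks above normalize across py2/py3:
            # json.dumps() may return str or bytes and standard_b64encode()
            # returns bytes, so the value always ends up a native str. The '='
            # padding is stripped below, presumably to match what the site's
            # own JavaScript sends.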
            rq = Request(login_url, headers={
                'Accept': 'text/html',
                'Accept-Language': 'en-US,en;q=0.8',
                'Origin': 'https://sso.accounts.dowjones.com',
                'Auth0-Client': auth0_client.rstrip('='),
                'X-HTTP-Method-Override': 'POST',
                'X-Requested-With': 'XMLHttpRequest',
                'X-Remote-User': self.username,
                'x-dj-client_id': request_query['client_id'],
            }, data=request_query)
            self.log('Sending login request...')
            try:
                res = br.open(rq)
            except Exception as err:
                if hasattr(err, 'read'):
                    raise Exception(
                        'Login request failed with error: {} and body: {}'.format(
                            err, err.read().decode('utf-8', 'replace')))
                raise
            if res.code != 200:
                raise ValueError('Failed to login, check your username and password')
            br.select_form(nr=0)
            self.log('Performing login callback...')
            res = br.submit()
            self.log('Print edition resolved url:', res.geturl())
            self.wsj_itp_page = raw = res.read()
            if b'/logout' not in raw:
                raise ValueError(
                    'Failed to login (callback URL failed), check username and password')
            return br
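        # Flow recap: (1) GET accounts.wsj.com/login and follow the redirect
        # to the Auth0 SSO page to pick up the client id, state, nonce, etc.;
        # (2) POST the credentials to /usernamepassword/login, which returns
        # a self-submitting HTML form; (3) submit that form (the callback),
        # after which the session cookies are valid and the print edition
        # page loads.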
    else:
        def get_browser(self, *a, **kw):
            br = self.get_browser_for_wsj(*a, **kw)
            res = br.open(self.WSJ_ITP)
            url = res.geturl()
            if '/20210913/' in url:
                today = date.today()
                q = today.isoformat().replace('-', '')
                try:
                    res = br.open(url.replace('/20210913/', '/' + q + '/'))
                except Exception:
                    today -= timedelta(days=1)
                    q = today.isoformat().replace('-', '')
                    res = br.open(url.replace('/20210913/', '/' + q + '/'))
            self.log('Print edition resolved url:', res.geturl())
            self.wsj_itp_page = res.read()
            return br
    # }}}
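
    # The free edition redirects to a dated archive URL in which '/20210913/'
    # appears to act as a fixed placeholder; it is swapped for today's date
    # in YYYYMMDD form (e.g. '/20230525/'), falling back to yesterday when
    # today's edition is not up yet.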

    def abs_wsj_url(self, href, modify_query=True):
        if not href.startswith('http'):
            href = 'https://www.wsj.com' + href
        if modify_query:
            href = href.replace('/articles/', '/amp/articles/')
        return href
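    # Example: abs_wsj_url('/articles/foo') -> 'https://www.wsj.com/amp/articles/foo'
    # Articles are fetched via their AMP URLs, presumably because AMP pages
    # are lighter and simpler to parse (hence the amp-img/amp-iframe handling
    # elsewhere in this recipe).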

    def wsj_find_articles(self, url, ahed=False):
        root = self.index_to_soup(url, as_tree=True)
        CSSSelect = Select(root)
        articles = []
        for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item--")]'):
            heading = next(CSSSelect('h2, h3', container))
            a = next(CSSSelect('a', heading))
            title = self.tag_to_string(a)
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description--")]'):
                q = self.tag_to_string(p)
                if 'Subscriber Content' in q:
                    continue
                desc += q
                break
            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})
            self.log('\tFound article:', title)
            self.log('\t\t', desc + ' ' + url)
            if self.test and len(articles) >= self.test[1]:
                break
        return articles
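    # The xpath uses contains(@class, 'WSJTheme--list-item--') rather than an
    # exact match because the site appends build-specific hash suffixes to
    # its class names (e.g. 'WSJTheme--list-item--3Kq1', suffix illustrative;
    # compare the hard-coded '.style--strap--ND8Cuaip' selector below).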

    def wsj_find_wn_articles(self, feeds, root, CSSSelect):
        articles = []
        for a in CSSSelect('.style--strap--ND8Cuaip'):
            if "WHAT'S NEWS" in self.tag_to_string(a).upper():
                whats_news = a.getparent()
                break
        else:
            self.log.error('Failed to find Whats News section')
            return articles
        for li in CSSSelect('li', whats_news):
            a = next(CSSSelect('a', li))
            if '/articles/' not in a.get('href', ''):
                continue
            title = self.tag_to_string(a).strip()
            url = self.abs_wsj_url(a.get('href'))
            desc = self.tag_to_string(li)
            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})
            self.log('\tFound WN article:', title)
            self.log('\t\t', desc + ' ' + url)
        return articles

    def wsj_add_feed(self, feeds, title, url):
        try:
            for i in range(5):
                articles = self.wsj_find_articles(url)
                if articles:
                    break
                pause = random.choice((1, 1.5, 2, 2.5))
                self.log.warn('No articles found in', url, 'retrying after', pause, 'seconds')
                time.sleep(pause)
        except Exception:
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))
        else:
            self.log.warn('No articles found in', url)
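    # Section pages apparently come back empty now and then, so each one is
    # retried up to five times with a short randomized pause (1-2.5 s)
    # between attempts before the section is given up on.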

    def parse_index(self):
        # return self.test_wsj_index()
        root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
        CSSSelect = Select(root)
        # from calibre.utils.ipython import ipython
        # ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
        for inp in CSSSelect('.DayPickerInput > input'):
            if inp.get('placeholder'):
                self.timefmt = inp.get('placeholder')
                break
        feeds = []
        for container in root.xpath('descendant::*[contains(@class, "WSJTheme--top-menu-item--")]'):
            for a in container.xpath('descendant::a[contains(@class, "WSJTheme--section-link--")]'):
                frontpage = a.get('href').endswith('frontpage')
                title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
                if not title:
                    continue
                url = self.abs_wsj_url(a.get('href'), modify_query=False)
                self.log('Found section:', title, 'at', url)
                self.wsj_add_feed(feeds, title, url)
                if frontpage:
                    articles = self.wsj_find_wn_articles(feeds, root, CSSSelect)
                    if articles:
                        feeds.append(("What's News", articles))
                if self.test and len(feeds) >= self.test[0]:
                    break
        return feeds
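    # self.test is set when calibre runs with --test; its two values cap the
    # number of feeds and the number of articles per feed respectively.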

    def test_wsj_index(self):
        return [
            ('Testing', [
                {'title': 'Subscriber Article',
                 'url': self.abs_wsj_url('https://www.wsj.com/articles/egg-prices-jump-as-bird-flu-hits-poultry-flocks-11648900800')},
            ]),
        ]
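

# To try this recipe from the command line (the usual calibre workflow):
#   ebook-convert wsj.recipe wsj.epub --test -vv
# The commented-out "return self.test_wsj_index()" at the top of
# parse_index() can be re-enabled to fetch just the single hard-coded
# article above.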