#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import json
from base64 import standard_b64encode

from mechanize import Request

from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select

try:
    import urllib.parse as urlparse
except ImportError:
    import urlparse

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote


needs_subscription = True


def classes(classes):
    # Build a BeautifulSoup attrs matcher that matches any of the given
    # space-separated class names
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class WSJ(BasicNewsRecipe):

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 7
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']
    needs_subscription = needs_subscription
    WSJ_ITP = 'https://www.wsj.com/print-edition/today'

    keep_only_tags = [
        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
        dict(name='span', itemprop='author', rel='author'),
        dict(name='article', id='article-contents articleBody'.split()),
        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
        dict(classes('nc-exp-artbody errorNotFound')),
        dict(attrs={'data-module-zone': 'article_snippet'}),
    ]

    remove_tags = [
        dict(id='right-rail'),
        dict(id='narrator-nav'),
        dict(name='div', id='ad_and_popular'),
        classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
                ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
        dict(name='span', attrs={
            'data-country-code': True, 'data-ticker-code': True}),
        dict(name='meta link'.split()),
    ]

    def preprocess_soup(self, soup):
        # Slideshow and expandable images need to be processed here to
        # set the src attribute correctly
        found = 0
        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
            img['src'] = img['data-in-base-data-lazy']
            found += 1
        for img in soup.findAll('img', attrs={'data-enlarge': True}):
            img['src'] = img['data-enlarge']
            found += 1
        if found:
            self.log.debug('Found %d dynamic images' % found)
        return soup

    def get_cover_url(self):
        index = 'http://en.kiosko.net/us/np/wsj.html'
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            if image['src'].endswith('750.jpg'):
                return image['src']
        self.log('\nCover unavailable')

    # login {{{
    if needs_subscription:
        def get_browser(self, *a, **kw):
            # To understand the login logic read app-min.js from
            # https://sso.accounts.dowjones.com/login
            itp = quote(self.WSJ_ITP, safe='')
            start_url = 'https://accounts.wsj.com/login?target=' + itp
            kw['user_agent'] = random_user_agent(allow_ie=False)
            br = BasicNewsRecipe.get_browser(self, *a, **kw)
            self.log('Starting login process...')
            res = br.open(start_url)
            sso_url = res.geturl()
            query = urlparse.parse_qs(urlparse.urlparse(sso_url).query)
            query = {k: v[0] for k, v in query.items()}
            request_query = {
                'username': self.username,
                'password': self.password,
                'client_id': query['client'],
                'sso': 'true',
                'tenant': 'sso',
                '_intstate': 'deprecated',
                'connection': 'DJldap',
            }
            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
                if k in query:
                    request_query[k] = query[k]
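            # The request below emulates the XHR that auth0.js makes when the
            # login form is submitted: a form-encoded POST to the
            # usernamepassword/login endpoint, identified by the Auth0-Client
            # header (the unpadded base64 of a small JSON client descriptor)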
            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
            # you can get the version below from lib-min.js
            # search for: str: "x.x.x"
            # This might need to be updated in the future
            auth0_client = json.dumps({'name': 'auth0.js', 'version': '7.0.3'})
            if not isinstance(auth0_client, bytes):
                auth0_client = auth0_client.encode('utf-8')
            auth0_client = standard_b64encode(auth0_client)
            if isinstance(auth0_client, bytes):
                auth0_client = auth0_client.decode('ascii')
            rq = Request(login_url, headers={
                'Accept': 'text/html',
                'Accept-Language': 'en-US,en;q=0.8',
                'Auth0-Client': auth0_client.rstrip('='),
                'X-HTTP-Method-Override': 'POST',
                'X-Requested-With': 'XMLHttpRequest',
                'X-Remote-User': self.username,
            }, data=request_query)
            self.log('Sending login request...')
            try:
                res = br.open(rq)
            except Exception as err:
                if hasattr(err, 'read'):
                    raise Exception(
                        'Login request failed with error: {} and body: {}'.format(
                            err, err.read().decode('utf-8', 'replace')))
                raise
            if res.code != 200:
                raise ValueError('Failed to login, check your username and password')
            br.select_form(nr=0)
            self.log('Performing login callback...')
            res = br.submit()
            self.wsj_itp_page = raw = res.read()
            if b'/logout' not in raw:
                raise ValueError(
                    'Failed to login (callback URL failed), check username and password')
            return br
    else:
        def get_browser(self, *a, **kw):
            kw['user_agent'] = random_user_agent(allow_ie=False)
            br = BasicNewsRecipe.get_browser(self, *a, **kw)
            self.wsj_itp_page = br.open(self.WSJ_ITP).read()
            return br
    # }}}

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'https://www.wsj.com' + href
        return href

    def wsj_find_articles(self, url, ahed=False):
        root = self.index_to_soup(url, as_tree=True)
        CSSSelect = Select(root)
        articles = []
        for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
            heading = next(CSSSelect('h2, h3', container))
            a = next(CSSSelect('a', heading))
            title = self.tag_to_string(a)
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
                q = self.tag_to_string(p)
                if 'Subscriber Content' in q:
                    continue
                desc += q
                break
            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})
            self.log('\tFound article:', title)
            self.log('\t\t', desc)
        return articles

    def wsj_find_wn_articles(self, feeds, root, CSSSelect):
        articles = []
        for a in CSSSelect('.style--strap--3DsLojSy'):
            if "WHAT'S NEWS" in self.tag_to_string(a).upper():
                whats_news = a.getparent()
                break
        else:
            self.log.error('Failed to find Whats News section')
            return articles
        for li in CSSSelect('li', whats_news):
            a = next(CSSSelect('a', li))
            if '/articles/' not in a.get('href', ''):
                continue
            title = self.tag_to_string(a).strip()
            url = self.abs_wsj_url(a.get('href'))
            desc = self.tag_to_string(li)
            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})
            self.log('\tFound WN article:', title)
            self.log('\t\t', desc)
        return articles

    def wsj_add_feed(self, feeds, title, url):
        try:
            articles = self.wsj_find_articles(url)
            if not articles:
                # retry once, sometimes these pages come up empty
                articles = self.wsj_find_articles(url)
        except Exception:
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))

    def parse_index(self):
        # return self.test_wsj_index()
        root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
        CSSSelect = Select(root)
        # from calibre.utils.ipython import ipython
        # ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
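        # The ITP page's date picker carries the issue date (presumably the
        # print edition date) in its placeholder attribute; use it verbatim
        # as the timestamp shown in the output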
        for inp in CSSSelect('.DayPickerInput > input'):
            if inp.get('placeholder'):
                self.timefmt = inp.get('placeholder')
                break

        feeds = []
        for a in CSSSelect('.WSJTheme--nav-container--sPVwT3Fi .WSJTheme--section-link--XGDsdx5q'):
            frontpage = a.get('href').endswith('frontpage')
            title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
            if not title:
                continue
            url = self.abs_wsj_url(a.get('href'))
            self.log('Found section:', title, 'at', url)
            self.wsj_add_feed(feeds, title, url)
            if frontpage:
                self.wsj_find_wn_articles(feeds, root, CSSSelect)
        return feeds

    def test_wsj_index(self):
        return [
            ('Testing', [
                {'title': 'Article One',
                 'url': 'https://www.wsj.com/articles/gms-plan-to-drop-chevy-cruze-hits-ohio-town-hard-1543314600'},  # noqa
            ]),
        ]
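# test_wsj_index() returns a fixed single-article index so the download
# machinery can be exercised without parsing the live ITP page; uncomment the
# "return self.test_wsj_index()" line at the top of parse_index() to use it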