#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import json

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote

from mechanize import Request

from calibre import random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select

needs_subscription = True


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class WSJ(BasicNewsRecipe):

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 7
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']
    needs_subscription = needs_subscription
    WSJ_ITP = 'https://online.wsj.com/itp/today'

    keep_only_tags = [
        dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
        dict(name='span', itemprop='author', rel='author'),
        dict(name='article', id='article-contents articleBody'.split()),
        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
        dict(classes('nc-exp-artbody errorNotFound')),
        dict(attrs={'data-module-zone': 'article_snippet'}),
    ]

    remove_tags = [
        dict(id='right-rail'),
        dict(id='narrator-nav'),
        dict(name='div', id='ad_and_popular'),
        classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
                ' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
        dict(name='span', attrs={
            'data-country-code': True, 'data-ticker-code': True}),
        dict(name='meta link'.split()),
    ]

    def preprocess_soup(self, soup):
        # Slideshow and expandable images need to be processed here to
        # set the src attribute correctly
        found = 0
        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
            img['src'] = img['data-in-base-data-lazy']
            found += 1
        for img in soup.findAll('img', attrs={'data-enlarge': True}):
            img['src'] = img['data-enlarge']
            found += 1
        if found:
            self.log.debug('Found %d dynamic images' % found)
        return soup

    def get_cover_url(self):
        index = 'http://en.kiosko.net/us/np/wsj.html'
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            if image['src'].endswith('750.jpg'):
                return image['src']
        self.log("\nCover unavailable")

    # login {{{

    if needs_subscription:
        def get_browser(self, *a, **kw):
            # To understand the signin logic read signin.js from
            # https://id.wsj.com/access/pages/wsj/us/signin.html
            # This is the same login service as used by Barrons
            kw['user_agent'] = random_user_agent(allow_ie=False)
            br = BasicNewsRecipe.get_browser(self, *a, **kw)
            # self.wsj_itp_page = open('/t/raw.html').read()
            # return br
            url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
            # br.set_debug_http(True)
            br.open(url).read()
            rurl = 'https://id.wsj.com/auth/submitlogin.json'
            rq = Request(rurl, headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Language': 'en-US,en;q=0.8',
                'Content-Type': 'application/json',
                'Referer': url,
                'X-HTTP-Method-Override': 'POST',
                'X-Requested-With': 'XMLHttpRequest',
            }, data=json.dumps({
                'username': self.username,
                'password': self.password,
                'realm': 'default',
                'savelogin': 'true',
                'template': 'default',
                'url': quote(self.WSJ_ITP),
            }))
            r = br.open(rq)
            if r.code != 200:
                raise ValueError('Failed to login, check username and password')
            data = json.loads(r.read())
            # print(data)
            if data.get('result') != 'success':
                raise ValueError(
                    'Failed to login (XHR failed), check username and password')
            br.set_cookie('m', data['username'], '.wsj.com')
            try:
                r = br.open(data['url'])
            except Exception:
                self.log.error('Failed to open login url: {}'.format(data['url']))
                raise
            self.wsj_itp_page = raw = r.read()
            if b'>Sign Out<' not in raw:
                raise ValueError(
                    'Failed to login (auth URL failed), check username and password')
            # open('/t/raw.html', 'w').write(raw)
            return br
    else:
        def get_browser(self, *a, **kw):
            kw['user_agent'] = random_user_agent(allow_ie=False)
            br = BasicNewsRecipe.get_browser(self, *a, **kw)
            self.wsj_itp_page = br.open(self.WSJ_ITP).read()
            return br
    # }}}

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'https://www.wsj.com' + href
        return href

    def wsj_find_articles(self, url, ahed=False):
        root = self.index_to_soup(url, as_tree=True)
        CSSSelect = Select(root)
        articles = []
        for container in root.xpath('descendant::div[contains(@class, "WSJTheme__list-item_")]'):
            heading = next(CSSSelect('h2, h3', container))
            a = next(CSSSelect('a', heading))
            title = self.tag_to_string(a)
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            for p in container.xpath('descendant::p[contains(@class, "WSJTheme__description_")]'):
                q = self.tag_to_string(p)
                if 'Subscriber Content' in q:
                    continue
                desc += q
                break

            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound article:', title)
            self.log('\t\t', desc)

        return articles

    def wsj_find_wn_articles(self, feeds, root, CSSSelect):
        articles = []
        for a in CSSSelect('.style__strap_2m6gCW_c_6WZKkU--eRUWv'):
            if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
                whats_news = a.getparent()
                break
        else:
            self.log.error("Failed to find What's News section")
            return
        for li in CSSSelect('li', whats_news):
            a = next(CSSSelect('a', li))
            if '/articles/' not in a.get('href', ''):
                continue
            title = self.tag_to_string(a).strip()
            url = self.abs_wsj_url(a.get('href'))
            desc = self.tag_to_string(li)
            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound WN article:', title)
            self.log('\t\t', desc)

        return articles

    def wsj_add_feed(self, feeds, title, url):
        self.log('Found section:', title, '[' + url + ']')
        try:
            articles = self.wsj_find_articles(url)
            if not articles:
                # retry once, sometimes these pages come up empty
                articles = self.wsj_find_articles(url)
        except Exception:
            self.log.exception('Failed to parse section:', title)
            articles = []
        if articles:
            feeds.append((title, articles))

    def parse_index(self):
        # return self.test_wsj_index()
        root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
        CSSSelect = Select(root)
        for inp in CSSSelect('.DayPickerInput > input'):
            if inp.get('placeholder'):
                self.timefmt = inp.get('placeholder')
                break

        feeds = []
        for a in CSSSelect('.WSJTheme__nav-container_sPVwT3FiPlWjFGtr5KH3d .WSJTheme__section-link_XGDsdx5qPlnC8BZPxQ63R'):
            frontpage = a.get('href').endswith('frontpage')
            title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
            if not title:
                continue
            url = self.abs_wsj_url(a.get('href'))
            self.wsj_add_feed(feeds, title, url)
            if frontpage:
                self.wsj_find_wn_articles(feeds, root, CSSSelect)
        return feeds

    def test_wsj_index(self):
        return [
            ('Testing', [
                {'title': 'Article One',
                 'url': 'https://www.wsj.com/articles/gms-plan-to-drop-chevy-cruze-hits-ohio-town-hard-1543314600'},  # noqa
            ]),
        ]
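
# A minimal sketch of how this recipe can be exercised from the command line,
# assuming calibre is installed: ebook-convert accepts a .recipe file directly
# as input, --test limits the fetch to a couple of feeds/articles, and the
# credential switches are needed because needs_subscription is True above.
# The file name "wsj.recipe" and the credentials shown are illustrative
# assumptions, not part of this recipe:
#
#   ebook-convert wsj.recipe output.epub --test -vv \
#       --username you@example.com --password secret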