Update Wall Street Journal

No longer uses Qt WebKit

Commit 4c66829e6a (parent 5a19ad8eaa) by Kovid Goyal, 2016-04-25 12:19:19 +05:30
2 changed files with 165 additions and 96 deletions
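The port drops the Qt WebKit machinery entirely: JavascriptRecipe and its jsbrowser become BasicNewsRecipe, the interactive do_login() form fill becomes a direct JSON POST to id.wsj.com inside get_browser(), keep_only_tags/remove_tags switch from CSS selector strings to BeautifulSoup matcher dicts, and lazy-loaded images are fixed up in preprocess_soup() by copying data-in-base-data-lazy/data-enlarge into src. For reference, a minimal standalone sketch of the new signin flow using mechanize directly, outside calibre; it assumes the 2016-era id.wsj.com endpoints shown in the diff, and USERNAME/PASSWORD are placeholders:

# Standalone sketch of the recipe's signin flow (2016-era endpoints,
# placeholder credentials); mirrors get_browser() in the diff below.
import json
import mechanize
from urllib import quote

USERNAME, PASSWORD = 'user@example.com', 'secret'  # placeholders
WSJ_ITP = 'http://online.wsj.com/itp/today'
UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'

br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-Agent', UA)]

signin = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
br.open(signin).read()  # establishes the session cookies the XHR expects

rq = mechanize.Request('https://id.wsj.com/auth/submitlogin.json', headers={
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    'Referer': signin,
    'X-HTTP-Method-Override': 'POST',
    'X-Requested-With': 'XMLHttpRequest',
}, data=json.dumps({
    'username': USERNAME, 'password': PASSWORD, 'realm': 'default',
    'savelogin': 'true', 'template': 'default', 'url': quote(WSJ_ITP),
}))
response = json.loads(br.open(rq).read())
print(response.get('result'))  # 'success' on a valid login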

File 1 of 2: the subscription recipe (needs_subscription = True)

@@ -1,5 +1,17 @@
-from calibre.web.feeds.jsnews import JavascriptRecipe
-from calibre.web.jsbrowser.browser import NotAFile
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+import json
+from mechanize import Request
+from urllib import quote
+
+import html5lib
+from lxml import html
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
 
 def CSSSelect(expr):
     expr = {
@@ -17,8 +29,15 @@ def CSSSelect(expr):
     from lxml.etree import XPath
     return XPath(expr)
 
-class WSJ(JavascriptRecipe):
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
+
+class WSJ(BasicNewsRecipe):
 
     title = 'The Wall Street Journal'
     __author__ = 'Kovid Goyal'
@@ -27,53 +46,91 @@ class WSJ(JavascriptRecipe):
     compress_news_images = True
     compress_news_images_auto_size = 7
+    max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
     needs_subscription = True
+    WSJ_ITP = 'http://online.wsj.com/itp/today'
 
-    keep_only_tags = (
-        'h1',  # 'h2.subhead', 'h2.subHed.deck',
-        'span[itemprop=author][rel=author]',
-        'article#article-contents', 'article#articleBody',
-        'div#article_story_body',
-        # Parallax formatting
-        'div#ncTitleArea', 'section.nc-exp-artbody',
-        # Error conditions, login required and page not found
-        'div#snippet-ad-login', 'div.errorNotFound',
-    )
+    keep_only_tags = [
+        dict(classes('wsj-article-headline-wrap article_header')),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='article-contents articleBody'.split()),
+        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
+        dict(classes('nc-exp-artbody errorNotFound')),
+        dict(attrs={'data-module-zone': 'article_snippet'}),
+    ]
 
-    remove_tags = (
-        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
-        '.article_tools', 'span[data-country-code][data-ticker-code]',
-        'div.nc-exp-artmeta',
-    )
+    remove_tags = [
+        classes('insetButton insettipBox author-info media-object-video article_tools nc-exp-artmeta category'),
+        dict(name='span', attrs={
+            'data-country-code': True, 'data-ticker-code': True}),
+        dict(name='meta link'.split()),
+    ]
 
-    def do_login(self, br, username, password):
-        br.visit(
-            'https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)  # noqa
-        f = br.select_form(nr=0)
-        f['username'] = username
-        f['password'] = password
-        br.submit(timeout=120)
+    def preprocess_raw_html(self, raw_html, url):
+        root = html5lib.parse(raw_html, treebuilder='lxml', namespaceHTMLElements=False)
+        raw_html = html.tostring(root)
+        # open('/t/art.html', 'w').write(raw_html)
+        return raw_html
 
-    def preprocess_stage2(self, article, browser, url, recursion_level):
+    def preprocess_soup(self, soup):
         # Slideshow and expandable images need to be processed here to
         # set the src attribute correctly
         found = 0
-        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
-            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
+            img['src'] = img['data-in-base-data-lazy']
             found += 1
-        for img in browser.css_select('img[data-enlarge]', all=True):
-            img.setAttribute('src', img.attribute('data-enlarge'))
+        for img in soup.findAll('img', attrs={'data-enlarge': True}):
+            img['src'] = img['data-enlarge']
             found += 1
         if found:
-            self.log.debug('Found %d dynamic images in:' % found, url)
+            self.log.debug('Found %d dynamic images in:' % found)
+        return soup
 
-    def get_publication_data(self, browser):
-        return self.get_wsj_index(browser)
+    def get_browser(self):
+        # To understand the signin logic read signin.js from
+        # https://id.wsj.com/access/pages/wsj/us/signin.html
+        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
+        # self.wsj_itp_page = open('/t/raw.html').read()
+        # return br
+        url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
+        # br.set_debug_http(True)
+        br.open(url).read()
+        rurl = 'https://id.wsj.com/auth/submitlogin.json'
+        rq = Request(rurl, headers={
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'en-US,en;q=0.8',
+            'Content-Type': 'application/json',
+            'Referer': url,
+            'X-HTTP-Method-Override': 'POST',
+            'X-Requested-With': 'XMLHttpRequest',
+        }, data=json.dumps({
+            'username': self.username,
+            'password': self.password,
+            'realm': 'default',
+            'savelogin': 'true',
+            'template': 'default',
+            'url': quote(self.WSJ_ITP),
+        }))
+        r = br.open(rq)
+        if r.code != 200:
+            raise ValueError('Failed to login, check username and password')
+        data = json.loads(r.read())
+        # print(data)
+        if data.get('result') != 'success':
+            raise ValueError(
+                'Failed to login (XHR failed), check username and password')
+        br.set_cookie('m', data['username'], '.wsj.com')
+        r = br.open(data['url'])
+        self.wsj_itp_page = raw = r.read()
+        if b'>Sign Out<' not in raw:
+            raise ValueError(
+                'Failed to login (auth URL failed), check username and password')
+        # open('/t/raw.html', 'w').write(raw)
+        return br
 
     def abs_wsj_url(self, href):
         if not href.startswith('http'):
@@ -81,7 +138,7 @@ class WSJ(JavascriptRecipe):
         return href
 
     def wsj_find_articles(self, url, ahed=False):
-        root = self.index_to_soup(url)
+        root = self.index_to_soup(url, as_tree=True)
 
         for x in CSSSelect('div.whatsNews-simple')(root):
             x.getparent().remove(x)
@@ -128,7 +185,7 @@ class WSJ(JavascriptRecipe):
         return articles
 
     def wsj_find_wn_articles(self, url):
-        root = self.index_to_soup(url)
+        root = self.index_to_soup(url, as_tree=True)
         articles = []
 
         whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
@@ -165,23 +222,18 @@ class WSJ(JavascriptRecipe):
         if articles:
             feeds.append((title, articles))
 
-    def get_wsj_index(self, browser):
+    def parse_index(self):
         # return self.test_wsj_index()
-        ans = {}
-        root = self.index_to_soup('http://online.wsj.com/itp/today')
+        root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
         for span in CSSSelect('span.date-date')(root):
             if span.text:
                 self.timefmt = span.text
                 break
 
         for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
-            href = a.get('href')
-            try:
-                ans['cover'] = browser.download_file(href)
-            except NotAFile:
-                break
+            self.cover_url = a.get('href')
             break
 
-        feeds = ans['index'] = []
+        feeds = []
         for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
             if '/itp/' not in a.get('href', ''):
                 continue
@@ -197,10 +249,10 @@ class WSJ(JavascriptRecipe):
             title = self.tag_to_string(a)
             url = self.abs_wsj_url(a.get('href'))
             self.wsj_add_feed(feeds, title, url)
-        return ans
+        return feeds
 
     def test_wsj_index(self):
-        return {'index': [
+        return [
             ('Testing', [
                 {'title': 'Article One',
                  'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
@@ -209,4 +261,4 @@ class WSJ(JavascriptRecipe):
                 {'title': 'Article Three',
                  'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
             ]),
-        ]}
+        ]
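Both recipes gain the same classes() helper because BasicNewsRecipe's keep_only_tags/remove_tags take BeautifulSoup matcher dicts rather than the CSS selector strings JavascriptRecipe accepted. The helper builds a matcher that fires whenever a tag shares at least one class with a space-separated list; a small sketch of the matcher logic on plain strings (no BeautifulSoup required):

from __future__ import print_function

def classes(classes):
    # Match any tag whose class attribute shares at least one name
    # with the given space-separated list.
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

match = classes('wsj-article-headline-wrap article_header')['attrs']['class']
print(bool(match('article_header big-top')))  # True: shares article_header
print(bool(match('inset-box')))               # False: no class in common
print(bool(match('')))                        # False: empty class attribute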

File 2 of 2: the free recipe (old title: 'Wall Street Journal (free)')

@@ -1,5 +1,14 @@
-from calibre.web.feeds.jsnews import JavascriptRecipe
-from calibre.web.jsbrowser.browser import NotAFile
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+import html5lib
+from lxml import html
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
 
 def CSSSelect(expr):
     expr = {
@@ -17,58 +26,70 @@ def CSSSelect(expr):
     from lxml.etree import XPath
     return XPath(expr)
 
-class WSJ(JavascriptRecipe):
-    title = 'Wall Street Journal (free)'
+
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
+
+class WSJ(BasicNewsRecipe):
+
+    title = 'The Wall Street Journal'
     __author__ = 'Kovid Goyal'
-    description = '''News and current affairs. This recipe only fetches complete
-    versions of the articles that are available free on the wsj.com website.
-    To get the rest of the articles, subscribe to the WSJ and use the other WSJ
-    recipe.'''
+    description = 'News and current affairs'
     language = 'en'
 
     compress_news_images = True
-    compress_news_images_auto_size = 5
+    compress_news_images_auto_size = 7
+    max_articles_per_feed = 1000
     timefmt = ' [%a, %b %d, %Y]'
     no_stylesheets = True
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
+    WSJ_ITP = 'http://online.wsj.com/itp/today'
 
-    keep_only_tags = (
-        'h1',  # 'h2.subhead', 'h2.subHed.deck',
-        'span[itemprop=author][rel=author]',
-        'article#article-contents', 'article#articleBody',
-        'div#article_story_body', 'header.article_header',
-        # Parallax formatting
-        'div#ncTitleArea', 'section.nc-exp-artbody',
-        # Error conditions, login required and page not found
-        'div#snippet-ad-login', 'div.wsj-snippet-body', 'div.wsj-snippet-login', 'div.errorNotFound',
-    )
+    keep_only_tags = [
+        dict(classes('wsj-article-headline-wrap article_header')),
+        dict(name='span', itemprop='author', rel='author'),
+        dict(name='article', id='article-contents articleBody'.split()),
+        dict(name='div', id='article_story_body ncTitleArea snipper-ad-login'.split()),
+        dict(classes('nc-exp-artbody errorNotFound')),
+        dict(attrs={'data-module-zone': 'article_snippet'}),
+    ]
 
-    remove_tags = (
-        '.insetButton', '.insettipBox', '.author-info', '.media-object-video',
-        '.article_tools', 'span[data-country-code][data-ticker-code]',
-        'div.nc-exp-artmeta',
-    )
+    remove_tags = [
+        classes('insetButton insettipBox author-info media-object-video article_tools nc-exp-artmeta category'),
+        dict(name='span', attrs={
+            'data-country-code': True, 'data-ticker-code': True}),
+        dict(name='meta link'.split()),
+    ]
 
-    def preprocess_stage2(self, article, browser, url, recursion_level):
+    def preprocess_raw_html(self, raw_html, url):
+        root = html5lib.parse(raw_html, treebuilder='lxml', namespaceHTMLElements=False)
+        raw_html = html.tostring(root)
+        # open('/t/art.html', 'w').write(raw_html)
+        return raw_html
+
+    def preprocess_soup(self, soup):
         # Slideshow and expandable images need to be processed here to
         # set the src attribute correctly
         found = 0
-        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
-            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
+        for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
+            img['src'] = img['data-in-base-data-lazy']
             found += 1
-        for img in browser.css_select('img[data-enlarge]', all=True):
-            img.setAttribute('src', img.attribute('data-enlarge'))
+        for img in soup.findAll('img', attrs={'data-enlarge': True}):
+            img['src'] = img['data-enlarge']
             found += 1
         if found:
-            self.log.debug('Found %d dynamic images in:' % found, url)
+            self.log.debug('Found %d dynamic images in:' % found)
+        return soup
 
-    def get_publication_data(self, browser):
-        return self.get_wsj_index(browser)
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
+        self.wsj_itp_page = br.open(self.WSJ_ITP).read()
+        return br
 
     def abs_wsj_url(self, href):
         if not href.startswith('http'):
@@ -76,7 +97,7 @@ class WSJ(JavascriptRecipe):
         return href
 
     def wsj_find_articles(self, url, ahed=False):
-        root = self.index_to_soup(url)
+        root = self.index_to_soup(url, as_tree=True)
 
         for x in CSSSelect('div.whatsNews-simple')(root):
             x.getparent().remove(x)
@@ -106,6 +127,7 @@ class WSJ(JavascriptRecipe):
             self.log('\tFound article:', title)
             self.log('\t\t', desc)
+
         if ahed:
             for h2 in root.xpath('//li[@class="ahed_listitem"]/h2'):
                 a = h2.xpath('descendant::a')[0]
@@ -122,7 +144,7 @@ class WSJ(JavascriptRecipe):
         return articles
 
     def wsj_find_wn_articles(self, url):
-        root = self.index_to_soup(url)
+        root = self.index_to_soup(url, as_tree=True)
         articles = []
 
         whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
@@ -159,23 +181,18 @@ class WSJ(JavascriptRecipe):
         if articles:
             feeds.append((title, articles))
 
-    def get_wsj_index(self, browser):
+    def parse_index(self):
         # return self.test_wsj_index()
-        ans = {}
-        root = self.index_to_soup('http://online.wsj.com/itp/today')
+        root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
         for span in CSSSelect('span.date-date')(root):
             if span.text:
                 self.timefmt = span.text
                 break
 
         for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
-            href = a.get('href')
-            try:
-                ans['cover'] = browser.download_file(href)
-            except NotAFile:
-                break
+            self.cover_url = a.get('href')
             break
 
-        feeds = ans['index'] = []
+        feeds = []
         for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
             if '/itp/' not in a.get('href', ''):
                 continue
@@ -191,10 +208,10 @@ class WSJ(JavascriptRecipe):
             title = self.tag_to_string(a)
             url = self.abs_wsj_url(a.get('href'))
             self.wsj_add_feed(feeds, title, url)
-        return ans
+        return feeds
 
     def test_wsj_index(self):
-        return {'index': [
+        return [
             ('Testing', [
                 {'title': 'Article One',
                  'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
@@ -203,4 +220,4 @@ class WSJ(JavascriptRecipe):
                 {'title': 'Article Three',
                  'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
             ]),
-        ]}
+        ]
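Both recipes also gain a preprocess_raw_html() pass because, without a WebKit engine doing the parsing, WSJ's tag soup has to be normalized before any keep_only_tags/remove_tags matching runs: the raw page is round-tripped through html5lib for browser-grade error recovery. A minimal sketch of that round trip, using the same calls as the diff on a toy document:

# Round-trip malformed markup through html5lib, as preprocess_raw_html()
# does in both recipes, to normalize it before further processing.
import html5lib
from lxml import html

raw = '<p>Unclosed <b>markup <img src=x data-enlarge=big.jpg'
root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
print(html.tostring(root))
# html5lib closes the dangling tags and quotes the attributes, so the
# downstream soup sees well-formed HTML.

To exercise either recipe end to end, the usual calibre recipe workflow applies: ebook-convert <recipe file> .epub --test -vv, adding --username and --password for the subscription version.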