From 81f8814f751d045581b08521ff717499b6a7b963 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 17 May 2016 09:41:17 +0530 Subject: [PATCH] Update Barrons Gets rid of the use of javascript_login since that no longer works in modern calibre --- recipes/barrons.recipe | 77 ++++++++++++++++++++++++++++++------------ recipes/wsj.recipe | 1 + 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 06c8e500e4..22bf676b77 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -1,5 +1,18 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2016, Kovid Goyal + +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +import json +from mechanize import Request +from urllib import quote + from calibre.web.feeds.news import BasicNewsRecipe +USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0' + class Barrons(BasicNewsRecipe): title = 'Barron\'s' @@ -14,11 +27,9 @@ class Barrons(BasicNewsRecipe): no_stylesheets = True match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*'] conversion_options = {'linearize_tables': True} - ##delay = 1 # Don't grab articles more than 7 days old oldest_article = 7 - use_javascript_to_login = True requires_version = (0, 9, 16) keep_only_tags = [dict(attrs={'class':lambda x: x and (x.startswith('sector one column') or x.startswith('sector two column'))})] @@ -29,12 +40,46 @@ class Barrons(BasicNewsRecipe): dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}), ] - def javascript_login(self, br, username, password): - br.visit('http://commerce.barrons.com/auth/login') - f = br.select_form(nr=0) - f['username'] = username - f['password'] = password - br.submit(timeout=120) + def get_browser(self): + # To understand the signin logic read signin.js from + # https://id.barrons.com/access/pages/barrons/us/login_standalone.html?mg=com-barrons + # 
This is the same login service as used by WSJ + br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT) + url = 'https://id.barrons.com/access/pages/barrons/us/login_standalone.html?mg=com-barrons' + # br.set_debug_http(True) + br.open(url).read() + rurl = 'https://id.barrons.com/auth/submitlogin.json' + rq = Request(rurl, headers={ + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'Accept-Language': 'en-US,en;q=0.8', + 'Content-Type': 'application/json', + 'Referer': url, + 'X-HTTP-Method-Override': 'POST', + 'X-Requested-With': 'XMLHttpRequest', + }, data=json.dumps({ + 'username': self.username, + 'password': self.password, + 'realm': 'default', + 'savelogin': 'true', + 'template': 'default', + 'url': quote('http://online.barrons.com'), + })) + r = br.open(rq) + if r.code != 200: + raise ValueError('Failed to login, check username and password') + data = json.loads(r.read()) + # from pprint import pprint + # pprint(data) + if data.get('result') != 'success': + raise ValueError( + 'Failed to login (XHR failed), check username and password') + br.set_cookie('m', data['username'], '.barrons.com') + raw = br.open(data['url']).read() + # open('/t/raw.html', 'wb').write(raw) + if b'>Logout<' not in raw: + raise ValueError( + 'Failed to login (auth URL failed), check username and password') + return br # Use the print version of a page when available. def print_version(self, url): @@ -53,26 +98,14 @@ class Barrons(BasicNewsRecipe): # Comment out the feeds you don't want retrieved. 
# Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire - def get_feeds(self): - return [ + feeds = [ ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'), ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'), ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'), ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'), ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'), ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'), - ] + ] def get_article_url(self, article): return article.get('link', None) - - def get_cover_url(self): - cover_url = None - index = 'http://online.barrons.com/home-page' - soup = self.index_to_soup(index) - link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'}) - if link_item: - cover_url = link_item.img['src'] - return cover_url - - diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index 29a4d4f4d6..755e5d85bc 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -93,6 +93,7 @@ class WSJ(BasicNewsRecipe): def get_browser(self): # To understand the signin logic read signin.js from # https://id.wsj.com/access/pages/wsj/us/signin.html + # This is the same login service as used by Barrons br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT) # self.wsj_itp_page = open('/t/raw.html').read() # return br