calibre/recipes/barrons.recipe

#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

import json
from mechanize import Request
from urllib import quote

from calibre.web.feeds.news import BasicNewsRecipe

# User-Agent string passed as ``user_agent`` to BasicNewsRecipe.get_browser()
# by Barrons.get_browser() in this file.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'


class Barrons(BasicNewsRecipe):
    """Recipe that builds an issue of Barron's from the site's RSS feeds.

    The recipe declares ``needs_subscription``; ``get_browser`` signs in
    with ``self.username`` / ``self.password`` before it returns the
    browser object.
    """

    title = 'Barron\'s'
    max_articles_per_feed = 50
    needs_subscription = True
    language = 'en'

    __author__ = 'Kovid Goyal'
    description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
    timefmt = ' [%a, %b %d, %Y]'
    use_embedded_content = False
    no_stylesheets = True
    # Raw string so that ``\?`` is an explicit regex escape instead of relying
    # on Python passing an unknown string escape through unchanged.
    match_regexps = [r'http://online.barrons.com/.*?html\?mod=.*?|file:.*']
    conversion_options = {'linearize_tables': True}

    # Don't grab articles more than 7 days old
    oldest_article = 7
    requires_version = (0, 9, 16)

    # Keep only elements whose class attribute starts with one of the two
    # "sector ... column" prefixes.
    keep_only_tags = [dict(attrs={'class': lambda x: x and x.startswith(
        ('sector one column', 'sector two column'))})]
    remove_tags = [
        dict(name='div', attrs={'class': [
             'sTools sTools-t', 'tabContainer artTabbedNav', 'rssToolBox hidden', 'articleToolbox']}),
        dict(attrs={'class': ['insetButton', 'insettipBox', 'insetClose']}),
        dict(attrs={'data-module-name': [
             'resp.module.trendingNow.BarronsDesktop', 'resp.module.share_tools.ShareTools']}),
        dict(name='span', attrs={
             'data-country-code': True, 'data-ticker-code': True}),
    ]

    def get_browser(self):
        """Return a browser object after signing in to barrons.com.

        Raises:
            ValueError: if the login request does not answer with HTTP 200,
                or if the JSON reply's ``result`` field is not ``success``.
        """
        # To understand the signin logic read signin.js from
        # https://id.barrons.com/access/pages/barrons/us/login_standalone.html?mg=com-barrons
        # This is the same login service as used by WSJ
        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)
        url = 'https://id.barrons.com/access/pages/barrons/us/login_standalone.html?mg=com-barrons'
        # Uncomment for HTTP level debugging:
        # br.set_debug_http(True)
        # Load the login page first; its URL is also sent as the Referer of
        # the login request below.
        br.open(url).read()
        rurl = 'https://id.barrons.com/auth/submitlogin.json'
        rq = Request(rurl, headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'en-US,en;q=0.8',
            'Content-Type': 'application/json',
            'Referer': url,
            'X-HTTP-Method-Override': 'POST',
            'X-Requested-With': 'XMLHttpRequest',
        }, data=json.dumps({
            'username': self.username,
            'password': self.password,
            'realm': 'default',
            'savelogin': 'true',
            'template': 'default',
            'url': quote('http://online.barrons.com'),
        }))
        r = br.open(rq)
        if r.code != 200:
            raise ValueError('Failed to login, check username and password')
        data = json.loads(r.read())
        if data.get('result') != 'success':
            raise ValueError(
                'Failed to login (XHR failed), check username and password')
        br.set_cookie('m', data['username'], '.barrons.com')
        # Follow the URL handed back by the login service. The page body is
        # read but not inspected, so a failure at this step goes undetected.
        br.open(data['url']).read()
        return br

    def print_version(self, url):
        """Return ``url`` without its query string, with '#text.print' appended.

        Everything from the last '?' onwards is dropped. A URL that contains
        no '?' is kept whole: ``rpartition`` puts the entire string in the
        *last* element when the separator is absent, so using the first
        element unconditionally would discard the URL.
        """
        main, sep, _ = url.rpartition('?')
        if not sep:
            main = url
        return main + '#text.print'

    def preprocess_html(self, soup):
        """Strip the thumbnail image from zoomable image boxes.

        For every <div> whose class list contains ``insetZoomTargetBox`` the
        first <img> found inside it is removed. Returns the same ``soup``.
        """
        # Remove thumbnail for zoomable images
        for div in soup.findAll('div', attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()

        return soup

    # Comment out the feeds you don't want retrieved.
    # Because these feeds are sorted alphabetically when converted to LRF, you
    # may want to number them to put them in the order you desire

    feeds = [
        ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
        ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
        ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
        ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
        ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
        ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's ``link`` value, or None when it has none."""
        return article.get('link')