#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal

from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select
from mechanize import Request
from urllib import urlencode
import json


class HBR(BasicNewsRecipe):
    """Recipe for the current issue of Harvard Business Review.

    Logs in with the user's credentials, locates the current issue on the
    magazine page and builds the feed list from that issue's table of
    contents.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    # Root of the site; every relative link found while scraping is resolved
    # against it.
    hbr_base_url = 'https://hbr.org'

    keep_only_tags = [
        dict(attrs={'class': ['article-hed', 'byline']}),
        dict(attrs={'class': lambda x: x and 'article' in x.split()}),
    ]
    remove_tags = [
        dict(name='personalization-placement'),
    ]

    def hbr_absolute_url(self, url):
        """Return *url* as an absolute URL on the HBR site.

        URLs that already carry a scheme are returned unchanged and
        protocol-relative URLs (``//host/path``) get ``https:`` prepended.
        Anything else is appended to ``hbr_base_url``.
        """
        if url.startswith(('http://', 'https://')):
            return url
        if url.startswith('//'):
            return 'https:' + url
        return self.hbr_base_url + url

    def get_browser(self):
        """Return a browser that has been authenticated against hbr.org.

        Raises:
            ValueError: if the server does not report a successful login.
        """
        br = BasicNewsRecipe.get_browser(self)
        # br.set_debug_http(True)
        # The sign-in page is opened before posting credentials
        # (presumably to pick up session cookies -- TODO confirm).
        br.open(self.hbr_base_url + '/sign-in')
        rq = Request(
            self.hbr_base_url + '/authenticate',
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': 'https://hbr.org/sign-in',
                'Origin': 'https://hbr.org',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
            },
            data=urlencode(
                {'username': self.username, 'password': self.password}))
        raw = br.open(rq).read()
        data = json.loads(raw)
        # Use .get() so that an unexpected payload is reported as a login
        # failure instead of surfacing as a KeyError.
        if (data.get('code') != 200 or
                data.get('message') != 'Authentication Successful.'):
            raise ValueError('Failed to log in check username/password')
        return br

    def hbr_parse_toc(self, url):
        """Parse the table of contents of the issue at *url*.

        Returns:
            A list of ``(section_title, articles)`` tuples where each article
            is a dict with the keys ``title``, ``url`` and ``description``.
            Articles that appear before the first section heading are filed
            under ``'Unknown'``.

        Raises:
            ValueError: if the page contains no table of contents.
        """
        root = self.index_to_soup(url, as_tree=True)
        select = Select(root)
        toc = next(
            select('stream-content[data-stream-name="table-of-contents"] ul'),
            None)
        if toc is None:
            raise ValueError(
                'Could not find the table of contents at: %s' % url)

        section = 'Unknown'
        articles = []
        feeds = []
        # h4 elements are section headings, stream-item elements are the
        # articles that belong to the most recent heading.
        for x in toc.xpath(
                'descendant::*[local-name()="h4" or local-name()="stream-item"]'):
            if 'h4' in x.tag:
                # A new heading closes the previous section, if it had any
                # articles.
                if articles:
                    feeds.append((section, articles))
                section = self.tag_to_string(x)
                articles = []
                self.log('Found section:', section)
                continue

            title, article_url = x.get('data-title'), x.get('data-url')
            if not article_url:
                # Without a URL there is nothing to download.
                self.log('\tSkipping article without a URL:', title)
                continue
            # The description is assembled from the tail text of the item's
            # child elements.
            desc = ''.join(c.tail or '' for c in x).strip()
            authors = x.get('data-authors')
            if authors:
                desc = 'by ' + authors + ': ' + desc
            self.log('\tFound article:', title, article_url, desc)
            articles.append({
                'title': title,
                'url': self.hbr_absolute_url(article_url),
                'description': desc,
            })

        if articles:
            feeds.append((section, articles))
        return feeds

    def parse_index(self):
        """Locate the current issue and return its parsed table of contents.

        Also sets ``cover_url`` from the issue's cover image and ``timefmt``
        from that image's ``alt`` text when they are present.

        Raises:
            ValueError: if the current issue cannot be found on the magazine
                page.
        """
        soup = self.index_to_soup(self.hbr_base_url + '/magazine')
        fig = soup.find('figure', attrs={
            'class': lambda x: x and 'current-issue' in x.split()})
        link = fig.find('a', href=True) if fig is not None else None
        if link is None:
            raise ValueError(
                'Could not find the current issue on the HBR magazine page')
        url = self.hbr_absolute_url(link['href'])

        # The cover image is optional: without it the class-level defaults
        # for the cover and timefmt stay in effect.
        img = fig.find('img')
        if img is not None:
            src, alt = img.get('src'), img.get('alt')
            if src:
                self.cover_url = self.hbr_absolute_url(src)
            if alt:
                self.timefmt = alt
        return self.hbr_parse_toc(url)