#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import re
from collections import OrderedDict
from urllib.parse import urlparse

from html5_parser import parse
from mechanize import Request

from calibre import browser, random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    if url.startswith('/'):
        url = 'https://www.hbr.org' + url
    return url


class HBR(BasicNewsRecipe):
    title = 'Harvard Business Review'
    __author__ = 'unkn0wn'
    description = (
        'Harvard Business Review is the leading destination for smart management thinking. '
        'Through its flagship magazine, books, and digital content and tools published on HBR.org, '
        'Harvard Business Review aims to provide professionals around the world with rigorous insights '
        'and best practices to help lead themselves and their organizations more effectively and to '
        'make a positive impact.'
    )
    language = 'en'
    masthead_url = 'https://hbr.org/resources/css/images/hbr_logo.svg'
    publication_type = 'magazine'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    compress_news_images = True
    ignore_duplicate_articles = {'url'}
    base_url = 'https://hbr.org'

    remove_attributes = ['height', 'width', 'style']
    resolve_internal_links = True

    extra_css = '''
        .article-summary, .article-ideainbrief, .description-text, .link--black, .topic, .auth {font-size:small; color:#202020;}
        .credits--hero-image, .credits--inline-image, .caption--inline-image, .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
        .sub {font-style:italic;}
        .article-byline-list {font-size:small; font-weight:bold;}
        .question {font-weight:bold;}
        .right-rail--container {font-size:small; color:#404040;}
        .article-callout, .slug-content {color:#404040;}
        .article-sidebar {color:#202020;}
    '''

    def preprocess_raw_html(self, raw, url):
        # The article metadata lives in the Next.js bootstrap JSON embedded in the page.
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        data = json.loads(script[0].text)
        data = data['props']['pageProps']['article']

        # The article body itself is served by a separate JSON endpoint.
        endpoint_url = (
            'https://platform.hbr.org/hbr/bff/content/article' + urlparse(url).path
        )

        topic = ''
        if data.get('primaryTopic'):
            topic = f'<div class="topic">{data["primaryTopic"]}</div>'

        title = f'<h1>{data["title"]}</h1>'

        dek = f'<div class="sub">{data.get("dek", "")}</div>'

        hero = ''
        if data.get('hero'):
            # Embed the hero image if present (the exact key inside data['hero'] is assumed here).
            hero = f'<img src="{absurl(data["hero"]["url"])}">'

        auth = ''
        if data.get('authors'):
            auth = f'<div class="auth">{"By " + ", ".join(x["name"] for x in data.get("authors", {}))}</div>'

        # POST the article's contentKey to the endpoint to retrieve the body HTML.
        key_ = {
            'contentKey': data['contentKey'],
        }
        headers = {
            'User-Agent': random_user_agent(),
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/json',
        }
        br = browser()
        req = Request(
            endpoint_url,
            headers=headers,
            data=json.dumps(key_),
            method='POST',
            timeout=self.timeout,
        )
        res = br.open(req)
        body = json.loads(res.read())['content']

        return (
            '<html><body>'
            + topic + title + dek + hero + auth + body
            + '</body></html>'
        )

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download',
            'long': 'For example, 2403',
        }
    }

    def parse_index(self):
        d = self.recipe_specific_options.get('issue')
        if not (d and isinstance(d, str)):
            # No issue specified: use the current issue linked from the magazine page.
            soup = self.index_to_soup(f'{self.base_url}/magazine')
            a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
            cov_url = a.find('img', attrs={'src': True})['src']
            self.cover_url = absurl(cov_url)
            issue_url = absurl(a['href'])
        else:
            issue_url = 'https://hbr.org/archive-toc/BR' + d
            mobj = re.search(r'archive-toc/(?P<issue>(BR)?\d+)\b', issue_url)
            if mobj:
                self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'

        soup = self.index_to_soup(issue_url)
        issue_title = soup.find('h1')
        if issue_title:
            self.timefmt = f' [{self.tag_to_string(issue_title)}]'

        feeds = OrderedDict()
        for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
            articles = []
            a = h3.find('a')
            title = self.tag_to_string(a)
            url = absurl(a['href'])

            auth = ''
            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
            if div:
                aut = self.tag_to_string(div).replace('Magazine Article ', '')
                # Author names are run together; insert a comma before each capital letter.
                auth = re.sub(r'(?<=\w)([A-Z])', r', \1', aut)

            des = ''
            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
            if dek:
                des = self.tag_to_string(dek)
            desc = des + ' |' + auth.title()

            section_title = 'Articles'
            sec = (
                h3.findParent('li')
                .find_previous_sibling('div', **classes('stream-section-label'))
                .find('h4')
            )
            if sec:
                section_title = self.tag_to_string(sec).title()

            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
            articles.append({'title': title, 'url': url, 'description': desc})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.items()]
        return ans

    def preprocess_html(self, soup):
        for slug in soup.findAll(**classes('slug-content')):
            del slug['href']
        for dek in soup.findAll(**classes('article-byline')):
            for by in dek.findAll('span', attrs={'class': 'by-prefix'}):
                by.extract()
            for li in dek.findAll('li'):
                li.name = 'span'
        for div in soup.findAll(
            'div', attrs={'class': ['article-summary', 'article-callout']}
        ):
            div.name = 'blockquote'
        # HBR uses custom element names for sidebars and "Idea in Brief" boxes.
        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
            sidebar.name = 'blockquote'
        # Pick a medium-resolution candidate from srcset, then drop the attribute.
        for img in soup.findAll('img', attrs={'srcset': True}):
            split = img['srcset'].split(',')
            for x in split:
                if '700w' in x:
                    img['src'] = absurl(x.split()[0])
            del img['srcset']
        return soup

    # HBR changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies.
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit
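
# A quick way to try this recipe outside the calibre GUI is the ebook-convert
# CLI; the output filename below is illustrative:
#
#   ebook-convert hbr.recipe hbr.epub --test -vv
#
# --test limits the download to a couple of articles per section, and -vv
# prints verbose progress, which is handy while tweaking the recipe.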