diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe
index 8b326a18af..18aa5bc612 100644
--- a/recipes/hbr.recipe
+++ b/recipes/hbr.recipe
@@ -3,17 +3,24 @@
 import json
 import re
 from collections import OrderedDict
-from urllib.parse import urlencode, urljoin
+from urllib.parse import urlparse
 
 from mechanize import Request
+from html5_parser import parse
 
 from calibre import browser, random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
+def absurl(url):
+    if url.startswith('/'):
+        url = 'https://www.hbr.org' + url
+    return url
+
+
 class HBR(BasicNewsRecipe):
     title = 'Harvard Business Review'
-    __author__ = 'unkn0wn, updated by ping'
+    __author__ = 'unkn0wn'
     description = (
         'Harvard Business Review is the leading destination for smart management thinking. '
         'Through its flagship magazine, books, and digital content and tools published on HBR.org, '
@@ -27,165 +34,167 @@ class HBR(BasicNewsRecipe):
     encoding = 'utf-8'
     remove_javascript = True
     no_stylesheets = True
-    auto_cleanup = False
     compress_news_images = True
     ignore_duplicate_articles = {'url'}
     base_url = 'https://hbr.org'
 
-    remove_attributes = ['height', 'width', 'style']
+    resolve_internal_links = True
+
     extra_css = '''
-        h1.article-hed { font-size: x-large; margin-bottom: 0.4rem; }
-        .article-dek { font-size: large; font-style: italic; margin-bottom: 1rem; }
-        .article-byline { margin-top: 0.7rem; font-size: medium; font-style: normal; font-weight: bold; }
-        .pub-date { font-size: small; margin-bottom: 1rem; }
-        img {
-            display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
-            box-sizing: border-box;
-        }
-        .container--caption-credits-hero, .container--caption-credits-inline, span.credit { font-size: small; }
-        .question { font-weight: bold; }
-        .description-text {
-            margin: 1rem 0;
-            border-top: 1px solid gray;
-            padding-top: 0.5rem;
-            font-style: italic;
-        }
-    '''
+        .article-summary, .article-ideainbrief,
+        .description-text, .link--black, .topic, .auth {font-size:small; color:#202020;}
+        .credits--hero-image, .credits--inline-image, .caption--inline-image,
+        .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
+        .sub { font-style: italic; }
+        .article-byline-list {font-size:small; font-weight:bold;}
+        .question {font-weight:bold;}
+        .right-rail--container {font-size:small; color:#404040;}
+        .article-callout, .slug-content {color:#404040;}
+        .article-sidebar {color:#202020;}
+    '''
 
-    keep_only_tags = [
-        classes(
-            'headline-container article-dek-group pub-date hero-image-content '
-            'article-body standard-content'
-        ),
-    ]
-
-    remove_tags = [
-        classes(
-            'left-rail--container translate-message follow-topic '
-            'newsletter-container by-prefix related-topics--common'
-        ),
-        dict(name=['article-sidebar']),
-    ]
-
-    def preprocess_raw_html(self, raw_html, article_url):
-        soup = self.soup(raw_html)
-
-        # break author byline out of list
-        byline_list = soup.find('ul', class_='article-byline-list')
-        if byline_list:
-            byline = byline_list.parent
-            byline.append(
-                ', '.join(
-                    [
-                        self.tag_to_string(author)
-                        for author in byline_list.find_all(class_='article-author')
-                    ]
-                )
-            )
-            byline_list.decompose()
-
-        # Extract full article content
-        content_ele = soup.find(
-            'content',
-            attrs={
-                'data-index': True,
-                'data-page-year': True,
-                'data-page-month': True,
-                'data-page-seo-title': True,
-                'data-page-slug': True,
-            },
+    def preprocess_raw_html(self, raw, url):
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        data = json.loads(script[0].text)
+        data = data['props']['pageProps']['article']
+        endpoint_url = (
+            'https://platform.hbr.org/hbr/bff/content/article' + urlparse(url).path
         )
-        endpoint_url = 'https://hbr.org/api/article/piano/content?' + urlencode(
-            {
-                'year': content_ele['data-page-year'],
-                'month': content_ele['data-page-month'],
-                'seotitle': content_ele['data-page-seo-title'],
-            }
-        )
-        data = {
-            'contentKey': content_ele['data-index'],
-            'pageSlug': content_ele['data-page-slug'],
+
+        topic = ''
+        if data.get('primaryTopic'):
{data["primaryTopic"]}
' + title = f'

{data["title"]}

' + dek = f'

{data.get("dek", "")}

' + hero = '' + if data.get('hero'): + hero = f'' + auth = '' + if data.get('authors'): + auth = f'

{"By " + ", ".join(x["name"] for x in data.get("authors", {}))}

' + + key_ = { + 'contentKey': data['contentKey'], } headers = { 'User-Agent': random_user_agent(), 'Pragma': 'no-cache', 'Cache-Control': 'no-cache', 'Content-Type': 'application/json', - 'Referer': article_url, } br = browser() req = Request( endpoint_url, headers=headers, - data=json.dumps(data), + data=json.dumps(key_), method='POST', timeout=self.timeout, ) res = br.open(req) - article = json.loads(res.read()) - new_soup = self.soup(article['content']) - # clear out existing partial content - for c in list(content_ele.children): - c.extract() # use extract() instead of decompose() because of strings - content_ele.append(new_soup.body) - return str(soup) + body = json.loads(res.read())['content'] + + return ( + '
' + + topic + + title + + dek + + hero + + auth + + body + + '
+            + '</body></html>'
+        )
 
     recipe_specific_options = {
         'issue': {
             'short': 'Enter the Issue Number you want to download ',
-            'long': 'For example, 2403'
+            'long': 'For example, 2403',
         }
     }
 
     def parse_index(self):
         d = self.recipe_specific_options.get('issue')
         if not (d and isinstance(d, str)):
-            soup = self.index_to_soup(f'{self.base_url}/magazine')
-            a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
-            cov_url = a.find('img', attrs={'src': True})['src']
-            self.cover_url = urljoin(self.base_url, cov_url)
-            issue_url = urljoin(self.base_url, a['href'])
+            issue_url = f'{self.base_url}/magazine'
         else:
-            issue_url = 'https://hbr.org/archive-toc/BR' + d
-            mobj = re.search(r'archive-toc/(?P<issue>(BR)?\d+)\b', issue_url)
-            if mobj:
-                self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'
-
-        self.log('Downloading issue:', issue_url)
+            issue_url = self.base_url + '/archive-toc/BR' + d
 
         soup = self.index_to_soup(issue_url)
-        issue_title = soup.find('h1')
-        if issue_title:
-            self.timefmt = f' [{self.tag_to_string(issue_title)}]'
+        div = soup.find(**classes('backdrop-lightest'))
+        a = div.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
+        index = absurl(a['href'])
+        self.timefmt = ' [' + self.tag_to_string(div.find('h2')) + ']'
+        self.log('Downloading issue: ', index, self.timefmt)
+        cov_url = a.find('img', src=True)
+        if cov_url:
+            self.cover_url = absurl(cov_url['src'])
 
+        soup = self.index_to_soup(index)
         feeds = OrderedDict()
 
-        for h3 in soup.find_all('h3', attrs={'class': 'hed'}):
-            article_link_ele = h3.find('a')
-            if not article_link_ele:
-                continue
-            article_ele = h3.find_next_sibling(
-                'div', attrs={'class': 'stream-item-info'}
-            )
-            if not article_ele:
-                continue
-
-            title = self.tag_to_string(article_link_ele)
-            url = urljoin(self.base_url, article_link_ele['href'])
-
-            authors_ele = article_ele.select('ul.byline li')
-            authors = ', '.join([self.tag_to_string(a) for a in authors_ele])
-
-            article_desc = ''
-            dek_ele = h3.find_next_sibling('div', attrs={'class': 'dek'})
-            if dek_ele:
-                article_desc = self.tag_to_string(dek_ele) + ' | ' + authors
-            section_ele = (
+        for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
+            articles = []
+            a = h3.find('a')
+            title = self.tag_to_string(a)
+            url = absurl(a['href'])
+            auth = ''
+            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
+            if div:
+                aut = self.tag_to_string(div).replace('Magazine Article ', '')
+                auth = re.sub(r'(?<=\w)([A-Z])', r', \1', aut)
+            des = ''
+            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
+            if dek:
+                des = self.tag_to_string(dek)
+            desc = des + ' |' + auth.title()
+            section_title = 'Articles'
+            sec = (
                 h3.findParent('li')
                 .find_previous_sibling('div', **classes('stream-section-label'))
                 .find('h4')
             )
-            section_title = self.tag_to_string(section_ele).title()
-            feeds.setdefault(section_title, []).append(
-                {'title': title, 'url': url, 'description': article_desc}
-            )
-        return feeds.items()
+            if sec:
+                section_title = self.tag_to_string(sec).title()
+            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+            articles.append({'title': title, 'url': url, 'description': desc})
+            if articles:
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+        ans = [(key, val) for key, val in feeds.items()]
+        return ans
+
+    def preprocess_html(self, soup):
+        for slug in soup.findAll(**classes('slug-content')):
+            del slug['href']
+        for dek in soup.findAll(**classes('article-byline')):
+            for by in dek.findAll('span', attrs={'class': 'by-prefix'}):
+                by.extract()
+            for li in dek.findAll('li'):
+                li.name = 'span'
+        for div in soup.findAll(
+            'div', attrs={'class': ['article-summary', 'article-callout']}
+        ):
+            div.name = 'blockquote'
+        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
+            sidebar.name = 'blockquote'
+        for img in soup.findAll(attrs={'srcset': True}):
+            split = img['srcset'].split(',')
+            for x in split:
+                if '700w' in x:
+                    img['src'] = absurl(x.split()[0])
+            del img['srcset']
+        return soup
+
+    # HBR changes the content it delivers based on cookies, so the
+    # following ensures that we send no cookies
+    def get_browser(self, *args, **kwargs):
+        return self
+
+    def clone_browser(self, *args, **kwargs):
+        return self.get_browser()
+
+    def open_novisit(self, *args, **kwargs):
+        br = browser()
+        return br.open_novisit(*args, **kwargs)
+
+    open = open_novisit