commit 7adf757f72
Author: Kovid Goyal
Date:   2025-08-24 19:57:46 +05:30


@@ -3,17 +3,24 @@
 import json
 import re
 from collections import OrderedDict
-from urllib.parse import urlencode, urljoin
+from urllib.parse import urlparse
 
+from html5_parser import parse
 from mechanize import Request
 
 from calibre import browser, random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
+def absurl(url):
+    if url.startswith('/'):
+        url = 'https://www.hbr.org' + url
+    return url
+
+
 class HBR(BasicNewsRecipe):
     title = 'Harvard Business Review'
-    __author__ = 'unkn0wn, updated by ping'
+    __author__ = 'unkn0wn'
     description = (
         'Harvard Business Review is the leading destination for smart management thinking. '
         'Through its flagship magazine, books, and digital content and tools published on HBR.org, '
@@ -27,165 +34,167 @@ class HBR(BasicNewsRecipe):
     encoding = 'utf-8'
     remove_javascript = True
     no_stylesheets = True
-    auto_cleanup = False
     compress_news_images = True
     ignore_duplicate_articles = {'url'}
     base_url = 'https://hbr.org'
     remove_attributes = ['height', 'width', 'style']
+    resolve_internal_links = True
     extra_css = '''
-        h1.article-hed { font-size: x-large; margin-bottom: 0.4rem; }
-        .article-dek { font-size: large; font-style: italic; margin-bottom: 1rem; }
-        .article-byline { margin-top: 0.7rem; font-size: medium; font-style: normal; font-weight: bold; }
-        .pub-date { font-size: small; margin-bottom: 1rem; }
-        img {
-            display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
-            box-sizing: border-box;
-        }
-        .container--caption-credits-hero, .container--caption-credits-inline, span.credit { font-size: small; }
-        .question { font-weight: bold; }
-        .description-text {
-            margin: 1rem 0;
-            border-top: 1px solid gray;
-            padding-top: 0.5rem;
-            font-style: italic;
-        }
-    '''
+        .article-summary, .article-ideainbrief,
+        .description-text, .link--black, .topic, .auth {font-size:small; color:#202020;}
+        .credits--hero-image, .credits--inline-image, .caption--inline-image,
+        .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
+        .sub { font-style: italic; }
+        .article-byline-list {font-size:small; font-weight:bold;}
+        .question {font-weight:bold;}
+        .right-rail--container {font-size:small; color:#404040;}
+        .article-callout, .slug-content {color:#404040;}
+        .article-sidebar {color:#202020;}
+    '''
 
-    keep_only_tags = [
-        classes(
-            'headline-container article-dek-group pub-date hero-image-content '
-            'article-body standard-content'
-        ),
-    ]
-
-    remove_tags = [
-        classes(
-            'left-rail--container translate-message follow-topic '
-            'newsletter-container by-prefix related-topics--common'
-        ),
-        dict(name=['article-sidebar']),
-    ]
-
-    def preprocess_raw_html(self, raw_html, article_url):
-        soup = self.soup(raw_html)
-        # break author byline out of list
-        byline_list = soup.find('ul', class_='article-byline-list')
-        if byline_list:
-            byline = byline_list.parent
-            byline.append(
-                ', '.join(
-                    [
-                        self.tag_to_string(author)
-                        for author in byline_list.find_all(class_='article-author')
-                    ]
-                )
-            )
-            byline_list.decompose()
-
-        # Extract full article content
-        content_ele = soup.find(
-            'content',
-            attrs={
-                'data-index': True,
-                'data-page-year': True,
-                'data-page-month': True,
-                'data-page-seo-title': True,
-                'data-page-slug': True,
-            },
-        )
-        endpoint_url = 'https://hbr.org/api/article/piano/content?' + urlencode(
-            {
-                'year': content_ele['data-page-year'],
-                'month': content_ele['data-page-month'],
-                'seotitle': content_ele['data-page-seo-title'],
-            }
-        )
-        data = {
-            'contentKey': content_ele['data-index'],
-            'pageSlug': content_ele['data-page-slug'],
+    def preprocess_raw_html(self, raw, url):
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        data = json.loads(script[0].text)
+        data = data['props']['pageProps']['article']
+        endpoint_url = (
+            'https://platform.hbr.org/hbr/bff/content/article' + urlparse(url).path
+        )
+
+        topic = ''
+        if data.get('primaryTopic'):
+            topic = f'<div class="topic">{data["primaryTopic"]}</div>'
+        title = f'<h1>{data["title"]}</h1>'
+        dek = f'<p class="sub">{data.get("dek", "")}</p>'
+        hero = ''
+        if data.get('hero'):
+            hero = f'<img src={absurl(data["hero"]["image"]["defaultSrc"])}>'
+        auth = ''
+        if data.get('authors'):
+            auth = f'<p class="auth">{"By " + ", ".join(x["name"] for x in data.get("authors", {}))}</p>'
+
+        key_ = {
+            'contentKey': data['contentKey'],
         }
         headers = {
             'User-Agent': random_user_agent(),
             'Pragma': 'no-cache',
             'Cache-Control': 'no-cache',
             'Content-Type': 'application/json',
-            'Referer': article_url,
         }
         br = browser()
         req = Request(
             endpoint_url,
             headers=headers,
-            data=json.dumps(data),
+            data=json.dumps(key_),
             method='POST',
             timeout=self.timeout,
         )
         res = br.open(req)
-        article = json.loads(res.read())
-        new_soup = self.soup(article['content'])
-        # clear out existing partial content
-        for c in list(content_ele.children):
-            c.extract()  # use extract() instead of decompose() because of strings
-        content_ele.append(new_soup.body)
-        return str(soup)
+        body = json.loads(res.read())['content']
+
+        return (
+            '<html><body><article>'
+            + topic
+            + title
+            + dek
+            + hero
+            + auth
+            + body
+            + '</article></body></html>'
+        )
 
     recipe_specific_options = {
         'issue': {
             'short': 'Enter the Issue Number you want to download ',
-            'long': 'For example, 2403'
+            'long': 'For example, 2403',
         }
     }
 
     def parse_index(self):
         d = self.recipe_specific_options.get('issue')
         if not (d and isinstance(d, str)):
-            soup = self.index_to_soup(f'{self.base_url}/magazine')
-            a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
-            cov_url = a.find('img', attrs={'src': True})['src']
-            self.cover_url = urljoin(self.base_url, cov_url)
-            issue_url = urljoin(self.base_url, a['href'])
+            issue_url = f'{self.base_url}/magazine'
         else:
-            issue_url = 'https://hbr.org/archive-toc/BR' + d
-            mobj = re.search(r'archive-toc/(?P<issue>(BR)?\d+)\b', issue_url)
-            if mobj:
-                self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'
-
-        self.log('Downloading issue:', issue_url)
+            issue_url = self.base_url + '/archive-toc/BR' + d
+
         soup = self.index_to_soup(issue_url)
-        issue_title = soup.find('h1')
-        if issue_title:
-            self.timefmt = f' [{self.tag_to_string(issue_title)}]'
+        div = soup.find(**classes('backdrop-lightest'))
+        a = div.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
+        index = absurl(a['href'])
+        self.timefmt = ' [' + self.tag_to_string(div.find('h2')) + ']'
+        self.log('Downloading issue: ', index, self.timefmt)
+        cov_url = a.find('img', src=True)
+        if cov_url:
+            self.cover_url = absurl(cov_url['src'])
+        soup = self.index_to_soup(index)
 
         feeds = OrderedDict()
-        for h3 in soup.find_all('h3', attrs={'class': 'hed'}):
-            article_link_ele = h3.find('a')
-            if not article_link_ele:
-                continue
-
-            article_ele = h3.find_next_sibling(
-                'div', attrs={'class': 'stream-item-info'}
-            )
-            if not article_ele:
-                continue
-
-            title = self.tag_to_string(article_link_ele)
-            url = urljoin(self.base_url, article_link_ele['href'])
-
-            authors_ele = article_ele.select('ul.byline li')
-            authors = ', '.join([self.tag_to_string(a) for a in authors_ele])
-
-            article_desc = ''
-            dek_ele = h3.find_next_sibling('div', attrs={'class': 'dek'})
-            if dek_ele:
-                article_desc = self.tag_to_string(dek_ele) + ' | ' + authors
-            section_ele = (
+
+        for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
+            articles = []
+            a = h3.find('a')
+            title = self.tag_to_string(a)
+            url = absurl(a['href'])
+            auth = ''
+            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
+            if div:
+                aut = self.tag_to_string(div).replace('Magazine Article ', '')
+                auth = re.sub(r'(?<=\w)([A-Z])', r', \1', aut)
+            des = ''
+            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
+            if dek:
+                des = self.tag_to_string(dek)
+            desc = des + ' |' + auth.title()
+            section_title = 'Articles'
+            sec = (
                 h3.findParent('li')
                 .find_previous_sibling('div', **classes('stream-section-label'))
                 .find('h4')
             )
-            section_title = self.tag_to_string(section_ele).title()
-            feeds.setdefault(section_title, []).append(
-                {'title': title, 'url': url, 'description': article_desc}
-            )
-        return feeds.items()
+            if sec:
+                section_title = self.tag_to_string(sec).title()
+            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+            articles.append({'title': title, 'url': url, 'description': desc})
+            if articles:
+                if section_title not in feeds:
+                    feeds[section_title] = []
+                feeds[section_title] += articles
+        ans = [(key, val) for key, val in feeds.items()]
+        return ans
+
+    def preprocess_html(self, soup):
+        for slug in soup.findAll(**classes('slug-content')):
+            del slug['href']
+        for dek in soup.findAll(**classes('article-byline')):
+            for by in dek.findAll('span', attrs={'class': 'by-prefix'}):
+                by.extract()
+            for li in dek.findAll('li'):
+                li.name = 'span'
+        for div in soup.findAll(
+            'div', attrs={'class': ['article-summary', 'article-callout']}
+        ):
+            div.name = 'blockquote'
+        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
+            sidebar.name = 'blockquote'
+        for img in soup.findAll(attrs={'srcset': True}):
+            split = img['srcset'].split(',')
+            for x in split:
+                if '700w' in x:
+                    img['src'] = absurl(x.split()[0])
+            del img['srcset']
+        return soup
+
+    # HBR changes the content it delivers based on cookies, so the
+    # following ensures that we send no cookies
+    def get_browser(self, *args, **kwargs):
+        return self
+
+    def clone_browser(self, *args, **kwargs):
+        return self.get_browser()
+
+    def open_novisit(self, *args, **kwargs):
+        br = browser()
+        return br.open_novisit(*args, **kwargs)
+
+    open = open_novisit
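
Note on the new fetch path: preprocess_raw_html() now reads the article metadata from the page's __NEXT_DATA__ JSON and POSTs the article's contentKey to the platform.hbr.org bff endpoint to obtain the body HTML. The sketch below shows that request standalone, outside calibre; it is only an illustration of the shape of the call taken from the diff above. The example URL and contentKey are hypothetical placeholders, plain urllib is used instead of calibre's browser()/mechanize, and the endpoint may still depend on the header and cookie behaviour the recipe sets up.

    import json
    from urllib.parse import urlparse
    from urllib.request import Request, urlopen

    article_url = 'https://hbr.org/2025/09/example-article'  # placeholder, not a real article
    content_key = 'REPLACE_WITH_CONTENT_KEY'  # normally taken from the page's __NEXT_DATA__ script

    # Build the bff endpoint from the article path, as the recipe does
    endpoint = 'https://platform.hbr.org/hbr/bff/content/article' + urlparse(article_url).path

    req = Request(
        endpoint,
        data=json.dumps({'contentKey': content_key}).encode('utf-8'),
        headers={'Content-Type': 'application/json', 'User-Agent': 'Mozilla/5.0'},
        method='POST',
    )
    body = json.loads(urlopen(req).read())['content']  # HTML fragment of the article body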