Update Harvard Business Review

Kovid Goyal 2022-12-19 12:16:22 +05:30
parent 87da4098f5
commit 6812d671eb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -1,8 +1,14 @@
-from calibre.web.feeds.news import BasicNewsRecipe, classes
-from datetime import datetime
-from calibre import browser
-from collections import OrderedDict
 import re
+from collections import OrderedDict
+
+from calibre import browser
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+
+
+def absurl(url):
+    if url.startswith('/'):
+        url = 'https://www.hbr.org/' + url
+    return url
 
 
 class HBR(BasicNewsRecipe):
@@ -21,75 +27,66 @@ class HBR(BasicNewsRecipe):
     remove_attributes = ['height', 'width', 'style']
     encoding = 'utf-8'
     ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
     extra_css = '''
-        article-sidebar{font-family:Georgia,"Times New Roman",Times,serif; border:ridge; text-align:left;}
-        [close-caption]{ border:ridge; font-size:small; text-align:center;}
-        article-ideainbrief{font-family:Georgia,"Times New Roman",Times,serif; text-align:left; font-style:italic; }
-        .article-byline-list{font-size:small;}
-        .credits--hero-image{font-size:small;}
-        .credits--inline-image{font-size:small;}
-        .caption--inline-image{font-size:small;}
-        .description-text{font-size:small; color:gray;}
-        .right-rail--container{font-size:small; color:#4c4c4c;}
-        .link--black{font-size:small;}
-        .article-callout{color:#4c4c4c; text-align:center;}
-        .slug-content{color:gray;}
+        .article-summary, .article-ideainbrief, .description-text, .link--black {font-size:small; color:#202020;}
+        .credits--hero-image, .credits--inline-image, .caption--inline-image {font-size:small; text-align:center;}
+        .article-byline-list {font-size:small; font-weight:bold;}
+        .question {font-weight:bold;}
+        .right-rail--container {font-size:small; color:#404040;}
+        .article-callout, .slug-content {color:#404040;}
+        .article-sidebar {color:#202020;}
     '''
     keep_only_tags = [
         classes(
-            'headline-container hero-image-content article-summary article-body standard-content'
-            ' article-dek-group article-dek slug-container'
-        ),
-        dict(name='article-sidebar'),
+            'slug-container headline-container hero-image-content article-summary article-body '
+            'standard-content article-dek-group article-dek'
+        )
     ]
     remove_tags = [
         classes(
             'left-rail--container translate-message follow-topic newsletter-container'
-        ),
+        )
     ]
 
     def parse_index(self):
         soup = self.index_to_soup('https://hbr.org/magazine')
-        a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
-        url = a['href']
-        self.log('Downloading issue:', url)
-        cov_url = a.find('img', attrs={'src': True})['src']
-        self.cover_url = 'https://hbr.org' + cov_url
-        soup = self.index_to_soup('https://hbr.org' + url)
+        div = soup.find(**classes('backdrop-lightest'))
+        a = div.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
+        index = absurl(a['href'])
+        self.timefmt = ' [' + self.tag_to_string(div.find('h2')) + ']'
+        self.log('Downloading issue: ', index, self.timefmt)
+        cov_url = a.find('img', src=True)
+        if cov_url:
+            self.cover_url = absurl(cov_url['src'])
+        soup = self.index_to_soup(index)
 
         feeds = OrderedDict()
         for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
             articles = []
-            d = datetime.today()
-            for a in h3.findAll(
-                'a', href=lambda x: x.startswith('/' + d.strftime('%Y') + '/')
-            ):
-                title = self.tag_to_string(a)
-                url = a['href']
-                url = 'https://hbr.org' + url
-                div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
-                if div:
-                    aut = self.tag_to_string(div).replace('Magazine Article ', '')
-                    auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
-                dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
-                if dek:
-                    des = self.tag_to_string(dek)
-                desc = des + ' |' + auth.title()
-                sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
-                section_title = self.tag_to_string(sec).title()
-                self.log(section_title)
-                self.log('\t', title)
-                self.log('\t', desc)
-                self.log('\t\t', url)
-                articles.append({
-                    'title': title,
-                    'url': url,
-                    'description': desc})
+            a = h3.find('a')
+            title = self.tag_to_string(a)
+            url = absurl(a['href'])
+            auth = ''
+            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
+            if div:
+                aut = self.tag_to_string(div).replace('Magazine Article ', '')
+                auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
+            des = ''
+            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
+            if dek:
+                des = self.tag_to_string(dek)
+            desc = des + ' |' + auth.title()
+            section_title = 'Articles'
+            sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
+            if sec:
+                section_title = self.tag_to_string(sec).title()
+            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+            articles.append({'title': title, 'url': url, 'description': desc})
             if articles:
                 if section_title not in feeds:
                     feeds[section_title] = []
@@ -105,8 +102,10 @@ class HBR(BasicNewsRecipe):
                 by.extract()
             for li in dek.findAll('li'):
                 li.name = 'span'
-        for h2 in soup.findAll(('h2','h3')):
-            h2.name = 'h5'
+        for div in soup.findAll('div', attrs={'class':['article-summary', 'article-callout']}):
+            div.name = 'blockquote'
+        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
+            sidebar.name = 'blockquote'
         return soup
 
     # HBR changes the content it delivers based on cookies, so the
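The trailing context line above refers to HBR varying the content it serves based on cookies. A common way calibre recipes deal with that is to bypass the shared, cookie-carrying browser and satisfy every request with a fresh mechanize browser from calibre.browser. The sketch below illustrates that general pattern using the stock BasicNewsRecipe hooks (get_browser, clone_browser, open_novisit); it is an assumed illustration of the technique, not the remainder of this file.

    # Sketch of the no-cookies pattern (assumed, not taken from this diff):
    # hand the recipe itself back as the "browser" and serve each request
    # with a brand-new mechanize browser, so no cookies are ever retained.
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        br = browser()  # fresh browser per request; `browser` comes from the calibre import above
        return br.open_novisit(*args, **kwargs)

    open = open_novisit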