Update Harvard Business Review

This commit is contained in:
Kovid Goyal 2022-06-16 13:33:05 +05:30
parent 4abda26e92
commit aa3312d514
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -1,6 +1,8 @@
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes
from datetime import datetime from datetime import datetime
from calibre import browser from calibre import browser
from collections import OrderedDict
import re
class HBR(BasicNewsRecipe): class HBR(BasicNewsRecipe):
@@ -46,32 +48,44 @@ class HBR(BasicNewsRecipe):
cov_url = a.find('img', attrs={'src': True})['src'] cov_url = a.find('img', attrs={'src': True})['src']
self.cover_url = 'https://hbr.org' + cov_url self.cover_url = 'https://hbr.org' + cov_url
soup = self.index_to_soup('https://hbr.org' + url) soup = self.index_to_soup('https://hbr.org' + url)
ans = []
feeds = OrderedDict()
for h3 in soup.findAll('h3', attrs={'class': 'hed'}): for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
articles = []
d = datetime.today() d = datetime.today()
for a in h3.findAll( for a in h3.findAll(
'a', href=lambda x: x.startswith('/' + d.strftime('%Y') + '/') 'a', href=lambda x: x.startswith('/' + d.strftime('%Y') + '/')
): ):
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = a['href'] url = a['href']
url = 'https://hbr.org' + url url = 'https://hbr.org' + url
div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'}) div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
if div: if div:
auth = self.tag_to_string(div) aut = self.tag_to_string(div).replace('Magazine Article ', '')
auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
dek = h3.find_next_sibling('div', attrs={'class': 'dek'}) dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
if dek: if dek:
des = self.tag_to_string(dek) des = self.tag_to_string(dek)
desc = des + ' |' + auth desc = des + ' |' + auth.title()
sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
section_title = self.tag_to_string(sec).title()
self.log(section_title)
self.log('\t', title) self.log('\t', title)
self.log('\t', desc) self.log('\t', desc)
self.log('\t\t', url) self.log('\t\t', url)
ans.append({ articles.append({
'title': title, 'title': title,
'url': url, 'url': url,
'description': desc}) 'description': desc})
return [('Articles', ans)] if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.items()]
return ans
# HBR changes the content it delivers based on cookies, so the # HBR changes the content it delivers based on cookies, so the
# following ensures that we send no cookies # following ensures that we send no cookies