From 6812d671eb53fb69ec067385fffc31b8949ebbf6 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 19 Dec 2022 12:16:22 +0530
Subject: [PATCH] Update Harvard Business Review

---
 recipes/hbr.recipe | 97 +++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 49 deletions(-)

diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe
index f0b0c0218e..799b6d64b9 100644
--- a/recipes/hbr.recipe
+++ b/recipes/hbr.recipe
@@ -1,8 +1,14 @@
-from calibre.web.feeds.news import BasicNewsRecipe, classes
-from datetime import datetime
-from calibre import browser
-from collections import OrderedDict
 import re
+from collections import OrderedDict
+
+from calibre import browser
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+
+
+def absurl(url):  # turn a site-relative href ('/...') into an absolute URL; anything else is returned unchanged
+    if url.startswith('/'):
+        url = 'https://www.hbr.org' + url  # url already begins with '/', so the prefix must not end with one
+    return url
 
 
 class HBR(BasicNewsRecipe):
@@ -21,75 +27,66 @@ class HBR(BasicNewsRecipe):
     remove_attributes = ['height', 'width', 'style']
     encoding = 'utf-8'
     ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
+
     extra_css = '''
-        article-sidebar{font-family:Georgia,"Times New Roman",Times,serif; border:ridge; text-align:left;}
-        [close-caption]{ border:ridge; font-size:small; text-align:center;}
-        article-ideainbrief{font-family:Georgia,"Times New Roman",Times,serif; text-align:left; font-style:italic; }
-        .article-byline-list{font-size:small;}
-        .credits--hero-image{font-size:small;}
-        .credits--inline-image{font-size:small;}
-        .caption--inline-image{font-size:small;}
-        .description-text{font-size:small; color:gray;}
-        .right-rail--container{font-size:small; color:#4c4c4c;}
-        .link--black{font-size:small;}
-        .article-callout{color:#4c4c4c; text-align:center;}
-        .slug-content{color:gray;}
-        '''
+        .article-summary, .article-ideainbrief, .description-text, .link--black {font-size:small; color:#202020;}
+        .credits--hero-image, .credits--inline-image, .caption--inline-image {font-size:small; text-align:center;}
+        .article-byline-list {font-size:small; font-weight:bold;}
+        .question {font-weight:bold;}
+        .right-rail--container {font-size:small; color:#404040;}
+        .article-callout, .slug-content {color:#404040;}
+        .article-sidebar {color:#202020;}
+    '''
 
     keep_only_tags = [
         classes(
-            'headline-container hero-image-content article-summary article-body standard-content'
-            ' article-dek-group article-dek slug-container'
-        ),
-        dict(name='article-sidebar'),
+            'slug-container headline-container hero-image-content article-summary article-body '
+            'standard-content article-dek-group article-dek'
+        )
     ]
 
     remove_tags = [
         classes(
-            'left-rail--container translate-message follow-topic newsletter-container '
-        ),
+            'left-rail--container translate-message follow-topic newsletter-container'
+        )
     ]
 
     def parse_index(self):
         soup = self.index_to_soup('https://hbr.org/magazine')
-        a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
-        url = a['href']
-        self.log('Downloading issue:', url)
-        cov_url = a.find('img', attrs={'src': True})['src']
-        self.cover_url = 'https://hbr.org' + cov_url
-        soup = self.index_to_soup('https://hbr.org' + url)
+        div = soup.find(**classes('backdrop-lightest'))
+        a = div.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
+        index = absurl(a['href'])
+        self.timefmt = ' [' + self.tag_to_string(div.find('h2')) + ']'
+        self.log('Downloading issue: ', index, self.timefmt)
+        cov_url = a.find('img', src=True)
+        if cov_url:
+            self.cover_url = absurl(cov_url['src'])
+        soup = self.index_to_soup(index)
 
         feeds = OrderedDict()
 
         for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
             articles = []
-            d = datetime.today()
-            for a in h3.findAll(
-                'a', href=lambda x: x.startswith('/' + d.strftime('%Y') + '/')
-            ):
-
-                title = self.tag_to_string(a)
-                url = a['href']
-                url = 'https://hbr.org' + url
+            a = h3.find('a')
+            title = self.tag_to_string(a)
+            url = absurl(a['href'])
+            auth = ''
             div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
             if div:
                 aut = self.tag_to_string(div).replace('Magazine Article ', '')
                 auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
+            des = ''
             dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
             if dek:
                 des = self.tag_to_string(dek)
             desc = des + ' |' + auth.title()
+            section_title = 'Articles'
             sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
-            section_title = self.tag_to_string(sec).title()
-            self.log(section_title)
-            self.log('\t', title)
-            self.log('\t', desc)
-            self.log('\t\t', url)
-
-            articles.append({
-                'title': title,
-                'url': url,
-                'description': desc})
+            if sec:
+                section_title = self.tag_to_string(sec).title()
+            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+            articles.append({'title': title, 'url': url, 'description': desc})
             if articles:
                 if section_title not in feeds:
                     feeds[section_title] = []
@@ -105,8 +102,10 @@ class HBR(BasicNewsRecipe):
                 by.extract()
             for li in dek.findAll('li'):
                 li.name = 'span'
-        for h2 in soup.findAll(('h2','h3')):
-            h2.name = 'h5'
+        for div in soup.findAll('div', attrs={'class':['article-summary', 'article-callout']}):
+            div.name = 'blockquote'
+        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
+            sidebar.name = 'blockquote'
         return soup
 
     # HBR changes the content it delivers based on cookies, so the