Update Harvard Business Review

Kovid Goyal 2022-12-19 12:16:22 +05:30
parent 87da4098f5
commit 6812d671eb
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -1,8 +1,14 @@
-from calibre.web.feeds.news import BasicNewsRecipe, classes
-from datetime import datetime
-from calibre import browser
-from collections import OrderedDict
 import re
+from collections import OrderedDict
+
+from calibre import browser
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+
+
+def absurl(url):
+    if url.startswith('/'):
+        url = 'https://www.hbr.org/' + url
+    return url
 
 
 class HBR(BasicNewsRecipe):
@@ -21,75 +27,66 @@ class HBR(BasicNewsRecipe):
     remove_attributes = ['height', 'width', 'style']
     encoding = 'utf-8'
     ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
     extra_css = '''
-        article-sidebar{font-family:Georgia,"Times New Roman",Times,serif; border:ridge; text-align:left;}
-        [close-caption]{ border:ridge; font-size:small; text-align:center;}
-        article-ideainbrief{font-family:Georgia,"Times New Roman",Times,serif; text-align:left; font-style:italic; }
-        .article-byline-list{font-size:small;}
-        .credits--hero-image{font-size:small;}
-        .credits--inline-image{font-size:small;}
-        .caption--inline-image{font-size:small;}
-        .description-text{font-size:small; color:gray;}
-        .right-rail--container{font-size:small; color:#4c4c4c;}
-        .link--black{font-size:small;}
-        .article-callout{color:#4c4c4c; text-align:center;}
-        .slug-content{color:gray;}
+        .article-summary, .article-ideainbrief, .description-text, .link--black {font-size:small; color:#202020;}
+        .credits--hero-image, .credits--inline-image, .caption--inline-image {font-size:small; text-align:center;}
+        .article-byline-list {font-size:small; font-weight:bold;}
+        .question {font-weight:bold;}
+        .right-rail--container {font-size:small; color:#404040;}
+        .article-callout, .slug-content {color:#404040;}
+        .article-sidebar {color:#202020;}
     '''
     keep_only_tags = [
         classes(
-            'headline-container hero-image-content article-summary article-body standard-content'
-            ' article-dek-group article-dek slug-container'
-        ),
-        dict(name='article-sidebar'),
+            'slug-container headline-container hero-image-content article-summary article-body '
+            'standard-content article-dek-group article-dek'
+        )
     ]
     remove_tags = [
         classes(
             'left-rail--container translate-message follow-topic newsletter-container'
-        ),
+        )
     ]
 
     def parse_index(self):
         soup = self.index_to_soup('https://hbr.org/magazine')
-        a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
-        url = a['href']
-        self.log('Downloading issue:', url)
-        cov_url = a.find('img', attrs={'src': True})['src']
-        self.cover_url = 'https://hbr.org' + cov_url
-        soup = self.index_to_soup('https://hbr.org' + url)
+        div = soup.find(**classes('backdrop-lightest'))
+        a = div.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
+        index = absurl(a['href'])
+        self.timefmt = ' [' + self.tag_to_string(div.find('h2')) + ']'
+        self.log('Downloading issue: ', index, self.timefmt)
+        cov_url = a.find('img', src=True)
+        if cov_url:
+            self.cover_url = absurl(cov_url['src'])
+        soup = self.index_to_soup(index)
 
         feeds = OrderedDict()
         for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
             articles = []
-            d = datetime.today()
-            for a in h3.findAll(
-                'a', href=lambda x: x.startswith('/' + d.strftime('%Y') + '/')
-            ):
-                title = self.tag_to_string(a)
-                url = a['href']
-                url = 'https://hbr.org' + url
-                div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
-                if div:
-                    aut = self.tag_to_string(div).replace('Magazine Article ', '')
-                    auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
-                dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
-                if dek:
-                    des = self.tag_to_string(dek)
-                desc = des + ' |' + auth.title()
-                sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
-                section_title = self.tag_to_string(sec).title()
-                self.log(section_title)
-                self.log('\t', title)
-                self.log('\t', desc)
-                self.log('\t\t', url)
-                articles.append({
-                    'title': title,
-                    'url': url,
-                    'description': desc})
+            a = h3.find('a')
+            title = self.tag_to_string(a)
+            url = absurl(a['href'])
+            auth = ''
+            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
+            if div:
+                aut = self.tag_to_string(div).replace('Magazine Article ', '')
+                auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)
+            des = ''
+            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
+            if dek:
+                des = self.tag_to_string(dek)
+            desc = des + ' |' + auth.title()
+            section_title = 'Articles'
+            sec = h3.findParent('li').find_previous_sibling('div', **classes('stream-section-label')).find('h4')
+            if sec:
+                section_title = self.tag_to_string(sec).title()
+            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
+            articles.append({'title': title, 'url': url, 'description': desc})
             if articles:
                 if section_title not in feeds:
                     feeds[section_title] = []
@@ -105,8 +102,10 @@ class HBR(BasicNewsRecipe):
                 by.extract()
             for li in dek.findAll('li'):
                 li.name = 'span'
-        for h2 in soup.findAll(('h2','h3')):
-            h2.name = 'h5'
+        for div in soup.findAll('div', attrs={'class':['article-summary', 'article-callout']}):
+            div.name = 'blockquote'
+        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
+            sidebar.name = 'blockquote'
         return soup
 
     # HBR changes the content it delivers based on cookies, so the
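The trailing context line above refers to HBR varying the content it serves based on cookies. A common way calibre recipes deal with that is to bypass the shared, cookie-carrying browser and satisfy every request with a fresh mechanize browser from calibre.browser. The sketch below illustrates that general pattern using the stock BasicNewsRecipe hooks (get_browser, clone_browser, open_novisit); it is an assumed illustration of the technique, not the remainder of this file.

    # Sketch of the no-cookies pattern (assumed, not taken from this diff):
    # hand the recipe itself back as the "browser" and serve each request
    # with a brand-new mechanize browser, so no cookies are ever retained.
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        br = browser()  # fresh browser per request; `browser` comes from the calibre import above
        return br.open_novisit(*args, **kwargs)

    open = open_novisit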