# calibre/recipes/hbr.recipe

from calibre.web.feeds.news import BasicNewsRecipe, classes
from datetime import datetime
from calibre import browser
from collections import OrderedDict
import re


class HBR(BasicNewsRecipe):
    """Recipe that downloads the current Harvard Business Review issue.

    The issue's table of contents is located from https://hbr.org/magazine
    and every article listed there is grouped under its section heading.
    """

    title = 'Harvard Business Review'
    __author__ = 'unkn0wn'
    description = (
        'Harvard Business Review is the leading destination for smart management thinking.'
        ' Through its flagship magazine, books, and digital content and tools published on HBR.org,'
        ' Harvard Business Review aims to provide professionals around the world with rigorous insights'
        ' and best practices to help lead themselves and their organizations more effectively and to make a positive impact.')
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    masthead_url = 'http://hbr.org/resources/css/images/hbr_logo.svg'
    remove_attributes = ['height', 'width', 'style']
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    extra_css = '''
        article-sidebar{font-family:Georgia,"Times New Roman",Times,serif; border:ridge; text-align:left;}
        [close-caption]{ border:ridge; font-size:small; text-align:center;}
        article-ideainbrief{font-family:Georgia,"Times New Roman",Times,serif; text-align:left; font-style:italic; }
        .article-byline-list{font-size:small;}
        .credits--hero-image{font-size:small;}
        .credits--inline-image{font-size:small;}
        .caption--inline-image{font-size:small;}
        .description-text{font-size:small; color:gray;}
        .right-rail--container{font-size:small; color:#4c4c4c;}
        .link--black{font-size:small;}
        .article-callout{color:#4c4c4c; text-align:center;}
        .slug-content{color:gray;}
        '''

    keep_only_tags = [
        classes(
            'headline-container hero-image-content article-summary article-body standard-content'
            ' article-dek-group article-dek slug-container'
        ),
        dict(name='article-sidebar'),
    ]

    remove_tags = [
        classes(
            'left-rail--container translate-message follow-topic newsletter-container '
        ),
    ]

    def parse_index(self):
        """Build the list of feeds for the current magazine issue.

        Returns:
            A list of ``(section_title, articles)`` tuples in the order the
            sections first appear in the table of contents. Each article is
            a dict with the keys ``title``, ``url`` and ``description``.

        Raises:
            ValueError: if the magazine page contains no link to an issue
                table of contents.
        """
        base = 'https://hbr.org'
        soup = self.index_to_soup(base + '/magazine')
        toc_link = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
        if toc_link is None:
            raise ValueError(
                'Could not find the current issue link on https://hbr.org/magazine')
        toc_url = toc_link['href']
        self.log('Downloading issue:', toc_url)

        # The cover is optional: a missing image must not abort the download.
        cover_img = toc_link.find('img', attrs={'src': True})
        if cover_img is not None:
            self.cover_url = base + cover_img['src']
        else:
            self.log('No cover image found for issue:', toc_url)

        soup = self.index_to_soup(base + toc_url)

        # Article links look like /YYYY/MM/slug. Matching any four-digit year
        # (instead of only the current calendar year) keeps issues whose
        # article URLs carry a different year from coming out empty.
        article_href = re.compile(r'^/\d{4}/')

        def is_article_href(href):
            # href is None for anchors without the attribute.
            return bool(href) and article_href.match(href) is not None

        feeds = OrderedDict()

        for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
            links = h3.findAll('a', href=is_article_href)
            if not links:
                # Not an article headline; nothing to add for this heading.
                continue

            # Byline: reset per headline so a missing byline never reuses the
            # previous article's author.
            auth = ''
            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
            if div is not None:
                aut = self.tag_to_string(div).replace('Magazine Article ', '')
                # Insert ", " before a capital letter that directly follows a
                # word character (author names that run together).
                auth = re.sub(r"(?<=\w)([A-Z])", r", \1", aut)

            # Summary text, likewise reset per headline.
            des = ''
            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
            if dek is not None:
                des = self.tag_to_string(dek)
            desc = des + ' |' + auth.title()

            # Section heading: the <h4> inside the section label that precedes
            # the list item holding this headline. Fall back to a generic
            # section when any part of that structure is missing.
            section_title = 'Articles'
            li = h3.findParent('li')
            label = None
            if li is not None:
                label = li.find_previous_sibling('div', **classes('stream-section-label'))
            sec = label.find('h4') if label is not None else None
            if sec is not None:
                section_title = self.tag_to_string(sec).title()

            self.log(section_title)
            for a in links:
                title = self.tag_to_string(a)
                url = base + a['href']
                self.log('\t', title)
                self.log('\t', desc)
                self.log('\t\t', url)
                feeds.setdefault(section_title, []).append({
                    'title': title,
                    'url': url,
                    'description': desc})

        return list(feeds.items())

    def preprocess_html(self, soup):
        """Tidy an article page before conversion and return the soup."""
        # Drop the href from slug elements.
        for slug in soup.findAll(**classes('slug-content')):
            del slug['href']
        # Remove the by-prefix span and turn byline list items into spans.
        for dek in soup.findAll(**classes('article-byline')):
            for by in dek.findAll('span', attrs={'class':'by-prefix'}):
                by.extract()
            for li in dek.findAll('li'):
                li.name = 'span'
        # Demote h2/h3 headings inside the article to h5.
        for h2 in soup.findAll(('h2','h3')):
            h2.name = 'h5'
        return soup

    # HBR changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies
    def get_browser(self, *args, **kwargs):
        """Return the recipe itself, which stands in for the browser object."""
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same stand-in browser as get_browser()."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Open a URL with a freshly created browser on every call."""
        br = browser()
        return br.open_novisit(*args, **kwargs)

    # Both entry points go through the same fresh-browser path.
    open = open_novisit