calibre/recipes/fortune_magazine.recipe

from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec for elements carrying any of the given classes.

    ``classes`` is a space separated string of class names. The returned dict
    has a single ``attrs`` entry whose ``class`` matcher yields a truthy value
    when the element's class attribute shares at least one name with it.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # A missing or empty class attribute is handed back unchanged (falsy).
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}


class Fortune(BasicNewsRecipe):
    """Recipe that builds its article list from the Fortune magazine section page."""

    title = 'Fortune Magazine'
    __author__ = 'Rick Shang'

    description = 'FORTUNE is a global business magazine that has been revered in its content and credibility since 1930. FORTUNE covers the entire field of business, including specific companies and business trends, prominent business leaders, and new ideas shaping the global marketplace.'  # noqa
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    # Elements retained from each article page: the headline, the lead media /
    # byline / timestamp / author elements, and the article body containers.
    keep_only_tags = [
            dict(name='h1', attrs={'class': lambda x: x and 'headline' in x}),
            classes('lead-media longform-bylines longform-timestamps author'),
            dict(id=['article-body', 'longform-body']),
    ]

    no_javascript = True
    no_stylesheets = True
    # Credentials are optional: get_browser() only signs in when both a
    # username and a password have been supplied.
    needs_subscription = 'optional'

    def get_browser(self):
        """Return the base recipe browser, signing in first if credentials are set.

        When both ``self.username`` and ``self.password`` are truthy, the
        sitemap page is opened and the form with id ``sign-in-form`` is filled
        in and submitted. Otherwise the browser is returned unchanged.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('http://fortune.com/sitemap/')
            br.select_form(id='sign-in-form')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup, *a):
        """Turn lazy-image link placeholders into ``<img>`` tags.

        For every element with the ``lazy-image`` class, the first descendant
        ``<a href=...>`` is renamed to ``img`` and its ``href`` is copied to
        ``src``. The same (mutated) soup is returned.
        """
        for div in soup.findAll(**classes('lazy-image')):
            # Named ``link`` so the ``*a`` parameter is not shadowed.
            link = div.find('a', href=True)
            if link is not None:
                link.name = 'img'
                link['src'] = link['href']
        return soup

    def parse_index(self):
        """Collect the articles listed on the magazine section page.

        Returns:
            A one-section index, ``[('Articles', articles)]``, where every
            article is a dict with ``title``, ``url`` and ``description`` keys.
            ``description`` is the text of the ``article-info-extended``
            element when present, otherwise an empty string.
        """
        # Go to the latest issue
        soup = self.index_to_soup('http://fortune.com/section/magazine/')
        articles = []

        for article in soup.findAll('article', **classes('type-article')):
            div = article.find('div', **classes('article-info'))
            a = div.find('a', href=True) if div is not None else None
            if a is None:
                # A teaser without an info block or link cannot be turned into
                # an article entry; skip it rather than abort the whole index.
                self.log('Skipping article teaser without a usable link')
                continue
            url = a['href']
            if url.startswith('/'):
                # Site-relative links need the host prepended.
                url = 'http://fortune.com' + url
            title = self.tag_to_string(a)
            ai = div.find('div', **classes('article-info-extended'))
            # The description comes from the extended info element itself.
            desc = self.tag_to_string(ai) if ai else ''
            self.log('Article:', title, 'at', url)
            articles.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', articles)]