#!/usr/bin/env python
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
from css_selectors import Select


def absurl(url):
    if url.startswith('/'):
        url = 'https://www.scientificamerican.com' + url
    return url
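# A minimal usage sketch (illustrative values only): absurl() resolves
# root-relative links against the Scientific American site root and passes
# absolute URLs through unchanged, e.g.
#   absurl('/article/some-slug/')    -> 'https://www.scientificamerican.com/article/some-slug/'
#   absurl('https://example.com/x')  -> 'https://example.com/x'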
keep_classes = {'article-header', 'article-content',
                'article-media', 'article-author', 'article-text'}
remove_classes = {'aside-banner', 'moreToExplore', 'article-footer'}
class ScientificAmerican(BasicNewsRecipe):
    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = ' [%B %Y]'

    needs_subscription = 'optional'

    keep_only_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(keep_classes))}),
    ]
    remove_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(remove_classes))}),
        dict(id=['seeAlsoLinks']),
    ]
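    # The keep/remove rules above match an element when any of its CSS classes
    # appears in keep_classes or remove_classes respectively. A rough sketch of
    # the same test, with a hypothetical class attribute value:
    #   x = 'article-header col-md-8'
    #   bool(set(x.split()).intersection(keep_classes))  # True -> element kept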
    def get_browser(self, *args):
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('https://www.scientificamerican.com/my-account/login/')
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br
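    # Login is optional (needs_subscription = 'optional'): when credentials
    # are supplied, the browser returned by BasicNewsRecipe.get_browser()
    # submits the form whose id is 'login'. The field names ('emailAddress',
    # 'password') mirror the login page markup at the time of writing and may
    # need updating if the site changes.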
    def parse_index(self):
        # Get the cover, date and issue URL
        root = self.index_to_soup(
            'https://www.scientificamerican.com/sciammag/', as_tree=True)
        select = Select(root)
        url = [x.get('href', '') for x in select('main .store-listing__img a')][0]
        url = absurl(url)
        self.cover_url = [x.get('src', '') for x in select('main .store-listing__img img')][0]

        # Now parse the actual issue to get the list of articles
        select = Select(self.index_to_soup(url, as_tree=True))
        feeds = []
        for i, section in enumerate(select('#sa_body .toc-articles')):
            if i == 0:
                feeds.append(
                    ('Features', list(self.parse_sciam_features(select, section))))
            else:
                feeds.extend(self.parse_sciam_departments(select, section))

        return feeds
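    # parse_index() returns the usual BasicNewsRecipe index structure: a list
    # of (section_title, articles) tuples in which every article is a dict
    # with 'title', 'url' and 'description' keys, e.g. (illustrative values):
    #   [('Features', [{'title': '...', 'url': '...', 'description': '...'}]),
    #    ('Department name', [...])]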
    def parse_sciam_features(self, select, section):
        for article in select('article[data-article-title]', section):
            title = article.get('data-article-title')
            url = 'https://www.scientificamerican.com/{}/'.format(article.get('id').replace('-', '/', 1))
            desc = ''
            for p in select('p.t_body', article):
                desc += self.tag_to_string(p)
                break
            for p in select('.t_meta', article):
                desc += ' ' + self.tag_to_string(p)
                break
            self.log('Found feature article: %s at %s' % (title, url))
            self.log('\t' + desc)
            yield {'title': title, 'url': url, 'description': desc}
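    # Article URLs are rebuilt from the element id, treating only the first
    # '-' as the path separator. As an illustration (hypothetical id), an
    # article element with id='article-my-feature-slug' would map to
    # 'https://www.scientificamerican.com/article/my-feature-slug/'.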
    def parse_sciam_departments(self, select, section):
        section_title, articles = 'Unknown', []
        for li in select('li[data-article-title]', section):
            for span in select('span.department-title', li):
                if articles:
                    yield section_title, articles
                section_title, articles = self.tag_to_string(span), []
                self.log('\nFound section: %s' % section_title)
                break
            url = 'https://www.scientificamerican.com/{}/'.format(li.get('id').replace('-', '/', 1))
            for h2 in select('h2.t_listing-title', li):
                title = self.tag_to_string(h2)
                break
            else:
                continue
            articles.append(
                {'title': title, 'url': url, 'description': ''})
            self.log('\tFound article: %s at %s' % (title, url))
        if articles:
            yield section_title, articles
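    # Department feeds are delimited by 'span.department-title' elements: each
    # time a new department title is encountered, the articles collected so
    # far are yielded as one (section_title, articles) feed and collection
    # starts over, with a final flush after the loop.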