# calibre/recipes/popscience.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS class names.

    ``classes`` is a space separated string of class names.  The returned
    dict has a single ``attrs`` entry whose ``class`` value is a predicate:
    given an element's class attribute string, it yields the (possibly
    empty) set of names shared with the wanted ones, or the attribute value
    itself when that value is missing or empty.
    """
    wanted = frozenset(classes.split(' '))

    def shared_names(value):
        # Missing/empty class attributes are handed back untouched (falsy).
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': shared_names})


class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Recipe for Popular Science, built from the popsci.com section pages.

    The article index is assembled in ``parse_index`` by scraping the
    listing page of each section with ``parse_section_index``.
    """

    title = 'Popular Science'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Popular Science'
    publisher = 'Popular Science'
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'url'}
    no_stylesheets = True
    # Matcher for elements carrying any of these CSS classes.
    keep_only_tags = [
        classes('content-body article-header featured-img'),
    ]

    def parse_section_index(self, slug):
        """Yield one article dict per item listed on a section page.

        ``slug`` is the path component of the section URL on popsci.com.
        Every yielded dict has the keys ``title``, ``url`` and
        ``description`` (empty when the item has no ``dek`` element).

        Items without a usable link are skipped, and a page that lacks the
        expected ``main-module`` container yields nothing.
        """
        soup = self.index_to_soup('https://www.popsci.com/{}/'.format(slug))
        main = soup.find(**classes('main-module'))
        if main is None:
            # The listing container is missing (e.g. the page layout
            # changed): report it rather than failing on ``None.findAll``.
            self.log(' ', 'No article listing found for section:', slug)
            return
        for div in main.findAll(**classes('main-item')):
            a = div.find('a', href=True, **classes('linkable'))
            if a is None:
                # Nothing to download for an item that has no link.
                continue
            url = a['href']
            title_tag = a.find(**classes('title'))
            # Fall back to the text of the link itself when the item has no
            # dedicated title element.
            title = self.tag_to_string(a if title_tag is None else title_tag)
            desc = ''
            dek = a.find(**classes('dek'))
            if dek is not None:
                desc = self.tag_to_string(dek)
            self.log(' ', title, url)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_index(self):
        """Return the index as a list of ``(section title, articles)`` tuples.

        Sections for which no articles were found are left out.
        """
        sections = []
        # A tuple of pairs (rather than a dict) keeps the section order
        # fixed on every Python version.
        for slug, title in (
            ('science', 'Science'),
            ('technology', 'Technology'),
            ('diy', 'DIY'),
            ('reviews', 'Reviews'),
        ):
            self.log('Section:', title)
            articles = list(self.parse_section_index(slug))
            if articles:
                sections.append((title, articles))
        return sections

    def preprocess_html(self, soup):
        """Set ``src`` of every ``<img>`` that has ``data-medsrc`` to that value.

        The soup is modified in place and returned.
        """
        for img in soup.findAll('img', attrs={'data-medsrc': True}):
            img['src'] = img['data-medsrc']
        return soup