# calibre/recipes/revista_muy.recipe

#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


class RevistaMuyInteresante(BasicNewsRecipe):
    """Recipe for the Spanish magazine site muyinteresante.com.

    The article list is scraped from the site's home page and every article
    is placed in a single 'Articles' section. The cover image is looked up on
    the magazine's Magzter page.
    """

    title = 'Revista Muy Interesante'
    __author__ = 'unkn0wn'
    description = 'Revista Muy Interesante, es un sitio con información sobre ciencia, tecnología, historia, sociedad, medio ambiente, etc.'
    language = 'es'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://www.muyinteresante.com/static/img/logo_web.svg'
    resolve_internal_links = True

    def get_cover_url(self):
        """Return the cover image URL found on the Magzter page.

        Returns ``None`` (after logging a warning) when the page has no
        cover ``<img>`` or that tag has no ``src`` attribute, instead of
        raising on a missing element.
        """
        soup = self.index_to_soup(
            'https://www.magzter.com/ES/Zinet-Media-Global/Muy-Interesante-Espa%C3%B1a/Science/1806044'
        )
        # The matcher receives the tag's id attribute (None when absent) and
        # accepts it when 'mgd__lhd__cover' is one of its whitespace-separated
        # tokens.
        img = soup.find('img', id=lambda s: s and 'mgd__lhd__cover' in s.split())
        src = img.get('src') if img is not None else None
        if not src:
            self.log.warn('Could not find the cover image on the Magzter page')
            return None
        return src

    extra_css = '''
        .c-detail__bar, .c-detail__author, .c-detail__media__txt { font-size:small; }
        .default-epigraph { font-style:italic; }
    '''

    # Only the <article class="c-detail"> element of a page is kept.
    keep_only_tags = [dict(name='article', attrs={'class':'c-detail'})]

    remove_tags = [
        dict(name=['aside', 'svg', 'script']),
        classes('c-detail__share')
    ]

    def preprocess_html(self, soup):
        """Adjust the article markup in place and return the same soup.

        Paragraphs inside the author box are turned into ``<div>`` elements,
        and every ``<h2>``/``<h3>`` is demoted to ``<h4>``.
        """
        au = soup.find(**classes('c-detail__author'))
        if au:
            for p in au.findAll('p'):
                p.name = 'div'
        for h in soup.findAll(['h2', 'h3']):
            h.name = 'h4'
        return soup

    def parse_index(self):
        """Build the article list from the site's home page.

        Returns a one-element list ``[('Articles', articles)]`` where each
        article is a dict with the keys ``title``, ``url`` and
        ``description``. ``<article>`` elements without an
        ``<a class="page-link" href=...>`` are skipped.
        """
        # Local import keeps the new dependency self-contained in this block.
        from urllib.parse import urljoin

        index_url = 'https://www.muyinteresante.com/'
        soup = self.index_to_soup(index_url)
        ans = []
        for article in soup.findAll('article'):
            a = article.find('a', attrs={'class':'page-link', 'href':True})
            if not a:
                continue
            title = self.tag_to_string(a)
            # Resolve relative or protocol-relative hrefs against the index
            # page; urljoin leaves absolute URLs untouched.
            url = urljoin(index_url, a['href'])
            desc = ''
            info = article.find(**classes('c-article__info_content'))
            if info:
                desc = self.tag_to_string(info)
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]