#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe, classes


class RevistaMuyInteresante(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Muy Interesante homepage.

    Articles are collected from ``https://www.muyinteresante.com/`` and the
    cover image is looked up on the magazine's Magzter page.
    """

    title = 'Revista Muy Interesante'
    __author__ = 'unkn0wn'
    description = 'Revista Muy Interesante, es un sitio con información sobre ciencia, tecnología, historia, sociedad, medio ambiente, etc.'
    language = 'es'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://www.muyinteresante.com/static/img/logo_web.svg'
    resolve_internal_links = True

    def get_cover_url(self):
        """Return the cover image URL scraped from Magzter, or ``None``.

        The cover is the ``<img>`` whose ``id`` attribute contains the token
        ``mgd__lhd__cover``. When that element (or its ``src`` attribute) is
        missing, ``None`` is returned instead of raising.
        """
        soup = self.index_to_soup(
            'https://www.magzter.com/ES/Zinet-Media-Global/Muy-Interesante-Espa%C3%B1a/Science/1806044'
        )
        cover = soup.find(
            'img', id=lambda s: s and 'mgd__lhd__cover' in s.split()
        )
        if cover is None:
            # The page layout may change; do not fail on a missing element.
            self.log('Cover image not found on the Magzter page')
            return None
        return cover.get('src')

    extra_css = '''
        .c-detail__bar, .c-detail__author, .c-detail__media__txt { font-size:small; }
        .default-epigraph { font-style:italic; }
    '''

    # Keep only the main article container of each downloaded page.
    keep_only_tags = [dict(name='article', attrs={'class': 'c-detail'})]

    remove_tags = [
        dict(name=['aside', 'svg', 'script']),
        classes('c-detail__share'),
    ]

    def preprocess_html(self, soup):
        """Adjust tag names in the article markup and return the soup.

        ``<p>`` elements inside the ``c-detail__author`` element become
        ``<div>``, and every ``<h2>``/``<h3>`` in the page becomes ``<h4>``.
        """
        au = soup.find(**classes('c-detail__author'))
        if au:
            for p in au.findAll('p'):
                p.name = 'div'
        for h in soup.findAll(['h2', 'h3']):
            h.name = 'h4'
        return soup

    def parse_index(self):
        """Build the article list from the site's homepage.

        Every ``<article>`` element containing an ``<a class="page-link">``
        with an ``href`` yields one entry; the text of its
        ``c-article__info_content`` element, when present, is used as the
        description. Returns a single feed named ``'Articles'``.
        """
        soup = self.index_to_soup('https://www.muyinteresante.com/')
        ans = []
        for article in soup.findAll('article'):
            a = article.find('a', attrs={'class': 'page-link', 'href': True})
            if not a:
                # No usable link in this <article>; skip it.
                continue
            title = self.tag_to_string(a)
            url = a['href']
            desc = ''
            info = article.find(**classes('c-article__info_content'))
            if info:
                desc = self.tag_to_string(info)
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]