diff --git a/resources/recipes/revista_muy.recipe b/resources/recipes/revista_muy.recipe
index ae3d47466c..e452a6f053 100644
--- a/resources/recipes/revista_muy.recipe
+++ b/resources/recipes/revista_muy.recipe
@@ -1,3 +1,4 @@
+import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from BeautifulSoup import Tag
 
@@ -10,26 +11,31 @@ class RevistaMuyInteresante(BasicNewsRecipe):
     language = 'es'
 
     no_stylesheets = True
-    remove_attributes = ['style', 'font']
+    remove_javascript = True
+
+    extra_css = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
 
-    #then we add our own style(s) like this:
-    extra_css = '''
-                    .contentheading{font-weight: bold}
-                    p {font-size: 4px;font-family: Times New Roman;}
-                '''
 
     def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+
         for img_tag in soup.findAll('img'):
-            parent_tag = img_tag.parent
-            if parent_tag.name == 'td':
-                if not parent_tag.get('class') == 'txt_articulo': break
-                imagen = img_tag
-                new_tag = Tag(soup,'p')
-                img_tag.replaceWith(new_tag)
-                div = soup.find(attrs={'class':'article_category'})
-                div.insert(0,imagen)
+            div = soup.find(attrs={'class':'article_category'})
+            if div is not None:
+                # Move the first image to the top of the category block; leave the page untouched when that block is absent.
+                img_tag.replaceWith(Tag(soup,'p'))
+                div.insert(0,img_tag)
+            break
         return soup
 
+
+    preprocess_regexps = [
+        (re.compile(r'.*?', re.DOTALL|re.IGNORECASE), lambda match: '' + match.group().replace('','').strip().replace('','').strip() + ''),
+        # NOTE(review): as written the pattern matches only empty strings and the replacement is '', so this entry is a no-op; the markup literals look lost - confirm the intended tags.
+    ]
+
+
     keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
 
     remove_tags = [
@@ -37,6 +43,7 @@ class RevistaMuyInteresante(BasicNewsRecipe):
                     ,dict(name='div', attrs={'id':['comment']})
                     ,dict(name='td', attrs={'class':['buttonheading']})
                     ,dict(name='div', attrs={'class':['tags_articles']})
+                    ,dict(name='table', attrs={'class':['pagenav']})
                   ]
 
     remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
@@ -71,8 +78,33 @@ class RevistaMuyInteresante(BasicNewsRecipe):
         for title, url in [
             ('Historia',
              'http://www.muyinteresante.es/historia-articulos'),
+            ('Ciencia',
+             'http://www.muyinteresante.es/ciencia-articulos'),
+            ('Naturaleza',
+             'http://www.muyinteresante.es/naturaleza-articulos'),
+            ('Tecnología',
+             'http://www.muyinteresante.es/tecnologia-articulos'),
+            ('Salud',
+             'http://www.muyinteresante.es/salud-articulos'),
+            ('Más Muy',
+             'http://www.muyinteresante.es/muy'),
+            ('Innova - Automoción',
+             'http://www.muyinteresante.es/articulos-innovacion-autos'),
+            ('Innova - Salud',
+             'http://www.muyinteresante.es/articulos-innovacion-salud'),
+            ('Innova - Medio Ambiente',
+             'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
+            ('Innova - Alimentación',
+             'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
+            ('Innova - Sociedad',
+             'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
+            ('Innova - Tecnología',
+             'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
+            ('Innova - Ocio',
+             'http://www.muyinteresante.es/articulos-innovacion-ocio'),
             ]:
             articles = self.nz_parse_section(url)
             if articles:
                 feeds.append((title, articles))
         return feeds
+