Updated Revista Muy Interesante

2025-11-17 03:53:02 -05:00 · 2010-10-03 18:56:15 -06:00 · 2010-10-03 18:56:15 -06:00 · d1ba8c78d4
commit d1ba8c78d4
parent 3f58e8a35a
1 changed files with 46 additions and 14 deletions
--- a/resources/recipes/revista_muy.recipe
+++ b/resources/recipes/revista_muy.recipe
@ -1,3 +1,4 @@
 from calibre.web.feeds.news import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from BeautifulSoup import Tag
@ -10,26 +11,31 @@ class RevistaMuyInteresante(BasicNewsRecipe):
    language = 'es'
    no_stylesheets = True
-    remove_attributes = ['style', 'font']
+    remove_javascript = True
    # NOTE(review): extra_css is assigned twice in a row here. If both
    # assignments are really present in the recipe (this is a patch view, so
    # confirm), the second assignment rebinds the name and the
    # .txt_articulo / .contentheading rules on the next line are discarded.
    extra_css              = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'
    # Recipe-specific styles. Despite the original wording ("then we add our
    # own styles"), this is a plain assignment, not a concatenation.
    # NOTE(review): a 4px paragraph font size looks unusually small - confirm
    # it is intended.
    extra_css = '''
                       .contentheading{font-weight: bold}
                       p {font-size: 4px;font-family: Times New Roman;}
                    '''
    def preprocess_html(self, soup):
        """Strip inline styles and move the lead image into the article header.

        Every tag carrying a ``style`` attribute has that attribute removed.
        Then only the first ``<img>`` of the document is considered: unless it
        sits directly inside a ``<td>`` whose class is not ``txt_articulo``,
        it is replaced in place by an empty ``<p>`` and re-inserted as the
        first child of the first element with class ``article_category``.

        When the page has no image, or no ``article_category`` element, the
        image is left where it is instead of failing on a ``None`` target.

        :param soup: parsed page (BeautifulSoup tree); it is modified in place.
        :return: the same ``soup`` object.
        """
        for styled_tag in soup.findAll(style=True):
            del styled_tag['style']

        images = soup.findAll('img')
        if not images:
            return soup
        lead_image = images[0]

        # An image inside a table cell is only relocated when that cell is
        # the article body cell.
        container = lead_image.parent
        if container.name == 'td' and container.get('class') != 'txt_articulo':
            return soup

        # Look the destination up *before* detaching the image, so the tree
        # is left untouched when the destination is missing.
        category = soup.find(attrs={'class': 'article_category'})
        if category is None:
            return soup

        # Leave an empty paragraph where the image used to be and hoist the
        # image to the top of the category block.
        lead_image.replaceWith(Tag(soup, 'p'))
        category.insert(0, lead_image)
        return soup
    # List of (compiled pattern, replacement callable) pairs.
    # NOTE(review): presumably applied by BasicNewsRecipe to the raw HTML
    # before it is parsed - confirm against the calibre API docs.
    preprocess_regexps = [
        # Matches a whole <td class="contentheading" width="100%">...</td>
        # cell (non-greedy, across newlines, case-insensitive) and rebuilds it
        # as <td class="contentheading">...</td> with the inner text stripped
        # of surrounding whitespace, i.e. the width attribute is dropped.
        # The str.replace calls inside the lambda are case-sensitive, so a cell
        # matched only thanks to IGNORECASE keeps its original tags inside the
        # new wrapper.
        (re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
    ]
    # Tag selectors (tag name + attribute filter): <div class="article"> and
    # <td class="txt_articulo">.
    # NOTE(review): presumably everything outside these tags is discarded by
    # the base recipe - confirm.
    keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]
    remove_tags        = [
@ -37,6 +43,7 @@ class RevistaMuyInteresante(BasicNewsRecipe):
                            ,dict(name='div', attrs={'id':['comment']})
                            ,dict(name='td', attrs={'class':['buttonheading']})
                            ,dict(name='div', attrs={'class':['tags_articles']})
                            ,dict(name='table', attrs={'class':['pagenav']})
                         ]
    remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
@ -71,8 +78,33 @@ class RevistaMuyInteresante(BasicNewsRecipe):
            for title, url in [
                ('Historia',
                 'http://www.muyinteresante.es/historia-articulos'),
                ('Ciencia',
                 'http://www.muyinteresante.es/ciencia-articulos'),
                ('Naturaleza',
                 'http://www.muyinteresante.es/naturaleza-articulos'),
                ('Tecnología',
                 'http://www.muyinteresante.es/tecnologia-articulos'),
                ('Salud',
                 'http://www.muyinteresante.es/salud-articulos'),
                ('Más Muy',
                 'http://www.muyinteresante.es/muy'),
                ('Innova - Automoción',
                 'http://www.muyinteresante.es/articulos-innovacion-autos'),
                ('Innova - Salud',
                 'http://www.muyinteresante.es/articulos-innovacion-salud'),
                ('Innova - Medio Ambiente',
                 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
                ('Innova - Alimentación',
                 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
                ('Innova - Sociedad',
                 'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
                ('Innova - Tecnología',
                 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
                ('Innova - Ocio',
                 'http://www.muyinteresante.es/articulos-innovacion-ocio'),
             ]:
               articles = self.nz_parse_section(url)
               if articles:
                   feeds.append((title, articles))
            return feeds