Updated Revista Muy Interesante

2025-11-16 19:43:03 -05:00 · 2010-10-03 18:56:15 -06:00 · 2010-10-03 18:56:15 -06:00 · d1ba8c78d4
commit d1ba8c78d4
parent 3f58e8a35a
1 changed file with 46 additions and 14 deletions
--- a/resources/recipes/revista_muy.recipe
+++ b/resources/recipes/revista_muy.recipe
@ -1,3 +1,4 @@
+from calibre.web.feeds.news import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from BeautifulSoup import Tag

@ -10,26 +11,31 @@ class RevistaMuyInteresante(BasicNewsRecipe):
    language = 'es'

    no_stylesheets = True
-    remove_attributes = ['style', 'font']
+    remove_javascript = True
+
+    extra_css              = ' .txt_articulo{ font-family: sans-serif; font-size: medium; text-align: justify } .contentheading{font-family: serif; font-size: large; font-weight: bold; color: #000000; text-align: center}'

-    #then we add our own style(s) like this:
-    extra_css = '''
-                       .contentheading{font-weight: bold}
-                       p {font-size: 4px;font-family: Times New Roman;}
-                    '''

    def preprocess_html(self, soup):
+            for item in soup.findAll(style=True):
+               del item['style']
+
            for img_tag in soup.findAll('img'):
-                parent_tag = img_tag.parent
-                if parent_tag.name == 'td':
-                    if not parent_tag.get('class') == 'txt_articulo': break
-                    imagen = img_tag
-                    new_tag = Tag(soup,'p')
-                    img_tag.replaceWith(new_tag)
-                    div = soup.find(attrs={'class':'article_category'})
-                    div.insert(0,imagen)
+                imagen = img_tag
+                new_tag = Tag(soup,'p')
+                img_tag.replaceWith(new_tag)
+                div = soup.find(attrs={'class':'article_category'})
+                div.insert(0,imagen)
+                break
            return soup

+
+    preprocess_regexps = [
+        (re.compile(r'<td class="contentheading" width="100%">.*?</td>', re.DOTALL|re.IGNORECASE), lambda match: '<td class="contentheading">' + match.group().replace('<td class="contentheading" width="100%">','').strip().replace('</td>','').strip() + '</td>'),
+
+    ]
+
+
    keep_only_tags = [dict(name='div', attrs={'class':['article']}),dict(name='td', attrs={'class':['txt_articulo']})]

    remove_tags        = [
@ -37,6 +43,7 @@ class RevistaMuyInteresante(BasicNewsRecipe):
                            ,dict(name='div', attrs={'id':['comment']})
                            ,dict(name='td', attrs={'class':['buttonheading']})
                            ,dict(name='div', attrs={'class':['tags_articles']})
+                            ,dict(name='table', attrs={'class':['pagenav']})
                         ]

    remove_tags_after = dict(name='div', attrs={'class':'tags_articles'})
@ -71,8 +78,33 @@ class RevistaMuyInteresante(BasicNewsRecipe):
            for title, url in [
                ('Historia',
                 'http://www.muyinteresante.es/historia-articulos'),
+                ('Ciencia',
+                 'http://www.muyinteresante.es/ciencia-articulos'),
+                ('Naturaleza',
+                 'http://www.muyinteresante.es/naturaleza-articulos'),
+                ('Tecnología',
+                 'http://www.muyinteresante.es/tecnologia-articulos'),
+                ('Salud',
+                 'http://www.muyinteresante.es/salud-articulos'),
+                ('Más Muy',
+                 'http://www.muyinteresante.es/muy'),
+                ('Innova - Automoción',
+                 'http://www.muyinteresante.es/articulos-innovacion-autos'),
+                ('Innova - Salud',
+                 'http://www.muyinteresante.es/articulos-innovacion-salud'),
+                ('Innova - Medio Ambiente',
+                 'http://www.muyinteresante.es/articulos-innovacion-medio-ambiente'),
+                ('Innova - Alimentación',
+                 'http://www.muyinteresante.es/articulos-innovacion-alimentacion'),
+                ('Innova - Sociedad',
+                 'http://www.muyinteresante.es/articulos-innovacion-sociedad'),
+                ('Innova - Tecnología',
+                 'http://www.muyinteresante.es/articulos-innovacion-tecnologia'),
+                ('Innova - Ocio',
+                 'http://www.muyinteresante.es/articulos-innovacion-ocio'),
             ]:
               articles = self.nz_parse_section(url)
               if articles:
                   feeds.append((title, articles))
            return feeds
+