# calibre/recipes/el_pais_babelia.recipe

from calibre.web.feeds.news import BasicNewsRecipe


class ElPaisBabelia(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping the El Pais Babelia index page."""

    title = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    # Index page fetched by parse_index() to discover sections and articles.
    INDEX = 'http://www.elpais.com/suple/babelia/'
    language = 'es'

    # Article clean-up: everything is keyed on the 'estructura_2col' div, and
    # the listed div blocks are stripped from the downloaded article pages.
    remove_tags_before = dict(name='div', attrs={'class': 'estructura_2col'})
    keep_tags = [dict(name='div', attrs={'class': 'estructura_2col'})]
    remove_tags = [dict(name='div', attrs={'class': 'votos estirar'}),
                   dict(name='div', attrs={'id': 'utilidades'}),
                   dict(name='div', attrs={'class': 'info_relacionada'}),
                   dict(name='div', attrs={'class': 'mod_apoyo'}),
                   dict(name='div', attrs={'class': 'contorno_f'}),
                   dict(name='div', attrs={'class': 'pestanias'}),
                   dict(name='div', attrs={'class': 'otros_webs'}),
                   dict(name='div', attrs={'id': 'pie'})
                   ]
    remove_javascript = True

    def parse_index(self):
        """Build the feed list from the index page.

        Every ``div`` with class ``contenedor_nuevo`` is treated as a section
        whose title is the text of its first ``h1``. Within a section, an
        anchor becomes an article only if its ``href`` starts with ``/`` and
        the anchor itself carries a non-empty ``class`` attribute.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a non-empty list of ``{'title': ..., 'url': ...}`` dicts.
            Sections that yield no articles are omitted.
        """
        feeds = []
        soup = self.index_to_soup(self.INDEX)
        for section in soup.findAll('div', attrs={'class': 'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                # Only site-relative links are considered.
                if not url.startswith('/'):
                    continue
                # Look at the anchor's own attribute. Searching the serialized
                # markup for 'class=' also matches nested tags, after which
                # post['class'] raised KeyError for anchors without a class.
                klass = post.get('class')
                if not klass:
                    continue
                # NOTE(review): this prefix uses the elpais.es host while INDEX
                # uses elpais.com -- confirm the mismatch is intentional.
                url = 'http://www.elpais.es' + url
                title = self.tag_to_string(post)
                self.log()
                self.log('--> post:  ', post)
                self.log('--> url:   ', url)
                self.log('--> title: ', title)
                self.log('--> class: ', klass)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section_title, articles))
        return feeds