calibre/recipes/el_mercurio_chile.recipe

#!/usr/bin/env  python2
# -*- coding: latin-1 mode: python -*-

# Module metadata: licence, copyright holder and docstring markup format.
__license__   = 'GPL v3'
__copyright__ = '2009-2015, Darko Miletic <darko.miletic at gmail.com>'
__docformat__ = 'restructuredtext es'
# Bare string naming the site the feeds below point at. It follows the
# assignments above, so it is a no-op expression statement rather than the
# module docstring.
'''
www.emol.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class ElMercurio(BasicNewsRecipe):
    """Calibre news recipe for Emol.com (El Mercurio, Chile).

    The entries in ``feeds`` are section listing pages that ``parse_index``
    fetches and scrapes as HTML to build the article index.
    """

    title                 = 'Emol.com - El sitio de noticias online de Chile'
    __author__            = 'Darko Miletic'
    description           = 'El sitio de noticias online de Chile'
    publisher             = 'El Mercurio S.A.P.'
    category              = 'news, politics, Chile'
    oldest_article        = 2
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf8'
    masthead_url          = 'http://static.emol.cl/emol50/img/logo_emol.gif'
    remove_javascript     = True
    use_embedded_content  = False
    language              = 'es_CL'

    # Metadata handed to the conversion pipeline; mirrors the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only the headline/byline blocks and the article body are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['cont_iz_titulobajada', 'info-notaemol-por', 'info-notaemol-porfecha']}),
        dict(name='div', attrs={'id': 'texto_noticia'}),
    ]
    # Drop the related-articles box.
    remove_tags = [dict(name='div', attrs={'id': 'cont_iz_cuerpo_relacionados'})]

    # (section title, listing page URL) pairs consumed by parse_index().
    # NOTE(review): 'Espectaculos' points at the "cultura" listing and
    # 'Tecnologia' at the "economia" listing - confirm whether the labels or
    # the URLs are the intended ones.
    feeds = [
        (u'Nacional', u'http://www.emol.com/noticias/nacional/todas.aspx'),
        (u'Mundo', u'http://www.emol.com/noticias/internacional/todas.aspx'),
        (u'Deportes', u'http://www.emol.com/noticias/deportes/todas.aspx'),
        (u'Espectaculos', u'http://www.emol.com/noticias/cultura/todas.aspx'),
        (u'Tecnologia', u'http://www.emol.com/noticias/economia/todas.aspx'),
    ]

    def parse_index(self):
        """Build the article index by scraping every section listing page.

        Returns a list of ``(feed_title, articles)`` tuples. Each article is
        a dict with the keys ``title``, ``date`` (always an empty string),
        ``url`` and ``description``. A section whose listing container
        (``div#caja_listado_noticia_todas``) is not found is left out of the
        result. Listing entries without a link are skipped instead of
        aborting the whole download.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            soup = self.index_to_soup(feedurl)
            arts = soup.find('div', attrs={'id': 'caja_listado_noticia_todas'})
            if not arts:
                continue
            articles = []
            for item in arts.findAll('div', attrs={'class': 'listado'}):
                atag = item.find('a')
                # An entry with no anchor, or an anchor with no href, cannot
                # be downloaded; indexing it blindly would raise and kill
                # the whole run.
                if atag is None or not atag.get('href'):
                    continue
                articles.append({
                    'title': self.tag_to_string(atag),
                    'date': '',
                    'url': atag['href'],
                    # The first <span> of the entry is used as the summary.
                    'description': self.tag_to_string(item.find('span')),
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds