# calibre/recipes/perfil.recipe

__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
perfil.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Perfil(BasicNewsRecipe):
    """News recipe for perfil.com (Argentina).

    Declares the RSS feeds to fetch, which parts of each article page to
    keep or discard, and a small stylesheet applied to the output.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Perfil'
    __author__ = 'Darko Miletic'
    description = 'Noticias de Argentina y el resto del mundo'
    publisher = 'perfil.com'
    category = 'news, politics, Argentina'
    language = 'es_AR'
    masthead_url = 'http://www.perfil.com/export/sites/diarioperfil/arte/10/logo_perfilcom_mm.gif'

    # --- Fetch settings ---------------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    remove_empty_feeds = True

    # --- Extra CSS supplied to the recipe framework ----------------------
    extra_css = """
                                body{font-family: Arial,Helvetica,sans-serif }
                                .seccion{border-bottom: 1px dotted #666666; text-transform: uppercase; font-size: x-large}
                                .foto1 h1{font-size: x-small}
                                h1{font-family: Georgia,"Times New Roman",serif}
                                img{margin-bottom: 0.4em}
                            """

    # Conversion options are filled from the metadata declared above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Page clean-up rules ---------------------------------------------
    # Only elements carrying one of these classes are kept.
    keep_only_tags = [
        dict(attrs={'class': ['bd468a', 'cuerpoSuperior']}),
    ]
    # Elements matching any of these specs are removed.
    remove_tags = [
        dict(name=['iframe', 'embed', 'object', 'base', 'meta', 'link']),
        dict(name='a', attrs={'href': '#comentarios'}),
        dict(name='div', attrs={'class': 'foto3'}),
        dict(name='img', attrs={'alt': 'ampliar'}),
    ]
    remove_attributes = ['onload', 'lang', 'width', 'height', 'border']

    # --- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        (u'Ultimo momento', u'http://www.perfil.com/rss/ultimomomento.xml'),
        (u'Politica', u'http://www.perfil.com/rss/politica.xml'),
        (u'Policia', u'http://www.perfil.com/rss/policia.xml'),
        (u'Internacionales', u'http://www.perfil.com/rss/internacional.xml'),
        (u'Economia', u'http://www.perfil.com/rss/economia.xml'),
        (u'Deportes', u'http://www.perfil.com/rss/deportes.xml'),
        (u'Opinion', u'http://www.perfil.com/rss/columnistas.xml'),
        (u'Sociedad', u'http://www.perfil.com/rss/sociedad.xml'),
        (u'Cultura', u'http://www.perfil.com/rss/cultura.xml'),
        (u'Espectaculos', u'http://www.perfil.com/rss/espectaculos.xml'),
        (u'Ciencia', u'http://www.perfil.com/rss/ciencia.xml'),
        (u'Salud', u'http://www.perfil.com/rss/salud.xml'),
        (u'Tecnologia', u'http://www.perfil.com/rss/tecnologia.xml'),
    ]

    def preprocess_html(self, soup):
        """Strip the inline ``style`` attribute from every element that has one.

        The parsed document is modified in place and the same ``soup``
        object is returned.
        """
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        return soup