calibre/recipes/laprensa.recipe

#!/usr/bin/env python
# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2008-2016, Darko Miletic <darko.miletic at gmail.com>'
'''
laprensa.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe


class LaPrensa(BasicNewsRecipe):
    """Recipe that downloads the laprensa.com.ar RSS sections.

    The class only declares configuration attributes (feeds, tags to strip,
    extra CSS) and overrides ``preprocess_html`` to flatten the table based
    page layout into plain ``div`` elements.
    """

    title = 'La Prensa'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'Informacion Libre las 24 horas'
    publisher = 'La Prensa'
    category = 'news, politics, Argentina'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    remove_javascript = True
    language = 'es_AR'
    lang = 'es'

    html2lrf_options = [
        '--comment', description, '--category', category, '--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + \
        '"\ncomments="' + description + '"\ntags="' + category + '"'
    filter_regexps = [r'.*archive.aspx.*']

    # Each entry is a tag matcher; matching elements are dropped from the
    # downloaded article (navigation tabs, login/search widgets, icons...).
    remove_tags = [
        dict(name='td', attrs={'class': ["link-registro", "link-buscador"]}),
        dict(name='td', attrs={
             'id': ["TDTabItem1", "TDTabItem2", "TDTabItem3", "TDTabItem4"]}),
        dict(name='table', attrs={'class': ["marco-botonera"]}),
        dict(name='tr', attrs={'class': ["messages", "IUTabItemSelected"]}),
        dict(name='input', attrs={'id': "txt_allfields"}),
        dict(name='div', attrs={
             'id': ["TabItem1", "TabItem2", "TabItem3", "TabItem4", "RCPanel"]}),
        dict(name='span', attrs={'id': ["GWCNavigatorControl", "_ctl15"]}),
        dict(name='span', attrs={'class': ["ranking-titulo", "IUTab"]}),
        dict(name='a', attrs={'class': ["link-registro", ]}),
        dict(name='img', src="/versions/1/imgs/icono-comentario.gif"),
        dict(name='img', src="/versions/1/imgs/logo.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/icono-recomendar.gif"),
        dict(name='button'),
        dict(name='img', src="/versions/1/imgs/boton-votar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar.gif"),
        dict(name='img', src="/versions/1/imgs/icono-imprimir.gif"),
        dict(name='img', src="/versions/1/imgs/icono-ampliar-letra.gif"),
        dict(name='img', src="/versions/1/imgs/icono-reducir-letra.gif"),
        dict(name='img', src="/versions/1/imgs/pix-trans.gif"),
        dict(name='img', src="/versions/1/imgs/icono-buscador.gif"),
        dict(name='img', src="/versions/1/imgs/separador-linea-azul.gif"),
        dict(name='img', src="/versions/1/imgs/separador-linea.gif"),
        # The historical matcher carried a leading space in the path; it is
        # kept in case the site markup really contains it.
        dict(name='img', src=" /versions/1/imgs/separador-linea.gif"),
        dict(name='a', text="Powered by Civinext Groupware - V. 2.0.3567.23706"),
        dict(name='img', height="0")
    ]

    extra_css = '''
                    .seccion{font-size:xx-small;}
                    body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                    .titulo-noticia-principal{font-size:large; color:#00427B; font-weight:bold;}
                    .texto-subtitulos{font-weight:bold;font-size:x-small;}
                    .fecha{font-size:xx-small;}
                    .volanta{font-size:xx-small;}
                '''

    # (section title, RSS URL) pairs. Every URL passes the section number as
    # a separate ``Rss`` query parameter, joined with ``&``.
    feeds = [
        (u'Politica', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4'),
        (u'Economia', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5'),
        (u'Opinion', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=6'),
        (u'El Mundo', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=7'),
        (u'Actualidad', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=8'),
        (u'Deportes', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=9'),
        (u'Espectaculos', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=10')
    ]

    def preprocess_html(self, soup):
        """Flatten the table layout and strip presentational attributes.

        :param soup: parsed article document; it is modified in place.
        :return: the same ``soup`` object.
        """
        # Turn every layout element into a plain block container.
        for layout_tag in soup.findAll(['table', 'td', 'tr', 'span', 'tbody']):
            layout_tag.name = 'div'
        for rule in soup.findAll(['hr']):
            rule.extract()

        # Declare the content language. Documents without a <head> are left
        # without the declaration instead of raising AttributeError.
        # NOTE(review): the tag is inserted as a raw markup string; confirm
        # the soup implementation in use emits it as markup, not as text.
        mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
        if soup.head is not None:
            soup.head.insert(0, mtag)

        # Drop inline presentation so that extra_css controls the rendering.
        for styled in soup.findAll(style=True):
            del styled['style']
        for centered in soup.findAll(align="center"):
            del centered['align']
        for coloured in soup.findAll(bgcolor="ffffff"):
            del coloured['bgcolor']
        return soup