#!/usr/bin/env python
# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2008-2016, Darko Miletic '
'''
laprensa.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe


class LaPrensa(BasicNewsRecipe):
    """Recipe that downloads the RSS sections of laprensa.com.ar.

    The class only configures ``BasicNewsRecipe`` through class attributes
    (metadata, tag filters, stylesheet, feed list) and overrides
    ``preprocess_html`` to flatten the table-based page layout.
    """

    # --- Publication metadata -------------------------------------------
    title = 'La Prensa'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'Informacion Libre las 24 horas'
    publisher = 'La Prensa'
    category = 'news, politics, Argentina'
    language = 'es_AR'
    lang = 'es'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    remove_javascript = True

    # --- Conversion options built from the metadata above ---------------
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    # Links matching these patterns are not followed.
    filter_regexps = [r'.*archive.aspx.*']

    # Page elements stripped from every downloaded article: registration and
    # search links, tab widgets, toolbars, and decorative/site-chrome images.
    remove_tags = [
        dict(name='td', attrs={'class': ["link-registro", "link-buscador"]}),
        dict(name='td', attrs={
            'id': ["TDTabItem1", "TDTabItem2", "TDTabItem3", "TDTabItem4"]}),
        dict(name='table', attrs={'class': ["marco-botonera"]}),
        dict(name='tr', attrs={'class': ["messages", "IUTabItemSelected"]}),
        dict(name='input', attrs={'id': "txt_allfields"}),
        dict(name='div', attrs={
            'id': ["TabItem1", "TabItem2", "TabItem3", "TabItem4", "RCPanel"]}),
        dict(name='span', attrs={'id': ["GWCNavigatorControl", "_ctl15"]}),
        dict(name='span', attrs={'class': ["ranking-titulo", "IUTab"]}),
        dict(name='a', attrs={'class': ["link-registro", ]}),
        dict(name='img', src="/versions/1/imgs/icono-comentario.gif"),
        dict(name='img', src="/versions/1/imgs/logo.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/icono-recomendar.gif"),
        dict(name='button'),
        dict(name='img', src="/versions/1/imgs/boton-votar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar.gif"),
        dict(name='img', src="/versions/1/imgs/icono-imprimir.gif"),
        dict(name='img', src="/versions/1/imgs/icono-ampliar-letra.gif"),
        dict(name='img', src="/versions/1/imgs/icono-reducir-letra.gif"),
        dict(name='img', src="/versions/1/imgs/pix-trans.gif"),
        dict(name='img', src="/versions/1/imgs/icono-buscador.gif"),
        dict(name='img', src="/versions/1/imgs/separador-linea-azul.gif"),
        # NOTE(review): the leading space in this src is kept exactly as
        # found; confirm whether it mirrors the site's markup or is a typo.
        dict(name='img', src=" /versions/1/imgs/separador-linea.gif"),
        dict(name='a', text="Powered by Civinext Groupware - V. 2.0.3567.23706"),
        dict(name='img', height="0"),
    ]

    extra_css = '''
        .seccion{font-size:xx-small;}
        body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        .titulo-noticia-principal{font-size:large; color:#00427B; font-weight:bold;}
        .texto-subtitulos{font-weight:bold;font-size:x-small;}
        .fecha{font-size:xx-small;}
        .volanta{font-size:xx-small;}
        '''

    # Every feed goes through ResourcesManager.aspx with two query
    # parameters: Resource=Rss.aspx and Rss=<section id>. The parameters must
    # be joined with '&'; a second '?' would make "Rss=<id>" part of the
    # Resource value instead of a parameter of its own.
    feeds = [
        (u'Politica', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4'),
        (u'Economia', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5'),
        (u'Opinion', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=6'),
        (u'El Mundo', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=7'),
        (u'Actualidad', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=8'),
        (u'Deportes', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=9'),
        (u'Espectaculos', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=10'),
    ]

    def preprocess_html(self, soup):
        """Flatten the table layout and drop presentational attributes.

        :param soup: parsed article document; it is modified in place.
        :return: the same ``soup`` object after modification.
        """
        # Turn table scaffolding and spans into plain divs so the article
        # is not rendered as a table layout.
        for tag in soup.findAll(['table', 'td', 'tr', 'span', 'tbody']):
            tag.name = 'div'

        # Horizontal rules are removed entirely.
        for rule in soup.findAll(['hr']):
            rule.extract()

        # NOTE(review): mtag is an empty string in this revision, so the
        # insertion adds no markup; presumably it once carried a <meta> tag.
        mtag = ''
        # A document without <head> yields None here; skip instead of
        # raising AttributeError.
        if soup.head is not None:
            soup.head.insert(0, mtag)

        # Strip inline presentation so extra_css controls the appearance.
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(align="center"):
            del item['align']
        for item in soup.findAll(bgcolor="ffffff"):
            del item['bgcolor']
        return soup