#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic '
'''
laprensa.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe


class LaPrensa(BasicNewsRecipe):
    """Calibre news recipe for the Argentine newspaper La Prensa.

    The class body declares the recipe settings (metadata, feeds, tags to
    strip, extra CSS); ``preprocess_html`` then flattens the table-based
    page layout and removes presentational attributes.
    """

    # --- Metadata -----------------------------------------------------------
    title = 'La Prensa'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'Informacion Libre las 24 horas'
    publisher = 'La Prensa'
    category = 'news, politics, Argentina'

    # --- Fetch / conversion settings -----------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'
    # cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
    remove_javascript = True
    language = 'es_AR'
    # NOTE(review): ``lang`` is not read anywhere in the visible code.
    lang = 'es'

    # Output options assembled from the metadata fields above.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    # Links matching these patterns are excluded.
    filter_regexps = [r'.*archive.aspx.*']

    # Site chrome (tabs, login/search boxes, toolbar icons, footer link) that
    # is stripped from every page.
    remove_tags = [
        dict(name='td', attrs={'class': ["link-registro", "link-buscador"]}),
        dict(name='td', attrs={'id': ["TDTabItem1", "TDTabItem2", "TDTabItem3", "TDTabItem4"]}),
        dict(name='table', attrs={'class': ["marco-botonera"]}),
        dict(name='tr', attrs={'class': ["messages", "IUTabItemSelected"]}),
        dict(name='input', attrs={'id': "txt_allfields"}),
        dict(name='div', attrs={'id': ["TabItem1", "TabItem2", "TabItem3", "TabItem4", "RCPanel"]}),
        dict(name='span', attrs={'id': ["GWCNavigatorControl", "_ctl15"]}),
        dict(name='span', attrs={'class': ["ranking-titulo", "IUTab"]}),
        dict(name='a', attrs={'class': ["link-registro", ]}),
        dict(name='img', src="/versions/1/imgs/icono-comentario.gif"),
        dict(name='img', src="/versions/1/imgs/logo.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/icono-recomendar.gif"),
        dict(name='button'),
        dict(name='img', src="/versions/1/imgs/boton-votar-roll.gif"),
        dict(name='img', src="/versions/1/imgs/boton-ingresar.gif"),
        dict(name='img', src="/versions/1/imgs/icono-imprimir.gif"),
        dict(name='img', src="/versions/1/imgs/icono-ampliar-letra.gif"),
        dict(name='img', src="/versions/1/imgs/icono-reducir-letra.gif"),
        dict(name='img', src="/versions/1/imgs/pix-trans.gif"),
        dict(name='img', src="/versions/1/imgs/icono-buscador.gif"),
        dict(name='img', src="/versions/1/imgs/separador-linea-azul.gif"),
        # NOTE(review): this src value starts with a space, unlike its
        # siblings. Kept verbatim -- confirm whether the page markup really
        # contains the leading space or this is a typo.
        dict(name='img', src=" /versions/1/imgs/separador-linea.gif"),
        dict(name='a', text="Powered by Civinext Groupware - V. 2.0.3567.23706"),
        dict(name='img', height="0"),
    ]

    extra_css = '''
        .seccion{font-size:xx-small;}
        body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        .titulo-noticia-principal{font-size:large; color:#00427B; font-weight:bold;}
        .texto-subtitulos{font-weight:bold;font-size:x-small;}
        .fecha{font-size:xx-small;}
        .volanta{font-size:xx-small;}
    '''

    # NOTE(review): the first two URLs pass the feed id as '&Rss=N' while the
    # remaining five use '?Rss=N'. Kept verbatim -- confirm which form the
    # server actually accepts and make the list consistent.
    feeds = [
        (u'Politica', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4'),
        (u'Economia', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5'),
        (u'Opinion', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=6'),
        (u'El Mundo', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=7'),
        (u'Actualidad', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=8'),
        (u'Deportes', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=9'),
        (u'Espectaculos', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=10'),
    ]

    def preprocess_html(self, soup):
        """Flatten the table layout and drop presentational attributes.

        Renames every table/td/tr/span/tbody element to ``div``, removes all
        ``hr`` elements, and deletes ``style`` attributes, ``align="center"``
        and ``bgcolor="ffffff"`` wherever they occur.

        :param soup: parsed page; modified in place.
        :return: the same ``soup`` object.
        """
        for layout_tag in soup.findAll(['table', 'td', 'tr', 'span', 'tbody']):
            layout_tag.name = 'div'

        for rule in soup.findAll(['hr']):
            rule.extract()

        # NOTE(review): mtag is an empty string in this source, so the insert
        # adds no visible markup; a meta tag may originally have been here.
        mtag = ''
        head = soup.head
        # A page without a <head> element yields None here; guard so such a
        # page is still processed instead of raising AttributeError.
        if head is not None:
            head.insert(0, mtag)

        # (attribute name, value to match); True matches any value.
        presentational = (
            ('style', True),
            ('align', "center"),
            ('bgcolor', "ffffff"),
        )
        for attr_name, wanted in presentational:
            for item in soup.findAll(attrs={attr_name: wanted}):
                del item[attr_name]

        return soup