diff --git a/resources/recipes/laprensa.recipe b/resources/recipes/laprensa.recipe index 3079b4906a..7f9ab16f79 100644 --- a/resources/recipes/laprensa.recipe +++ b/resources/recipes/laprensa.recipe @@ -11,7 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class LaPrensa(BasicNewsRecipe): title = 'La Prensa' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic and Sujata Raman' description = 'Informacion Libre las 24 horas' publisher = 'La Prensa' category = 'news, politics, Argentina' @@ -20,9 +20,11 @@ class LaPrensa(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False encoding = 'cp1252' - cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif' + # cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif' remove_javascript = True - + language = 'es' + lang = 'es' + html2lrf_options = [ '--comment', description , '--category', category @@ -30,31 +32,72 @@ class LaPrensa(BasicNewsRecipe): ] html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - + filter_regexps = [r'.*archive.aspx.*'] + + remove_tags = [ + dict(name='td', attrs={'class':["link-registro","link-buscador"]}), + dict(name='td', attrs={'id':["TDTabItem1","TDTabItem2","TDTabItem3","TDTabItem4"]}), + dict(name='table', attrs={'class':["marco-botonera"]}), + dict(name='tr', attrs={'class':["messages","IUTabItemSelected"]}), + dict(name='input', attrs={'id':"txt_allfields"}), + dict(name='div', attrs={'id':["TabItem1","TabItem2","TabItem3","TabItem4","RCPanel"]}), + dict(name='span', attrs={'id':["GWCNavigatorControl","_ctl15"]}), + dict(name='span', attrs={'class':["ranking-titulo","IUTab"]}), + dict(name='a', attrs={'class':["link-registro",]}), + dict(name='img', src = "/versions/1/imgs/icono-comentario.gif"), + dict(name='img', src = "/versions/1/imgs/logo.gif"), + dict(name='img', src = "/versions/1/imgs/boton-ingresar-roll.gif"), + dict(name='img', src = "/versions/1/imgs/icono-recomendar.gif"), + dict(name='button'), + dict(name='img', src = "/versions/1/imgs/boton-votar-roll.gif"), + dict(name='img', src = "/versions/1/imgs/boton-ingresar.gif"), + dict(name='img', src = "/versions/1/imgs/icono-imprimir.gif"), + dict(name='img', src = "/versions/1/imgs/icono-ampliar-letra.gif"), + dict(name='img', src = "/versions/1/imgs/icono-reducir-letra.gif"), + dict(name='img', src = "/versions/1/imgs/pix-trans.gif"), + dict(name='img', src = "/versions/1/imgs/icono-buscador.gif"), + dict(name='img', src = "/versions/1/imgs/separador-linea-azul.gif"), + dict(name='img', src = " /versions/1/imgs/separador-linea.gif"), + dict(name='a',text ="Powered by Civinext Groupware - V. 2.0.3567.23706"), + dict(name='img', height ="0") + ] + + extra_css = ''' + .seccion{font-size:xx-small;} + body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} + .titulo-noticia-principal{font-size:large; color:#00427B; font-weight:bold;} + .texto-subtitulos{font-weight:bold;font-size:x-small;} + .fecha{font-size:xx-small;} + .volanta{font-size:xx-small;} + ''' + feeds = [ - (u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' ) - ,(u'Economia' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=5' ) - ,(u'Opinion' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=6' ) - ,(u'El Mundo' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=7' ) - ,(u'Actualidad' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=8' ) - ,(u'Deportes' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=9' ) - ,(u'Espectaculos', u'http://www.laprensa.com.ar/Rss.aspx?Rss=10') + (u'Politica' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4' ) + ,(u'Economia' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5' ) + ,(u'Opinion' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=6' ) + ,(u'El Mundo' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=7' ) + ,(u'Actualidad' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=8' ) + ,(u'Deportes' , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=9' ) + ,(u'Espectaculos', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=10') ] - def print_version(self, url): - return url.replace('.note.aspx','.NotePrint.note.aspx') - - def get_article_url(self, article): - raw = article.get('link', None).encode('utf8') - final = urllib.quote(raw,':/') - return final - + def preprocess_html(self, soup): - del soup.body['onload'] + + for t in soup.findAll(['table','td','tr','span','tbody']): + t.name = 'div' + for t in soup.findAll(['hr']): + t.extract() + mtag = '' soup.head.insert(0,mtag) for item in soup.findAll(style=True): del item['style'] + for item in soup.findAll(align = "center"): + del item['align'] + for item in soup.findAll(bgcolor="ffffff"): + del item['bgcolor'] return soup - - language = 'es' + + +