diff --git a/resources/recipes/el_universal.recipe b/resources/recipes/el_universal.recipe index 39e23f7f2c..78c6166c86 100644 --- a/resources/recipes/el_universal.recipe +++ b/resources/recipes/el_universal.recipe @@ -10,57 +10,116 @@ from calibre.web.feeds.news import BasicNewsRecipe class ElUniversal(BasicNewsRecipe): title = 'El Universal' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic and Sujata Raman' description = 'News from Mexico' oldest_article = 1 max_articles_per_feed = 100 publisher = 'El Universal' - category = 'news, politics, Mexico' + category = 'news, politics, Mexico' no_stylesheets = True use_embedded_content = False encoding = 'cp1252' remove_javascript = True language = 'es' - - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} + .geoGris30{font-family:Georgia,"Times New Roman",Times,serif; font-size:large; color:#003366; font-weight:bold;} + .arnegro16{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;} + .tbazull2{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color:#336699; font-size:xx-small;} + .tbgrisf11{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #666666; font-size:xx-small;} + .verrojo13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #CC0033; font-size:xx-small;} + .trnegro13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;} + .txt-fotogaleria{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;} + ''' + keep_only_tags = [ dict(name='table', attrs={'width':"633"}),dict(name='table', attrs={'width':"629"}),] - remove_tags = [dict(name='link')] - - feeds = [ + remove_tags = [ + dict(name='table', 
attrs={'bgcolor':"#f5f5f5"}), + dict(name='td', attrs={'bgcolor':"#f7f8f9"}), + dict(name='td', attrs={'bgcolor':"#f5f5f5"}), + dict(name='table', attrs={'width':"302"}), + dict(name='table', attrs={'width':"214"}), + dict(name='table', attrs={'width':"112"}), + dict(name='table', attrs={'width':"980"}), + dict(name='td', attrs={'height':"1"}), + dict(name='td', attrs={'height':"4"}), + dict(name='td', attrs={'height':"20"}), + dict(name='td', attrs={'height':"10"}), + dict(name='td', attrs={'class':["trrojo11","trbris11","trrojo12","arrojo12s","tbazul13"]}), + dict(name='div', attrs={'id':["mapg","ver_off_todosloscom","todosloscom"]}), + dict(name='span', attrs={'class':["trazul18b","trrojo11","trnaranja11","trbris11","georojo18b","geogris18"]}), + dict(name='span', attrs={'class':["detalles-opinion"]}), + dict(name='a', attrs={'class':["arnaranja12b","trbris11","arazul12rel","trrojo10"]}), + dict(name='img', src = "/img/icono_imprimir.gif"), + dict(name='img', src = "/img/icono_enviar_mail.gif"), + dict(name='img', src = "/img/icono_fuente_g.gif"), + dict(name='img', src = "/img/icono_fuente_m.gif"), + dict(name='img', src = "/img/icono_fuente_c.gif"), + dict(name='img', src = "/img/icono_compartir.gif"), + dict(name='img', src = "/img/icono_enviar_coment.gif"), + dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-notasrel.gif"), + dict(name='img', src = "http://www.eluniversal.com.mx/n_img/fr.gif"), + dict(name='img', src = "/img/espiral2.gif"), + dict(name='img', src = "http://www.eluniversal.com.mx/n_img/b"), + dict(name='img', src = "/img/icono_enviar_coment.gifot-notasrel.gif"), + dict(name='img', src = "/n_img/icono_tipo3.gif"), + dict(name='img', src = "/n_img/icono_tipo2.gif"), + dict(name='img', src = "/n_img/icono_print.gif"), + dict(name='img', src = "/n_img/icono_mail2.gif"), + dict(name='img', src = "/n_img/im-comentarios-2a.gif"), + dict(name='img', src = "/n_img/im-comentarios-1a.gif"), + dict(name='img', src = 
"/img/icono_coment.gif"), + dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-sitiosrel.gif"), + dict(name='img', src = "/n_img/icono_tipomenos.gif"), + dict(name='img', src = "/img/futbol/19.jpg"), + dict(name='img', alt = "Facebook"), + dict(name='img', alt = "Twitter"), + dict(name='img', alt = "Google"), + dict(name='img', alt = "LinkedIn"), + dict(name='img', alt = "Viadeo"), + dict(name='img', alt = "Digg"), + dict(name='img', alt = "Delicious"), + dict(name='img', alt = "Meneame"), + dict(name='img', alt = "Yahoo"), + dict(name='img', alt = "Technorati"), + dict(name='a',text =["Compartir","Facebook","Twitter","Google","LinkedIn","Viadeo","Digg","Delicious","Meneame","Yahoo","Technorati"]), + dict(name='select'), + dict(name='a', attrs={'class':"tbgriscompartir"}), + ] + + feeds = [ (u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' ) ,(u'Mundo' , u'http://www.eluniversal.com.mx/rss/mundo.xml' ) ,(u'Mexico' , u'http://www.eluniversal.com.mx/rss/mexico.xml' ) ,(u'Estados' , u'http://www.eluniversal.com.mx/rss/estados.xml' ) ,(u'Finanzas' , u'http://www.eluniversal.com.mx/rss/finanzas.xml' ) - ,(u'Deportes' , u'http://www.eluniversal.com.mx/rss/deportes.xml' ) + ,(u'Deportes' , u'http://www.eluniversal.com.mx/rss/deportes.xml' ) ,(u'Espectaculos' , u'http://www.eluniversal.com.mx/rss/espectaculos.xml' ) ,(u'Cultura' , u'http://www.eluniversal.com.mx/rss/cultura.xml' ) ,(u'Ciencia' , u'http://www.eluniversal.com.mx/rss/ciencia.xml' ) ,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' ) ,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' ) ] - - def print_version(self, url): - return url.replace('/notas/','/notas/vi_') + + # def print_version(self, url): + # return url.replace('/notas/','/notas/vi_') def preprocess_html(self, soup): - mtag = '' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll(font=True): - del item['font'] - for 
item in soup.findAll(face=True): - del item['face'] - for item in soup.findAll(helvetica=True): - del item['helvetica'] - return soup - + mtag = '' + soup.head.insert(0,mtag) + for tag in soup.findAll(name='td',attrs={'class': 'arazul50'}): + tag.insert(0,"\n\n") + tag.insert(2,"\n\n") + + return soup + + def postprocess_html(self, soup,first): + + for tag in soup.findAll(name=['table', 'span','i']): + tag.name = 'div' + for item in soup.findAll(align = "right"): + del item['align'] + + return soup +