From 14156737ce0ca7cf44da1b8827db9d7dc9a4c823 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 11 Nov 2009 10:40:40 -0700 Subject: [PATCH] Updated recipes for Critica Digital and Infobae --- resources/recipes/criticadigital.recipe | 29 +++++---- resources/recipes/infobae.recipe | 81 ++++++++++++++++++------- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/resources/recipes/criticadigital.recipe b/resources/recipes/criticadigital.recipe index e1e5030a00..d1ef97aef9 100644 --- a/resources/recipes/criticadigital.recipe +++ b/resources/recipes/criticadigital.recipe @@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe class CriticaDigital(BasicNewsRecipe): title = 'Critica de la Argentina' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic and Sujata Raman' description = 'Noticias de Argentina' oldest_article = 2 max_articles_per_feed = 100 @@ -20,17 +20,22 @@ class CriticaDigital(BasicNewsRecipe): use_embedded_content = False encoding = 'cp1252' - html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , title - ] - + extra_css = ''' + h1{font-family:"Trebuchet MS";} + h3{color:#9A0000; font-family:Tahoma; font-size:x-small;} + h2{color:#504E53; font-family:Arial,Helvetica,sans-serif ;font-size:small;} + #epigrafe{font-family:Arial,Helvetica,sans-serif ;color:#666666 ; font-size:x-small;} + p {font-family:Arial,Helvetica,sans-serif;} + #fecha{color:#858585; font-family:Tahoma; font-size:x-small;} + #autor{color:#858585; font-family:Tahoma; font-size:x-small;} + #hora{color:#F00000;font-family:Tahoma; font-size:x-small;} + ''' keep_only_tags = [ - dict(name='div', attrs={'class':'bloqueTitulosNoticia'}) - ,dict(name='div', attrs={'id':'c453-1' }) + dict(name='div', attrs={'class':['bloqueTitulosNoticia','cfotonota']}) + ,dict(name='div', attrs={'id':'boxautor'}) + ,dict(name='p', attrs={'id':'textoNota'}) ] - + remove_tags = [ dict(name='div', attrs={'class':'box300' }) ,dict(name='div', style=True ) @@ -38,7 +43,7 @@ class CriticaDigital(BasicNewsRecipe): ,dict(name='div', attrs={'class':'comentario' }) ,dict(name='div', attrs={'class':'paginador' }) ] - + feeds = [ (u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' ) ,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' ) @@ -60,3 +65,5 @@ class CriticaDigital(BasicNewsRecipe): if link_item: cover_url = index + link_item.img['src'] return cover_url + + diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe index 78d00677b6..79937ce4f7 100644 --- a/resources/recipes/infobae.recipe +++ b/resources/recipes/infobae.recipe @@ -5,55 +5,90 @@ __copyright__ = '2008-2009, Darko Miletic ' ''' infobae.com ''' - +import re from calibre.web.feeds.news import BasicNewsRecipe class Infobae(BasicNewsRecipe): title = 'Infobae.com' - __author__ = 'Darko Miletic' + __author__ = 'Darko Miletic and Sujata Raman' description = 'Informacion Libre las 24 horas' publisher = 'Infobae.com' - category = 'news, politics, Argentina' + category = 'news, politics, Argentina' oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False language = 'es' + lang = 'es-AR' encoding = 'cp1252' cover_url = 'http://www.infobae.com/imgs/header/header.gif' remove_javascript = True - - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher', publisher - , '--ignore-tables' - , '--ignore-colors' - ] - + preprocess_regexps = [(re.compile( + r''), lambda m:'')] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' + extra_css = ''' + .col-center{font-family:Arial,Helvetica,sans-serif;} + h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;} + .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;} + ''' + + keep_only_tags = [dict(name='div', attrs={'class':['content']})] + + remove_tags = [ - dict(name=['embed','link','object']) - ,dict(name='a', attrs={'onclick':'javascript:window.print()'}) - ] - - feeds = [ + dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}), + dict(name='a', attrs={'name' : 'comentario',}), + dict(name='iframe'), + dict(name='img', alt = "Ver galerias de imagenes"), + + ] + + + feeds = [ (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) ,(u'Salud' , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml' ) ,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml') ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' ) ] - def print_version(self, url): - main, sep, article_part = url.partition('contenidos/') - article_id, rsep, rrest = article_part.partition('-') - return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id +# def print_version(self, url): +# main, sep, article_part = url.partition('contenidos/') +# article_id, rsep, rrest = article_part.partition('-') +# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id + + def get_article_url(self, article): + import urllib, urlparse + parts = list(urlparse.urlparse(article.get('link'))) + parts[2] = urllib.quote(parts[2]) + ans = urlparse.urlunparse(parts) + return ans + def preprocess_html(self, soup): - mtag = '\n\n' - soup.head.insert(0,mtag) + + for tag in soup.head.findAll('strong'): + tag.extract() + for tag in soup.findAll('meta'): + del tag['content'] + tag.extract() + + mtag = '\n\n' + soup.head.insert(0,mtag) for item in soup.findAll(style=True): del item['style'] + return soup + + def postprocess_html(self, soup, first): + + for tag in soup.findAll(name='strong'): + tag.name = 'b' + + return soup + + +