#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic '

'''
infobae.com
'''

import re
import urllib, urlparse

from calibre.web.feeds.news import BasicNewsRecipe


class Infobae(BasicNewsRecipe):
    """Calibre news-download recipe for infobae.com (Argentinian news portal)."""

    title = 'Infobae.com'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'Informacion Libre las 24 horas'
    publisher = 'Infobae.com'
    category = 'news, politics, Argentina'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    lang = 'es-AR'
    # Site serves Windows-1252 encoded pages.
    encoding = 'cp1252'
    cover_url = 'http://www.infobae.com/imgs/header/header.gif'
    remove_javascript = True

    # NOTE(review): the pattern here is empty, which makes this substitution a
    # no-op — it looks like the original expression was lost; confirm against
    # the upstream recipe before changing it.
    preprocess_regexps = [(re.compile(r''), lambda m: '')]

    html2epub_options = (
        'publisher="' + publisher + '"\ncomments="' + description +
        '"\ntags="' + category + '"\nlinearize_tables=True'
    )

    extra_css = '''
    .col-center{font-family:Arial,Helvetica,sans-serif;}
    h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
    .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
    '''

    # Keep only the main article container; drop sidebars, ad banners,
    # comment anchors, embedded frames and gallery thumbnails.
    keep_only_tags = [dict(name='div', attrs={'class': ['content']})]

    remove_tags = [
        dict(name='div', attrs={'class': ['options', 'col-right', 'controles',
                                          'bannerLibre', 'tiulo-masleidas',
                                          'masleidas-h']}),
        dict(name='a', attrs={'name': 'comentario', }),
        dict(name='iframe'),
        dict(name='img', alt="Ver galerias de imagenes"),
    ]

    feeds = [
        (u'Noticias', u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml'),
        (u'Salud', u'http://www.infobae.com/adjuntos/html/RSS/salud.xml'),
        (u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml'),
        (u'Deportes', u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's link with its URL path percent-quoted.

        Some entries carry non-ASCII characters in the path, which break the
        download step unless quoted.  Returns ``None`` when the entry has no
        link at all, which tells the framework to skip the article (the
        original code crashed with AttributeError in that case).
        """
        link = article.get('link')
        if not link:
            return None
        raw = link.encode('utf-8')
        parts = list(urlparse.urlparse(raw))
        parts[2] = urllib.quote(parts[2])  # quote only the path component
        return urlparse.urlunparse(parts).decode('utf-8')

    def preprocess_html(self, soup):
        """Clean the fetched page before conversion.

        Removes stray <strong> tags from <head>, drops all <meta> tags, and
        strips inline ``style`` attributes so ``extra_css`` takes effect.
        Guards against documents with no <head> (the original code raised
        AttributeError on those).
        """
        if soup.head is not None:
            for tag in soup.head.findAll('strong'):
                tag.extract()
        for tag in soup.findAll('meta'):
            del tag['content']
            tag.extract()
        mtag = '\n\n'
        if soup.head is not None:
            soup.head.insert(0, mtag)
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def postprocess_html(self, soup, first):
        """Rename every <strong> tag to <b> in the converted article."""
        for tag in soup.findAll(name='strong'):
            tag.name = 'b'
        return soup