#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# calibre recipe: resources/recipes/folhadesaopaulo.recipe
# Reconstructed from patch f9e79e10ddd070fe2c271a1f70dd09ffabcd5500
# (From: Kovid Goyal, Wed, 4 Aug 2010 22:19:59 -0600,
# Subject: "Folha de Sao Paulo by Saverio Palmieri Neto").

__license__ = 'GPL v3'
__copyright__ = '2010, Saverio Palmieri Neto '
'''
folha.uol.com.br
'''

from calibre.web.feeds.news import BasicNewsRecipe


class FolhaOnline(BasicNewsRecipe):
    """News recipe for Folha de Sao Paulo Online (folha.uol.com.br).

    Articles are discovered through the per-section RSS 0.91 feeds listed in
    ``feeds``.  From each article page only the ``div#articleNew`` container
    is kept; buttons, ads, comments, links, iframes and scripts inside it are
    removed, and inline ``style`` attributes are stripped.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Folha de Sao Paulo'
    __author__ = 'Saverio Palmieri Neto'
    description = 'Brazilian news from Folha de Sao Paulo Online'
    publisher = 'Folha de Sao Paulo'
    category = 'Brasil, news'
    language = 'pt'

    # --- Download limits ------------------------------------------------
    oldest_article = 2            # in days, per the BasicNewsRecipe API
    max_articles_per_feed = 1000  # effectively "no per-feed cap"
    summary_length = 2048

    # --- Fetching / rendering options -----------------------------------
    no_stylesheets = True
    use_embedded_content = False  # feeds carry links only; fetch the pages
    timefmt = ' [%d %b %Y (%a)]'
    encoding = 'cp1252'
    remove_javascript = True

    # --- Cover ----------------------------------------------------------
    cover_url = 'http://lh5.ggpht.com/_hEb7sFmuBvk/TFoiKLRS5dI/AAAAAAAAADM/kcVKggZwKnw/capa_folha.jpg'
    cover_margins = (5, 5, 'white')

    # --- Article clean-up -----------------------------------------------
    # Keep only the main article container ...
    keep_only_tags = [dict(name='div', attrs={'id': 'articleNew'})]

    # ... and drop the page furniture that lives inside it.
    remove_tags = [
        dict(name='script'),
        dict(
            name='div',
            attrs={'id': [
                'articleButton',
                'bookmarklets',
                'ad-180x150-1',
                'contextualAdsArticle',
                'articleEnd',
                'articleComments',
            ]},
        ),
        dict(
            name='div',
            attrs={'class': [
                'openBox adslibraryArticle',
            ]},
        ),
        dict(name='a'),
        dict(name='iframe'),
        dict(name='link'),
    ]

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml'),
        (u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml'),
        (u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml'),
        (u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml'),
        (u'Ciencia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml'),
        (u'Cotidiano', u'http://feeds.folha.uol.com.br/cotidiano/rss091.xml'),
        (u'Saber', u'http://feeds.folha.uol.com.br/saber/rss091.xml'),
        (u'Equilíbrio e Saúde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml'),
        (u'Esporte', u'http://feeds.folha.uol.com.br/esporte/rss091.xml'),
        (u'Ilustrada', u'http://feeds.folha.uol.com.br/ilustrada/rss091.xml'),
        (u'Ilustríssima', u'http://feeds.folha.uol.com.br/ilustrissima/rss091.xml'),
        (u'Mercado', u'http://feeds.folha.uol.com.br/mercado/rss091.xml'),
        (u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml'),
        (u'Tec', u'http://feeds.folha.uol.com.br/tec/rss091.xml'),
        (u'Turismo', u'http://feeds.folha.uol.com.br/turismo/rss091.xml'),
    ]

    def preprocess_html(self, soup):
        """Remove the inline ``style`` attribute from every tag that has one.

        :param soup: parsed article page; must provide ``findAll``.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(style=True):
            del item['style']
        return soup