diff --git a/recipes/estadao.recipe b/recipes/estadao.recipe index 4e520c1135..86ab572398 100644 --- a/recipes/estadao.recipe +++ b/recipes/estadao.recipe @@ -1,63 +1,134 @@ #!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2010, elsuave' -''' -estadao.com.br -''' - from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime, timedelta +from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup +from calibre.utils.magick import Image, PixelWand +from urllib2 import Request, urlopen, URLError class Estadao(BasicNewsRecipe): - title = 'O Estado de S. Paulo' - __author__ = 'elsuave (modified from Darko Miletic)' - description = 'News from Brasil in Portuguese' - publisher = 'O Estado de S. Paulo' - category = 'news, politics, Brasil' - oldest_article = 2 - max_articles_per_feed = 25 + THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/ + LANGUAGE = 'pt_br' + language = 'pt' + LANGHTM = 'pt-br' + ENCODING = 'utf' + ENCHTM = 'utf-8' + directionhtm = 'ltr' + requires_version = (0,8,47) + news = True + publication_type = 'newsportal' + + title = u'Estadao' + __author__ = 'Euler Alves' + description = u'Brazilian news from Estad\xe3o' + publisher = u'Estad\xe3o' + category = 'news, rss' + + oldest_article = 4 + max_articles_per_feed = 100 + summary_length = 1000 + + remove_javascript = True no_stylesheets = True use_embedded_content = False - encoding = 'utf8' - cover_url = 'http://www.estadao.com.br/img/logo_estadao.png' - remove_javascript = True + remove_empty_feeds = True + timefmt = ' [%d %b %Y (%a)]' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] + html2lrf_options = [ + '--comment', description + ,'--category', category + ,'--publisher', publisher + ] - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - keep_only_tags = [ - dict(name='div', attrs={'class':['bb-md-noticia','c5']}) - ] + hoje = datetime.now()-timedelta(days=2) + pubdate = hoje.strftime('%a, %d %b') + if hoje.hour<10: + hoje = hoje-timedelta(days=1) + CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg' + SCREENSHOT = 'http://estadao.com.br/' + cover_margins = (0,0,'white') + masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png' + keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})] remove_tags = [ - dict(name=['script','object','form','ul']) - ,dict(name='div', attrs={'class':['fnt2 Color_04 bold','right fnt2 innerTop15 dvTmFont','™_01 right outerLeft15','tituloBox','tags']}) - ,dict(name='div', attrs={'id':['bb-md-noticia-subcom']}) - ] + dict(name='div', + attrs={'id':[ + 'bb-md-noticia-tabs' + ]}) + ,dict(name='div', + attrs={'class':[ + 'tags' + ,'discussion' + ,'bb-gg adsense_container' + ]}) + + ,dict(name='a') + ,dict(name='iframe') + ,dict(name='link') + ,dict(name='script') + ] feeds = [ - (u'Manchetes Estadao', u'http://www.estadao.com.br/rss/manchetes.xml') - ,(u'Ultimas noticias', u'http://www.estadao.com.br/rss/ultimas.xml') - ,(u'Nacional', u'http://www.estadao.com.br/rss/nacional.xml') - ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml') - ,(u'Cidades', u'http://www.estadao.com.br/rss/cidades.xml') - ,(u'Esportes', u'http://www.estadao.com.br/rss/esportes.xml') - ,(u'Arte & Lazer', u'http://www.estadao.com.br/rss/arteelazer.xml') - ,(u'Economia', u'http://www.estadao.com.br/rss/economia.xml') - ,(u'Vida &', u'http://www.estadao.com.br/rss/vidae.xml') - ] + (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml') + ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml') + ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml') + ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml') + ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/') + ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml') + ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml') + ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml') + ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml') + ] + conversion_options = { + 'title' : title + ,'comments' : description + ,'publisher' : publisher + ,'tags' : category + ,'language' : LANGUAGE + ,'linearize_tables': True + } + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + if not soup.find(attrs={'http-equiv':'Content-Language'}): + meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) + soup.head.insert(0,meta0) + if not soup.find(attrs={'http-equiv':'Content-Type'}): + meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) + soup.head.insert(0,meta1) + return soup - language = 'pt' - - def get_article_url(self, article): - url = BasicNewsRecipe.get_article_url(self, article) - if '/Multimidia/' not in url: - return url + def postprocess_html(self, soup, first): + #process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + pw = PixelWand() + if( width > height and width > 590) : + print 'Rotate image' + img.rotate(pw, -90) + img.save(iurl) + return soup + def get_cover_url(self): + cover_url = self.CAPA + pedido = Request(self.CAPA) + pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') + pedido.add_header('Accept-Charset',self.ENCHTM) + pedido.add_header('Referer',self.SCREENSHOT) + try: + resposta = urlopen(pedido) + soup = BeautifulSoup(resposta) + cover_item = soup.find('body') + if cover_item: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url + except URLError: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe index 262a265020..40898672e6 100644 --- a/recipes/folhadesaopaulo.recipe +++ b/recipes/folhadesaopaulo.recipe @@ -1,74 +1,149 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2010, Saverio Palmieri Neto ' -''' -folha.uol.com.br -''' - from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime, timedelta +from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup +from calibre.utils.magick import Image, PixelWand +from urllib2 import Request, urlopen, URLError class FolhaOnline(BasicNewsRecipe): - title = 'Folha de Sao Paulo' - __author__ = 'Saverio Palmieri Neto' - description = 'Brazilian news from Folha de Sao Paulo Online' - publisher = 'Folha de Sao Paulo' - category = 'Brasil, news' - oldest_article = 2 - max_articles_per_feed = 1000 - summary_length = 2048 + THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/ + LANGUAGE = 'pt_br' + language = 'pt' + LANGHTM = 'pt-br' + ENCODING = 'cp1252' + ENCHTM = 'iso-8859-1' + directionhtm = 'ltr' + requires_version = (0,8,47) + news = True + publication_type = 'newsportal' + + title = u'Folha de S\xE3o Paulo' + __author__ = 'Euler Alves' + description = u'Brazilian news from Folha de S\xE3o Paulo' + publisher = u'Folha de S\xE3o Paulo' + category = 'news, rss' + + oldest_article = 4 + max_articles_per_feed = 100 + summary_length = 1000 + + remove_javascript = True no_stylesheets = True use_embedded_content = False + remove_empty_feeds = True timefmt = ' [%d %b %Y (%a)]' - encoding = 'cp1252' - cover_url = 'http://lh5.ggpht.com/_hEb7sFmuBvk/TFoiKLRS5dI/AAAAAAAAADM/kcVKggZwKnw/capa_folha.jpg' - cover_margins = (5,5,'white') - remove_javascript = True - keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})] + html2lrf_options = [ + '--comment', description + ,'--category', category + ,'--publisher', publisher + ] - remove_tags = [ - dict(name='script') - ,dict(name='div', - attrs={'id':[ - 'articleButton' - ,'bookmarklets' - ,'ad-180x150-1' - ,'contextualAdsArticle' - ,'articleEnd' - ,'articleComments' - ]}) - ,dict(name='div', - attrs={'class':[ - 'openBox adslibraryArticle' - ]}) - ,dict(name='a') - ,dict(name='iframe') - ,dict(name='link') - ] + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + hoje = datetime.now() + pubdate = hoje.strftime('%a, %d %b') + if hoje.hour<6: + hoje = hoje-timedelta(days=1) + CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg' + SCREENSHOT = 'http://www1.folha.uol.com.br/' + cover_margins = (0,0,'white') + masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' + + keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})] + remove_tags = [ + dict(name='div', + attrs={'id':[ + 'articleButton' + ,'bookmarklets' + ,'ad-180x150-1' + ,'contextualAdsArticle' + ,'articleEnd' + ,'articleComments' + ]}) + ,dict(name='div', + attrs={'class':[ + 'openBox adslibraryArticle' + ]}) + + ,dict(name='a') + ,dict(name='iframe') + ,dict(name='link') + ,dict(name='script') + ] feeds = [ - (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml') - ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml') - ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml') - ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml') - ,(u'Ciencia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml') - ,(u'Cotidiano', u'http://feeds.folha.uol.com.br/cotidiado/rss091.xml') - ,(u'Saber', u'http://feeds.folha.uol.com.br/saber/rss091.xml') - ,(u'Equilíbrio e Saúde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml') - ,(u'Esporte', u'http://feeds.folha.uol.com.br/esporte/rss091.xml') - ,(u'Ilustrada', u'http://feeds.folha.uol.com.br/ilustrada/rss091.xml') - ,(u'Ilustríssima', u'http://feeds.folha.uol.com.br/ilustrissima/rss091.xml') - ,(u'Mercado', u'http://feeds.folha.uol.com.br/mercado/rss091.xml') - ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml') - ,(u'Tec', u'http://feeds.folha.uol.com.br/tec/rss091.xml') - ,(u'Turismo', u'http://feeds.folha.uol.com.br/turismo/rss091.xml') - ] + (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml') + ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml') + ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml') + ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml') + ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml') + ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml') + ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml') + ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml') + ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/') + ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/') + ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/') + ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/') + ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml') + ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml') + ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml') + ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml') + ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml') + ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml') + ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml') + ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml') + ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml') + ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml') + ] + + conversion_options = { + 'title' : title + ,'comments' : description + ,'publisher' : publisher + ,'tags' : category + ,'language' : LANGUAGE + ,'linearize_tables': True + } def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] + if not soup.find(attrs={'http-equiv':'Content-Language'}): + meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) + soup.head.insert(0,meta0) + if not soup.find(attrs={'http-equiv':'Content-Type'}): + meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) + soup.head.insert(0,meta1) return soup - language = 'pt' + def postprocess_html(self, soup, first): + #process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + pw = PixelWand() + if( width > height and width > 590) : + print 'Rotate image' + img.rotate(pw, -90) + img.save(iurl) + return soup + + def get_cover_url(self): + cover_url = self.CAPA + pedido = Request(self.CAPA) + pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') + pedido.add_header('Accept-Charset',self.ENCHTM) + pedido.add_header('Referer',self.SCREENSHOT) + try: + resposta = urlopen(pedido) + soup = BeautifulSoup(resposta) + cover_item = soup.find('body') + if cover_item: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url + except URLError: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url