diff --git a/recipes/estadao.recipe b/recipes/estadao.recipe index 86ab572398..5b6303ba21 100644 --- a/recipes/estadao.recipe +++ b/recipes/estadao.recipe @@ -1,134 +1,129 @@ -#!/usr/bin/env python -from calibre.web.feeds.news import BasicNewsRecipe -from datetime import datetime, timedelta -from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup -from calibre.utils.magick import Image, PixelWand -from urllib2 import Request, urlopen, URLError - -class Estadao(BasicNewsRecipe): - THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/ - LANGUAGE = 'pt_br' - language = 'pt' - LANGHTM = 'pt-br' - ENCODING = 'utf' - ENCHTM = 'utf-8' - directionhtm = 'ltr' - requires_version = (0,8,47) - news = True - publication_type = 'newsportal' - - title = u'Estadao' - __author__ = 'Euler Alves' - description = u'Brazilian news from Estad\xe3o' - publisher = u'Estad\xe3o' - category = 'news, rss' - - oldest_article = 4 - max_articles_per_feed = 100 - summary_length = 1000 - - remove_javascript = True - no_stylesheets = True - use_embedded_content = False - remove_empty_feeds = True - timefmt = ' [%d %b %Y (%a)]' - - html2lrf_options = [ - '--comment', description - ,'--category', category - ,'--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - - hoje = datetime.now()-timedelta(days=2) - pubdate = hoje.strftime('%a, %d %b') - if hoje.hour<10: - hoje = hoje-timedelta(days=1) - CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg' - SCREENSHOT = 'http://estadao.com.br/' - cover_margins = (0,0,'white') - masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png' - - keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})] - remove_tags = [ - dict(name='div', - attrs={'id':[ - 'bb-md-noticia-tabs' - ]}) - ,dict(name='div', - attrs={'class':[ - 'tags' - ,'discussion' - ,'bb-gg adsense_container' - ]}) - - ,dict(name='a') - ,dict(name='iframe') - ,dict(name='link') - ,dict(name='script') - ] - - feeds = [ - (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml') - ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml') - ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml') - ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml') - ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/') - ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml') - ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml') - ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml') - ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml') - ] - - conversion_options = { - 'title' : title - ,'comments' : description - ,'publisher' : publisher - ,'tags' : category - ,'language' : LANGUAGE - ,'linearize_tables': True - } - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - if not soup.find(attrs={'http-equiv':'Content-Language'}): - meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) - soup.head.insert(0,meta0) - if not soup.find(attrs={'http-equiv':'Content-Type'}): - meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) - soup.head.insert(0,meta1) - return soup - - def postprocess_html(self, soup, first): - #process all the images. assumes that the new html has the correct path - for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): - iurl = tag['src'] - img = Image() - img.open(iurl) - width, height = img.size - print 'img is: ', iurl, 'width is: ', width, 'height is: ', height - pw = PixelWand() - if( width > height and width > 590) : - print 'Rotate image' - img.rotate(pw, -90) - img.save(iurl) - return soup - - def get_cover_url(self): - cover_url = self.CAPA - pedido = Request(self.CAPA) - pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') - pedido.add_header('Accept-Charset',self.ENCHTM) - pedido.add_header('Referer',self.SCREENSHOT) - try: - resposta = urlopen(pedido) - soup = BeautifulSoup(resposta) - cover_item = soup.find('body') - if cover_item: - cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' - return cover_url - except URLError: - cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' - return cover_url +from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime, timedelta +from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup +from calibre.utils.magick import Image, PixelWand +from urllib2 import Request, urlopen, URLError + +class Estadao(BasicNewsRecipe): + THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here + LANGUAGE = 'pt_br' + language = 'pt' + LANGHTM = 'pt-br' + ENCODING = 'utf' + ENCHTM = 'utf-8' + directionhtm = 'ltr' + requires_version = (0,7,47) + news = True + + title = u'Estad\xe3o' + __author__ = 'Euler Alves' + description = u'Brazilian news from Estad\xe3o' + publisher = u'Estad\xe3o' + category = 'news, rss' + + oldest_article = 4 + max_articles_per_feed = 100 + summary_length = 1000 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + timefmt = ' [%d %b %Y (%a)]' + + hoje = datetime.now()-timedelta(days=2) + pubdate = hoje.strftime('%a, %d %b') + if hoje.hour<10: + hoje = hoje-timedelta(days=1) + CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg' + SCREENSHOT = 'http://estadao.com.br/' + cover_margins = (0,0,'white') + masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png' + + keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})] + remove_tags = [ + dict(name='div', + attrs={'id':[ + 'bb-md-noticia-tabs' + ]}) + ,dict(name='div', + attrs={'class':[ + 'tags' + ,'discussion' + ,'bb-gg adsense_container' + ]}) + + ,dict(name='a') + ,dict(name='iframe') + ,dict(name='link') + ,dict(name='script') + ] + + + feeds = [ + (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml') + ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml') + ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml') + ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml') + ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/') + ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml') + ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml') + ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml') + ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml') + ] + + conversion_options = { + 'title' : title + ,'comments' : description + ,'publisher' : publisher + ,'tags' : category + ,'language' : LANGUAGE + ,'linearize_tables': True + } + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + if not soup.find(attrs={'http-equiv':'Content-Language'}): + meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) + soup.head.insert(0,meta0) + if not soup.find(attrs={'http-equiv':'Content-Type'}): + meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) + soup.head.insert(0,meta1) + return soup + + def postprocess_html(self, soup, first): + #process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + if img < 0: + raise RuntimeError('Out of memory') + pw = PixelWand() + if( width > height and width > 590) : + print 'Rotate image' + img.rotate(pw, -90) + img.save(iurl) + return soup + + def get_cover_url(self): + if self.THUMBALIZR_API: + cover_url = self.CAPA + pedido = Request(self.CAPA) + pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') + pedido.add_header('Accept-Charset',self.ENCHTM) + pedido.add_header('Referer',self.SCREENSHOT) + try: + resposta = urlopen(pedido) + soup = BeautifulSoup(resposta) + cover_item = soup.find('body') + if cover_item: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url + except URLError: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url + diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe index 40898672e6..028513ad3a 100644 --- a/recipes/folhadesaopaulo.recipe +++ b/recipes/folhadesaopaulo.recipe @@ -1,149 +1,151 @@ -from calibre.web.feeds.news import BasicNewsRecipe -from datetime import datetime, timedelta -from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup -from calibre.utils.magick import Image, PixelWand -from urllib2 import Request, urlopen, URLError - -class FolhaOnline(BasicNewsRecipe): - THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/ - LANGUAGE = 'pt_br' - language = 'pt' - LANGHTM = 'pt-br' - ENCODING = 'cp1252' - ENCHTM = 'iso-8859-1' - directionhtm = 'ltr' - requires_version = (0,8,47) - news = True - publication_type = 'newsportal' - - title = u'Folha de S\xE3o Paulo' - __author__ = 'Euler Alves' - description = u'Brazilian news from Folha de S\xE3o Paulo' - publisher = u'Folha de S\xE3o Paulo' - category = 'news, rss' - - oldest_article = 4 - max_articles_per_feed = 100 - summary_length = 1000 - - remove_javascript = True - no_stylesheets = True - use_embedded_content = False - remove_empty_feeds = True - timefmt = ' [%d %b %Y (%a)]' - - html2lrf_options = [ - '--comment', description - ,'--category', category - ,'--publisher', publisher - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - - hoje = datetime.now() - pubdate = hoje.strftime('%a, %d %b') - if hoje.hour<6: - hoje = hoje-timedelta(days=1) - CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg' - SCREENSHOT = 'http://www1.folha.uol.com.br/' - cover_margins = (0,0,'white') - masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' - - keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})] - remove_tags = [ - dict(name='div', - attrs={'id':[ - 'articleButton' - ,'bookmarklets' - ,'ad-180x150-1' - ,'contextualAdsArticle' - ,'articleEnd' - ,'articleComments' - ]}) - ,dict(name='div', - attrs={'class':[ - 'openBox adslibraryArticle' - ]}) - - ,dict(name='a') - ,dict(name='iframe') - ,dict(name='link') - ,dict(name='script') - ] - - feeds = [ - (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml') - ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml') - ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml') - ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml') - ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml') - ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml') - ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml') - ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml') - ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/') - ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/') - ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/') - ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/') - ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml') - ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml') - ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml') - ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml') - ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml') - ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml') - ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml') - ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml') - ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml') - ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml') - ] - - conversion_options = { - 'title' : title - ,'comments' : description - ,'publisher' : publisher - ,'tags' : category - ,'language' : LANGUAGE - ,'linearize_tables': True - } - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - if not soup.find(attrs={'http-equiv':'Content-Language'}): - meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) - soup.head.insert(0,meta0) - if not soup.find(attrs={'http-equiv':'Content-Type'}): - meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) - soup.head.insert(0,meta1) - return soup - - def postprocess_html(self, soup, first): - #process all the images. assumes that the new html has the correct path - for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): - iurl = tag['src'] - img = Image() - img.open(iurl) - width, height = img.size - print 'img is: ', iurl, 'width is: ', width, 'height is: ', height - pw = PixelWand() - if( width > height and width > 590) : - print 'Rotate image' - img.rotate(pw, -90) - img.save(iurl) - return soup - - def get_cover_url(self): - cover_url = self.CAPA - pedido = Request(self.CAPA) - pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') - pedido.add_header('Accept-Charset',self.ENCHTM) - pedido.add_header('Referer',self.SCREENSHOT) - try: - resposta = urlopen(pedido) - soup = BeautifulSoup(resposta) - cover_item = soup.find('body') - if cover_item: - cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' - return cover_url - except URLError: - cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' - return cover_url +from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime, timedelta +from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup +from calibre.utils.magick import Image, PixelWand +from urllib2 import Request, urlopen, URLError + +class FolhaOnline(BasicNewsRecipe): + THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here + LANGUAGE = 'pt_br' + language = 'pt' + LANGHTM = 'pt-br' + ENCODING = 'cp1252' + ENCHTM = 'iso-8859-1' + directionhtm = 'ltr' + requires_version = (0,7,47) + news = True + + title = u'Folha de S\xE3o Paulo' + __author__ = 'Euler Alves' + description = u'Brazilian news from Folha de S\xE3o Paulo' + publisher = u'Folha de S\xE3o Paulo' + category = 'news, rss' + + oldest_article = 4 + max_articles_per_feed = 100 + summary_length = 1000 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + remove_empty_feeds = True + timefmt = ' [%d %b %Y (%a)]' + + html2lrf_options = [ + '--comment', description + ,'--category', category + ,'--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + hoje = datetime.now() + pubdate = hoje.strftime('%a, %d %b') + if hoje.hour<6: + hoje = hoje-timedelta(days=1) + CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg' + SCREENSHOT = 'http://www1.folha.uol.com.br/' + cover_margins = (0,0,'white') + masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif' + + keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})] + remove_tags = [ + dict(name='div', + attrs={'id':[ + 'articleButton' + ,'bookmarklets' + ,'ad-180x150-1' + ,'contextualAdsArticle' + ,'articleEnd' + ,'articleComments' + ]}) + ,dict(name='div', + attrs={'class':[ + 'openBox adslibraryArticle' + ]}) + + ,dict(name='a') + ,dict(name='iframe') + ,dict(name='link') + ,dict(name='script') + ] + + feeds = [ + (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml') + ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml') + ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml') + ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml') + ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml') + ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml') + ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml') + ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml') + ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/') + ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/') + ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/') + ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/') + ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml') + ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml') + ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml') + ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml') + ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml') + ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml') + ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml') + ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml') + ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml') + ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml') + ] + + + conversion_options = { + 'title' : title + ,'comments' : description + ,'publisher' : publisher + ,'tags' : category + ,'language' : LANGUAGE + ,'linearize_tables': True + } + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + if not soup.find(attrs={'http-equiv':'Content-Language'}): + meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) + soup.head.insert(0,meta0) + if not soup.find(attrs={'http-equiv':'Content-Type'}): + meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) + soup.head.insert(0,meta1) + return soup + + def postprocess_html(self, soup, first): + #process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + if img < 0: + raise RuntimeError('Out of memory') + pw = PixelWand() + if( width > height and width > 590) : + print 'Rotate image' + img.rotate(pw, -90) + img.save(iurl) + return soup + + def get_cover_url(self): + cover_url = self.CAPA + pedido = Request(self.CAPA) + pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)') + pedido.add_header('Accept-Charset',self.ENCHTM) + pedido.add_header('Referer',self.SCREENSHOT) + try: + resposta = urlopen(pedido) + soup = BeautifulSoup(resposta) + cover_item = soup.find('body') + if cover_item: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url + except URLError: + cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90' + return cover_url diff --git a/recipes/icons/estadao.png b/recipes/icons/estadao.png index 8f5637ad58..706f101f33 100644 Binary files a/recipes/icons/estadao.png and b/recipes/icons/estadao.png differ diff --git a/recipes/icons/folhadesaopaulo.png b/recipes/icons/folhadesaopaulo.png new file mode 100644 index 0000000000..c895e57d70 Binary files /dev/null and b/recipes/icons/folhadesaopaulo.png differ diff --git a/recipes/lifehacker.recipe b/recipes/lifehacker.recipe index ff95efc50a..e96b031dab 100644 --- a/recipes/lifehacker.recipe +++ b/recipes/lifehacker.recipe @@ -1,37 +1,100 @@ -__license__ = 'GPL v3' -__copyright__ = '2010, NA' -''' -lifehacker.com -''' - -from calibre.web.feeds.news import BasicNewsRecipe - -class Lifehacker(BasicNewsRecipe): - title = 'Lifehacker' - __author__ = 'Kovid Goyal' - description = "Computers make us more productive. Yeah, right. Lifehacker recommends the software downloads and web sites that actually save time. Don't live to geek; geek to live." - publisher = 'lifehacker.com' - category = 'news, IT, Internet, gadgets, tips and tricks, howto, diy' - oldest_article = 2 - max_articles_per_feed = 100 - no_stylesheets = True - encoding = 'utf-8' - use_embedded_content = True - language = 'en' - masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png' - conversion_options = { - 'comment' : description - , 'tags' : category - , 'publisher' : publisher - , 'language' : language - } - - remove_tags = [ - {'class': 'feedflare'}, - ] - - feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')] - - def preprocess_html(self, soup): - return self.adeify_images(soup) - +from calibre.web.feeds.news import BasicNewsRecipe +from datetime import datetime +from calibre.ebooks.BeautifulSoup import Tag +from calibre.utils.magick import Image, PixelWand + +class LifeHacker(BasicNewsRecipe): + THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here + LANGUAGE = 'en' + LANGHTM = 'en' + language = 'en' + ENCODING = 'utf' + ENCHTM = 'utf-8' + requires_version = (0,7,47) + news = True + + title = u'LifeHacker' + __author__ = 'Euler Alves' + description = u'Tips, tricks, and downloads for getting things done.' + publisher = u'lifehacker.com' + author = u'Adam Pash & Kevin Purdy & Adam Dachis & Whitson Gordon & Gina Trapani' + category = 'news, rss' + + oldest_article = 4 + max_articles_per_feed = 20 + summary_length = 1000 + + remove_javascript = True + no_stylesheets = True + use_embedded_content = True + remove_empty_feeds = True + timefmt = ' [%d %b %Y (%a)]' + + hoje = datetime.now() + pubdate = hoje.strftime('%a, %d %b') + cover_url = 'http://api.thumbalizr.com/?api_key='+THUMBALIZR_API+'&url=http://lifehacker.com&width=600&quality=90' + cover_margins = (0,0,'white') + masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png' + + remove_tags = [ + {'class': 'feedflare'}, + dict(name='div', + attrs={'class':[ + 'ad_container' + ,'ad_300x250' + ,'ad_interstitial' + ,'share-wrap' + ,'ad_300x600' + ,'ad_perma-footer-adsense' + ,'ad_perma-panorama' + ,'ad panorama' + ,'ad_container' + ]}) + ,dict(name='div', + attrs={'id':[ + 'agegate_container' + ,'agegate_container_rejected' + ,'sharemenu-wrap' + ]}) + ] + + feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')] + + conversion_options = { + 'title' : title + ,'comments' : description + ,'publisher' : publisher + ,'tags' : category + ,'language' : LANGUAGE + ,'linearize_tables': True + } + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + if not soup.find(attrs={'http-equiv':'Content-Language'}): + meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)]) + soup.head.insert(0,meta0) + if not soup.find(attrs={'http-equiv':'Content-Type'}): + meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)]) + soup.head.insert(0,meta1) + return soup + + def postprocess_html(self, soup, first): + #process all the images. assumes that the new html has the correct path + for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')): + iurl = tag['src'] + img = Image() + img.open(iurl) + width, height = img.size + print 'img is: ', iurl, 'width is: ', width, 'height is: ', height + if img < 0: + raise RuntimeError('Out of memory') + pw = PixelWand() + if( width > height and width > 590) : + print 'Rotate image' + img.rotate(pw, -90) + img.save(iurl) + return soup + +