diff --git a/recipes/estadao.recipe b/recipes/estadao.recipe
index 86ab572398..5b6303ba21 100644
--- a/recipes/estadao.recipe
+++ b/recipes/estadao.recipe
@@ -1,134 +1,127 @@
-#!/usr/bin/env python
-from calibre.web.feeds.news import BasicNewsRecipe
-from datetime import datetime, timedelta
-from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
-from calibre.utils.magick import Image, PixelWand
-from urllib2 import Request, urlopen, URLError
-
-class Estadao(BasicNewsRecipe):
-    THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/
-    LANGUAGE = 'pt_br'
-    language = 'pt'
-    LANGHTM = 'pt-br'
-    ENCODING = 'utf'
-    ENCHTM = 'utf-8'
-    directionhtm = 'ltr'
-    requires_version = (0,8,47)
-    news = True
-    publication_type = 'newsportal'
-
-    title = u'Estadao'
-    __author__ = 'Euler Alves'
-    description = u'Brazilian news from Estad\xe3o'
-    publisher = u'Estad\xe3o'
-    category = 'news, rss'
-
-    oldest_article = 4
-    max_articles_per_feed = 100
-    summary_length = 1000
-
-    remove_javascript = True
-    no_stylesheets = True
-    use_embedded_content = False
-    remove_empty_feeds = True
-    timefmt = ' [%d %b %Y (%a)]'
-
-    html2lrf_options = [
-        '--comment', description
-        ,'--category', category
-        ,'--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    hoje = datetime.now()-timedelta(days=2)
-    pubdate = hoje.strftime('%a, %d %b')
-    if hoje.hour<10:
-        hoje = hoje-timedelta(days=1)
-    CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
-    SCREENSHOT = 'http://estadao.com.br/'
-    cover_margins = (0,0,'white')
-    masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png'
-
-    keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
-    remove_tags = [
-        dict(name='div',
-            attrs={'id':[
-                'bb-md-noticia-tabs'
-            ]})
-        ,dict(name='div',
-            attrs={'class':[
-                'tags'
-                ,'discussion'
-                ,'bb-gg adsense_container'
-            ]})
-
-        ,dict(name='a')
-        ,dict(name='iframe')
-        ,dict(name='link')
-        ,dict(name='script')
-    ]
-
-    feeds = [
-        (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml')
-        ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml')
-        ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml')
-        ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml')
-        ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/')
-        ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml')
-        ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml')
-        ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml')
-        ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml')
-    ]
-
-    conversion_options = {
-        'title' : title
-        ,'comments' : description
-        ,'publisher' : publisher
-        ,'tags' : category
-        ,'language' : LANGUAGE
-        ,'linearize_tables': True
-    }
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        if not soup.find(attrs={'http-equiv':'Content-Language'}):
-            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
-            soup.head.insert(0,meta0)
-        if not soup.find(attrs={'http-equiv':'Content-Type'}):
-            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
-            soup.head.insert(0,meta1)
-        return soup
-
-    def postprocess_html(self, soup, first):
-        #process all the images. assumes that the new html has the correct path
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            img = Image()
-            img.open(iurl)
-            width, height = img.size
-            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
-            pw = PixelWand()
-            if( width > height and width > 590) :
-                print 'Rotate image'
-                img.rotate(pw, -90)
-                img.save(iurl)
-        return soup
-
-    def get_cover_url(self):
-        cover_url = self.CAPA
-        pedido = Request(self.CAPA)
-        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
-        pedido.add_header('Accept-Charset',self.ENCHTM)
-        pedido.add_header('Referer',self.SCREENSHOT)
-        try:
-            resposta = urlopen(pedido)
-            soup = BeautifulSoup(resposta)
-            cover_item = soup.find('body')
-            if cover_item:
-                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
-        except URLError:
-            cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
+from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import datetime, timedelta
+from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
+from calibre.utils.magick import Image, PixelWand
+from urllib2 import Request, urlopen, URLError
+
+class Estadao(BasicNewsRecipe):
+    THUMBALIZR_API = '' # ----> Get yours at http://www.thumbalizr.com/ and paste it here
+    LANGUAGE = 'pt_br'
+    language = 'pt'
+    LANGHTM = 'pt-br'
+    ENCODING = 'utf'
+    ENCHTM = 'utf-8'
+    directionhtm = 'ltr'
+    requires_version = (0,7,47)
+    news = True
+
+    title = u'Estad\xe3o'
+    __author__ = 'Euler Alves'
+    description = u'Brazilian news from Estad\xe3o'
+    publisher = u'Estad\xe3o'
+    category = 'news, rss'
+
+    oldest_article = 4
+    max_articles_per_feed = 100
+    summary_length = 1000
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_empty_feeds = True
+    timefmt = ' [%d %b %Y (%a)]'
+
+    hoje = datetime.now()-timedelta(days=2)
+    pubdate = hoje.strftime('%a, %d %b')
+    if hoje.hour<10:
+        hoje = hoje-timedelta(days=1)
+    CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
+    SCREENSHOT = 'http://estadao.com.br/'
+    cover_margins = (0,0,'white')
+    masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png'
+
+    keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
+    remove_tags = [
+        dict(name='div',
+            attrs={'id':[
+                'bb-md-noticia-tabs'
+            ]})
+        ,dict(name='div',
+            attrs={'class':[
+                'tags'
+                ,'discussion'
+                ,'bb-gg adsense_container'
+            ]})
+
+        ,dict(name='a')
+        ,dict(name='iframe')
+        ,dict(name='link')
+        ,dict(name='script')
+    ]
+
+
+    feeds = [
+        (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml')
+        ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml')
+        ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml')
+        ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml')
+        ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/')
+        ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml')
+        ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml')
+        ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml')
+        ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml')
+    ]
+
+    conversion_options = {
+        'title' : title
+        ,'comments' : description
+        ,'publisher' : publisher
+        ,'tags' : category
+        ,'language' : LANGUAGE
+        ,'linearize_tables': True
+    }
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        if not soup.find(attrs={'http-equiv':'Content-Language'}):
+            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
+            soup.head.insert(0,meta0)
+        if not soup.find(attrs={'http-equiv':'Content-Type'}):
+            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
+            soup.head.insert(0,meta1)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        # process all the images; assumes the new HTML has the correct path
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            pw = PixelWand()
+            if( width > height and width > 590) :
+                print 'Rotate image'
+                img.rotate(pw, -90)
+                img.save(iurl)
+        return soup
+
+    def get_cover_url(self):
+        if self.THUMBALIZR_API:
+            cover_url = self.CAPA
+            pedido = Request(self.CAPA)
+            pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
+            pedido.add_header('Accept-Charset',self.ENCHTM)
+            pedido.add_header('Referer',self.SCREENSHOT)
+            try:
+                resposta = urlopen(pedido)
+                soup = BeautifulSoup(resposta)
+                cover_item = soup.find('body')
+                if cover_item:
+                    cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+                return cover_url
+            except URLError:
+                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+                return cover_url
+
diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe
index 40898672e6..028513ad3a 100644
--- a/recipes/folhadesaopaulo.recipe
+++ b/recipes/folhadesaopaulo.recipe
@@ -1,149 +1,149 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-from datetime import datetime, timedelta
-from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
-from calibre.utils.magick import Image, PixelWand
-from urllib2 import Request, urlopen, URLError
-
-class FolhaOnline(BasicNewsRecipe):
-    THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/
-    LANGUAGE = 'pt_br'
-    language = 'pt'
-    LANGHTM = 'pt-br'
-    ENCODING = 'cp1252'
-    ENCHTM = 'iso-8859-1'
-    directionhtm = 'ltr'
-    requires_version = (0,8,47)
-    news = True
-    publication_type = 'newsportal'
-
-    title = u'Folha de S\xE3o Paulo'
-    __author__ = 'Euler Alves'
-    description = u'Brazilian news from Folha de S\xE3o Paulo'
-    publisher = u'Folha de S\xE3o Paulo'
-    category = 'news, rss'
-
-    oldest_article = 4
-    max_articles_per_feed = 100
-    summary_length = 1000
-
-    remove_javascript = True
-    no_stylesheets = True
-    use_embedded_content = False
-    remove_empty_feeds = True
-    timefmt = ' [%d %b %Y (%a)]'
-
-    html2lrf_options = [
-        '--comment', description
-        ,'--category', category
-        ,'--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
-    hoje = datetime.now()
-    pubdate = hoje.strftime('%a, %d %b')
-    if hoje.hour<6:
-        hoje = hoje-timedelta(days=1)
-    CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg'
-    SCREENSHOT = 'http://www1.folha.uol.com.br/'
-    cover_margins = (0,0,'white')
-    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
-
-    keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})]
-    remove_tags = [
-        dict(name='div',
-            attrs={'id':[
-                'articleButton'
-                ,'bookmarklets'
-                ,'ad-180x150-1'
-                ,'contextualAdsArticle'
-                ,'articleEnd'
-                ,'articleComments'
-            ]})
-        ,dict(name='div',
-            attrs={'class':[
-                'openBox adslibraryArticle'
-            ]})
-
-        ,dict(name='a')
-        ,dict(name='iframe')
-        ,dict(name='link')
-        ,dict(name='script')
-    ]
-
-    feeds = [
-        (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
-        ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml')
-        ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml')
-        ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml')
-        ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml')
-        ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml')
-        ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml')
-        ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml')
-        ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/')
-        ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/')
-        ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/')
-        ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/')
-        ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml')
-        ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml')
-        ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml')
-        ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml')
-        ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml')
-        ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml')
-        ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml')
-        ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml')
-        ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml')
-        ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml')
-    ]
-
-    conversion_options = {
-        'title' : title
-        ,'comments' : description
-        ,'publisher' : publisher
-        ,'tags' : category
-        ,'language' : LANGUAGE
-        ,'linearize_tables': True
-    }
-
-    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
-            del item['style']
-        if not soup.find(attrs={'http-equiv':'Content-Language'}):
-            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
-            soup.head.insert(0,meta0)
-        if not soup.find(attrs={'http-equiv':'Content-Type'}):
-            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
-            soup.head.insert(0,meta1)
-        return soup
-
-    def postprocess_html(self, soup, first):
-        #process all the images. assumes that the new html has the correct path
-        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
-            iurl = tag['src']
-            img = Image()
-            img.open(iurl)
-            width, height = img.size
-            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
-            pw = PixelWand()
-            if( width > height and width > 590) :
-                print 'Rotate image'
-                img.rotate(pw, -90)
-                img.save(iurl)
-        return soup
-
-    def get_cover_url(self):
-        cover_url = self.CAPA
-        pedido = Request(self.CAPA)
-        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
-        pedido.add_header('Accept-Charset',self.ENCHTM)
-        pedido.add_header('Referer',self.SCREENSHOT)
-        try:
-            resposta = urlopen(pedido)
-            soup = BeautifulSoup(resposta)
-            cover_item = soup.find('body')
-            if cover_item:
-                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
-        except URLError:
-            cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
+from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import datetime, timedelta
+from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
+from calibre.utils.magick import Image, PixelWand
+from urllib2 import Request, urlopen, URLError
+
+class FolhaOnline(BasicNewsRecipe):
+    THUMBALIZR_API = '' # ----> Get yours at http://www.thumbalizr.com/ and paste it here
+    LANGUAGE = 'pt_br'
+    language = 'pt'
+    LANGHTM = 'pt-br'
+    ENCODING = 'cp1252'
+    ENCHTM = 'iso-8859-1'
+    directionhtm = 'ltr'
+    requires_version = (0,7,47)
+    news = True
+
+    title = u'Folha de S\xE3o Paulo'
+    __author__ = 'Euler Alves'
+    description = u'Brazilian news from Folha de S\xE3o Paulo'
+    publisher = u'Folha de S\xE3o Paulo'
+    category = 'news, rss'
+
+    oldest_article = 4
+    max_articles_per_feed = 100
+    summary_length = 1000
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_empty_feeds = True
+    timefmt = ' [%d %b %Y (%a)]'
+
+    html2lrf_options = [
+        '--comment', description
+        ,'--category', category
+        ,'--publisher', publisher
+    ]
+
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+
+    hoje = datetime.now()
+    pubdate = hoje.strftime('%a, %d %b')
+    if hoje.hour<6:
+        hoje = hoje-timedelta(days=1)
+    CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg'
+    SCREENSHOT = 'http://www1.folha.uol.com.br/'
+    cover_margins = (0,0,'white')
+    masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
+
+    keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})]
+    remove_tags = [
+        dict(name='div',
+            attrs={'id':[
+                'articleButton'
+                ,'bookmarklets'
+                ,'ad-180x150-1'
+                ,'contextualAdsArticle'
+                ,'articleEnd'
+                ,'articleComments'
+            ]})
+        ,dict(name='div',
+            attrs={'class':[
+                'openBox adslibraryArticle'
+            ]})
+
+        ,dict(name='a')
+        ,dict(name='iframe')
+        ,dict(name='link')
+        ,dict(name='script')
+    ]
+
+    feeds = [
+        (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
+        ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml')
+        ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml')
+        ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml')
+        ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml')
+        ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml')
+        ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml')
+        ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml')
+        ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/')
+        ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/')
+        ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/')
+        ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/')
+        ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml')
+        ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml')
+        ,(u'Eliane Cantanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml')
+        ,(u'Fernando Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml')
+        ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml')
+        ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml')
+        ,(u'Jo\xE3o Pereira Coutinho', u'http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml')
+        ,(u'Luiz Caversan', u'http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml')
+        ,(u'S\xE9rgio Malbergier', u'http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml')
+        ,(u'Valdo Cruz', u'http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml')
+    ]
+
+
+    conversion_options = {
+        'title' : title
+        ,'comments' : description
+        ,'publisher' : publisher
+        ,'tags' : category
+        ,'language' : LANGUAGE
+        ,'linearize_tables': True
+    }
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        if not soup.find(attrs={'http-equiv':'Content-Language'}):
+            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
+            soup.head.insert(0,meta0)
+        if not soup.find(attrs={'http-equiv':'Content-Type'}):
+            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
+            soup.head.insert(0,meta1)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        # process all the images; assumes the new HTML has the correct path
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            pw = PixelWand()
+            if( width > height and width > 590) :
+                print 'Rotate image'
+                img.rotate(pw, -90)
+                img.save(iurl)
+        return soup
+
+    def get_cover_url(self):
+        cover_url = self.CAPA
+        pedido = Request(self.CAPA)
+        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
+        pedido.add_header('Accept-Charset',self.ENCHTM)
+        pedido.add_header('Referer',self.SCREENSHOT)
+        try:
+            resposta = urlopen(pedido)
+            soup = BeautifulSoup(resposta)
+            cover_item = soup.find('body')
+            if cover_item:
+                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+            return cover_url
+        except URLError:
+            cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+            return cover_url
diff --git a/recipes/icons/estadao.png b/recipes/icons/estadao.png
index 8f5637ad58..706f101f33 100644
Binary files a/recipes/icons/estadao.png and b/recipes/icons/estadao.png differ
diff --git a/recipes/icons/folhadesaopaulo.png b/recipes/icons/folhadesaopaulo.png
new file mode 100644
index 0000000000..c895e57d70
Binary files /dev/null and b/recipes/icons/folhadesaopaulo.png differ
diff --git a/recipes/lifehacker.recipe b/recipes/lifehacker.recipe
index ff95efc50a..e96b031dab 100644
--- a/recipes/lifehacker.recipe
+++ b/recipes/lifehacker.recipe
@@ -1,37 +1,98 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, NA'
-'''
-lifehacker.com
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Lifehacker(BasicNewsRecipe):
-    title = 'Lifehacker'
-    __author__ = 'Kovid Goyal'
-    description = "Computers make us more productive. Yeah, right. Lifehacker recommends the software downloads and web sites that actually save time. Don't live to geek; geek to live."
-    publisher = 'lifehacker.com'
-    category = 'news, IT, Internet, gadgets, tips and tricks, howto, diy'
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    encoding = 'utf-8'
-    use_embedded_content = True
-    language = 'en'
-    masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png'
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-    }
-
-    remove_tags = [
-        {'class': 'feedflare'},
-    ]
-
-    feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')]
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
-
+from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import datetime
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.utils.magick import Image, PixelWand
+
+class LifeHacker(BasicNewsRecipe):
+    THUMBALIZR_API = '' # ----> Get yours at http://www.thumbalizr.com/ and paste it here
+    LANGUAGE = 'en'
+    LANGHTM = 'en'
+    language = 'en'
+    ENCODING = 'utf'
+    ENCHTM = 'utf-8'
+    requires_version = (0,7,47)
+    news = True
+
+    title = u'LifeHacker'
+    __author__ = 'Euler Alves'
+    description = u'Tips, tricks, and downloads for getting things done.'
+    publisher = u'lifehacker.com'
+    author = u'Adam Pash & Kevin Purdy & Adam Dachis & Whitson Gordon & Gina Trapani'
+    category = 'news, rss'
+
+    oldest_article = 4
+    max_articles_per_feed = 20
+    summary_length = 1000
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = True
+    remove_empty_feeds = True
+    timefmt = ' [%d %b %Y (%a)]'
+
+    hoje = datetime.now()
+    pubdate = hoje.strftime('%a, %d %b')
+    cover_url = 'http://api.thumbalizr.com/?api_key='+THUMBALIZR_API+'&url=http://lifehacker.com&width=600&quality=90'
+    cover_margins = (0,0,'white')
+    masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png'
+
+    remove_tags = [
+        {'class': 'feedflare'},
+        dict(name='div',
+            attrs={'class':[
+                'ad_container'
+                ,'ad_300x250'
+                ,'ad_interstitial'
+                ,'share-wrap'
+                ,'ad_300x600'
+                ,'ad_perma-footer-adsense'
+                ,'ad_perma-panorama'
+                ,'ad panorama'
+                ,'ad_container'
+            ]})
+        ,dict(name='div',
+            attrs={'id':[
+                'agegate_container'
+                ,'agegate_container_rejected'
+                ,'sharemenu-wrap'
+            ]})
+    ]
+
+    feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')]
+
+    conversion_options = {
+        'title' : title
+        ,'comments' : description
+        ,'publisher' : publisher
+        ,'tags' : category
+        ,'language' : LANGUAGE
+        ,'linearize_tables': True
+    }
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        if not soup.find(attrs={'http-equiv':'Content-Language'}):
+            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
+            soup.head.insert(0,meta0)
+        if not soup.find(attrs={'http-equiv':'Content-Type'}):
+            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
+            soup.head.insert(0,meta1)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        # process all the images; assumes the new HTML has the correct path
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            pw = PixelWand()
+            if( width > height and width > 590) :
+                print 'Rotate image'
+                img.rotate(pw, -90)
+                img.save(iurl)
+        return soup
+
+
diff --git a/setup/upload.py b/setup/upload.py
index 6cd9ad3eca..4fd388ce43 100644
--- a/setup/upload.py
+++ b/setup/upload.py
@@ -5,7 +5,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time, glob
+import os, re, cStringIO, base64, httplib, subprocess, hashlib, shutil, time, \
+        glob, stat
 from subprocess import check_call
 from tempfile import NamedTemporaryFile, mkdtemp
 from zipfile import ZipFile
@@ -344,6 +345,8 @@ class UploadUserManual(Command): # {{{
     def build_plugin_example(self, path):
         from calibre import CurrentDir
         with NamedTemporaryFile(suffix='.zip') as f:
+            os.fchmod(f.fileno(),
+                stat.S_IRUSR|stat.S_IRGRP|stat.S_IROTH|stat.S_IWRITE)
             with CurrentDir(self.d(path)):
                 with ZipFile(f, 'w') as zf:
                     for x in os.listdir('.'):
@@ -352,8 +355,8 @@ class UploadUserManual(Command): # {{{
                         for y in os.listdir(x):
                             zf.write(os.path.join(x, y))
             bname = self.b(path) + '_plugin.zip'
-            subprocess.check_call(['scp', f.name, 'divok:%s/%s'%(DOWNLOADS,
-                bname)])
+            dest = 'divok:%s/%s'%(DOWNLOADS, bname)
+            subprocess.check_call(['scp', f.name, dest])
 
     def run(self, opts):
         path = self.j(self.SRC, 'calibre', 'manual', 'plugin_examples')
diff --git a/src/calibre/ebooks/chardet/__init__.py b/src/calibre/ebooks/chardet/__init__.py
index 571ceafe53..c562176ef2 100644
--- a/src/calibre/ebooks/chardet/__init__.py
+++ b/src/calibre/ebooks/chardet/__init__.py
@@ -110,4 +110,11 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
     if resolve_entities:
         raw = substitute_entites(raw)
 
+    if encoding and encoding.lower().replace('_', '-').strip() in (
+        'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+        'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+        # Microsoft Word exports to HTML with encoding incorrectly set to
+        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+        encoding = 'gbk'
+
     return raw, encoding
diff --git a/src/calibre/ebooks/metadata/book/base.py b/src/calibre/ebooks/metadata/book/base.py
index 91dcc29230..328ab7be26 100644
--- a/src/calibre/ebooks/metadata/book/base.py
+++ b/src/calibre/ebooks/metadata/book/base.py
@@ -198,8 +198,10 @@ class Metadata(object):
         return copy.deepcopy(ans)
 
     def _clean_identifier(self, typ, val):
-        typ = icu_lower(typ).strip().replace(':', '').replace(',', '')
-        val = val.strip().replace(',', '|').replace(':', '|')
+        if typ:
+            typ = icu_lower(typ).strip().replace(':', '').replace(',', '')
+        if val:
+            val = val.strip().replace(',', '|').replace(':', '|')
         return typ, val
 
     def set_identifiers(self, identifiers):
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index c9c742f88c..cfa2b09ea8 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -503,7 +503,7 @@ if __name__ == '__main__': # tests {{{
 
             ( # This isbn not on amazon
             {'identifiers':{'isbn': '8324616489'}, 'title':'Learning Python',
                 'authors':['Lutz']},
-            [title_test('Learning Python: Powerful Object-Oriented Programming',
+            [title_test('Learning Python, 3rd Edition',
                 exact=True), authors_test(['Mark Lutz'])
             ]
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index e9c2621237..3e1670c7af 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -1132,7 +1132,7 @@ class DeviceBooksModel(BooksModel): # {{{
             self.sorted_map = list(self.map)
         else:
            self.sorted_map = list(range(len(self.db)))
-        self.sorted_map.sort(cmp=keygen, reverse=descending)
+        self.sorted_map.sort(key=keygen, reverse=descending)
         self.sorted_on = (self.column_map[col], order)
         self.sort_history.insert(0, self.sorted_on)
         if hasattr(keygen, 'db'):
diff --git a/src/calibre/utils/html2text.py b/src/calibre/utils/html2text.py
index 0eb84a3d38..3779c68918 100644
--- a/src/calibre/utils/html2text.py
+++ b/src/calibre/utils/html2text.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
 """html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "2.39"
-__author__ = "Aaron Swartz (me@aaronsw.com)"
-__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+# Last upstream version before changes
+#__version__ = "2.39"
+__license__ = 'GPL 3'
+__copyright__ = '''
+Copyright (c) 2011, John Schember <john@nachtimwald.com>
+(C) 2004-2008 Aaron Swartz <me@aaronsw.com>
+'''
 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
 
 # TODO:
@@ -11,7 +17,6 @@ __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
 if not hasattr(__builtins__, 'True'): True, False = 1, 0
 import re, sys, urllib, htmlentitydefs, codecs
 import sgmllib
-import urlparse
 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
 try: from textwrap import wrap
@@ -145,9 +150,7 @@ class _html2text(sgmllib.SGMLParser):
         self.outcount = 0
         self.start = 1
         self.space = 0
-        self.a = []
         self.astack = []
-        self.acount = 0
         self.list = []
         self.blockquote = 0
         self.pre = 0
@@ -181,29 +184,6 @@ class _html2text(sgmllib.SGMLParser):
     def unknown_endtag(self, tag):
         self.handle_tag(tag, None, 0)
 
-    def previousIndex(self, attrs):
-        """ returns the index of certain set of attributes (of a link) in the
-            self.a list
-
-            If the set of attributes is not found, returns None
-        """
-        if not attrs.has_key('href'): return None
-
-        i = -1
-        for a in self.a:
-            i += 1
-            match = 0
-
-            if a.has_key('href') and a['href'] == attrs['href']:
-                if a.has_key('title') or attrs.has_key('title'):
-                    if (a.has_key('title') and attrs.has_key('title') and
-                        a['title'] == attrs['title']):
-                        match = True
-                else:
-                    match = True
-
-            if match: return i
-
     def handle_tag(self, tag, attrs, start):
         attrs = fixattrs(attrs)
 
@@ -268,34 +248,23 @@ class _html2text(sgmllib.SGMLParser):
             if self.astack:
                 a = self.astack.pop()
                 if a:
-                    i = self.previousIndex(a)
-                    if i is not None:
-                        a = self.a[i]
-                    else:
-                        self.acount += 1
-                        a['count'] = self.acount
-                        a['outcount'] = self.outcount
-                        self.a.append(a)
-                    self.o("][" + `a['count']` + "]")
+                    title = ''
+                    if a.has_key('title'):
+                        title = ' "%s"' % a['title']
+                    self.o('](%s%s)' % (a['href'], title))
 
         if tag == "img" and start:
             attrsD = {}
             for (x, y) in attrs: attrsD[x] = y
             attrs = attrsD
             if attrs.has_key('src'):
-                attrs['href'] = attrs['src']
                 alt = attrs.get('alt', '')
-                i = self.previousIndex(attrs)
-                if i is not None:
-                    attrs = self.a[i]
-                else:
-                    self.acount += 1
-                    attrs['count'] = self.acount
-                    attrs['outcount'] = self.outcount
-                    self.a.append(attrs)
                 self.o("![")
                 self.o(alt)
-                self.o("]["+`attrs['count']`+"]")
+                title = ''
+                if attrs.has_key('title'):
+                    title = ' "%s"' % attrs['title']
+                self.o('](%s%s)' % (attrs['src'], title))
 
         if tag == 'dl' and start: self.p()
         if tag == 'dt' and not start: self.pbr()
@@ -373,7 +342,6 @@ class _html2text(sgmllib.SGMLParser):
                     self.out("\n")
                 self.space = 0
 
-
             if self.p_p:
                 self.out(('\n'+bq)*self.p_p)
                 self.space = 0
@@ -382,22 +350,6 @@ class _html2text(sgmllib.SGMLParser):
             if not self.lastWasNL: self.out(' ')
             self.space = 0
 
-        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
-            if force == "end": self.out("\n")
-
-            newa = []
-            for link in self.a:
-                if self.outcount > link['outcount']:
-                    self.out("   ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
-                    if link.has_key('title'): self.out(" ("+link['title']+")")
-                    self.out("\n")
-                else:
-                    newa.append(link)
-
-            if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
-
-            self.a = newa
-
         if self.abbr_list and force == "end":
             for abbr, definition in self.abbr_list.items():
                 self.out("  *[" + abbr + "]: " + definition + "\n")
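The three notes below are illustrative sketches for the hunks above; they are not part of the patch. First, the gb2312-to-gbk normalization added to src/calibre/ebooks/chardet/__init__.py can be exercised directly. A minimal sketch, assuming the (text, encoding) return shape shown in the hunk; the sample byte string is made up for illustration:

    # Hedged sketch: HTML that declares charset=gb2312, as Microsoft Word emits.
    # '\xc4\xe3\xba\xc3' is "ni hao" in gbk (also valid gb2312).
    from calibre.ebooks.chardet import xml_to_unicode

    raw = ('<html><head><meta http-equiv="Content-Type" '
           'content="text/html; charset=gb2312"></head>'
           '<body>\xc4\xe3\xba\xc3</body></html>')
    text, encoding = xml_to_unicode(raw)
    print encoding  # expected 'gbk': the declared gb2312 is remapped to its superset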
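Second, the base.py hunk guards Metadata._clean_identifier() against None in either field, which callers can pass through set_identifiers(). A minimal illustration, calling the private method only to show the guard; the expected values follow the replace() chains in the hunk:

    # Hedged sketch: None no longer crashes in icu_lower()/strip().
    from calibre.ebooks.metadata.book.base import Metadata

    mi = Metadata(u'Unknown')
    print mi._clean_identifier('ISBN:', ' 8324616489,')  # typ lowered with ':'/',' removed, val's ',' -> '|'
    print mi._clean_identifier(None, None)               # (None, None), no AttributeError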
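Third, the html2text.py hunks drop the reference-style link bookkeeping (self.a, self.acount, previousIndex) in favour of inline Markdown links. A hedged before/after sketch, assuming the module-level html2text(html) entry point the upstream module exposes:

    # Hedged sketch: links and images now render inline at the point of use.
    from calibre.utils.html2text import html2text

    src = '<p>See <a href="http://example.com/" title="Example">this page</a>.</p>'
    print html2text(src)
    # before the patch (roughly): See [this page][1].
    #                                [1]: http://example.com/ (Example)
    # after the patch (roughly):  See [this page](http://example.com/ "Example").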