Updated LifeHacker

Kovid Goyal 2011-03-30 23:02:14 -06:00
parent f970ac5744
commit a34e318107
5 changed files with 380 additions and 320 deletions

View File

@@ -1,134 +1,129 @@
-#!/usr/bin/env python
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import datetime, timedelta
 from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
 from calibre.utils.magick import Image, PixelWand
 from urllib2 import Request, urlopen, URLError

 class Estadao(BasicNewsRecipe):
-    THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/
+    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
     LANGUAGE = 'pt_br'
     language = 'pt'
     LANGHTM = 'pt-br'
     ENCODING = 'utf'
     ENCHTM = 'utf-8'
     directionhtm = 'ltr'
-    requires_version = (0,8,47)
+    requires_version = (0,7,47)
     news = True
-    publication_type = 'newsportal'

-    title = u'Estadao'
+    title = u'Estad\xe3o'
     __author__ = 'Euler Alves'
     description = u'Brazilian news from Estad\xe3o'
     publisher = u'Estad\xe3o'
     category = 'news, rss'

     oldest_article = 4
     max_articles_per_feed = 100
     summary_length = 1000

     remove_javascript = True
     no_stylesheets = True
     use_embedded_content = False
     remove_empty_feeds = True
     timefmt = ' [%d %b %Y (%a)]'

-    html2lrf_options = [
-        '--comment', description
-        ,'--category', category
-        ,'--publisher', publisher
-    ]
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
-
     hoje = datetime.now()-timedelta(days=2)
     pubdate = hoje.strftime('%a, %d %b')
     if hoje.hour<10:
         hoje = hoje-timedelta(days=1)
     CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
     SCREENSHOT = 'http://estadao.com.br/'
     cover_margins = (0,0,'white')
     masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png'

     keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
     remove_tags = [
         dict(name='div',
             attrs={'id':[
                 'bb-md-noticia-tabs'
             ]})
         ,dict(name='div',
             attrs={'class':[
                 'tags'
                 ,'discussion'
                 ,'bb-gg adsense_container'
             ]})

         ,dict(name='a')
         ,dict(name='iframe')
         ,dict(name='link')
         ,dict(name='script')
     ]

     feeds = [
         (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml')
         ,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml')
         ,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml')
         ,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml')
         ,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/')
         ,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml')
         ,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml')
         ,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml')
         ,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml')
     ]

     conversion_options = {
         'title'       : title
         ,'comments'   : description
         ,'publisher'  : publisher
         ,'tags'       : category
         ,'language'   : LANGUAGE
         ,'linearize_tables': True
     }

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         if not soup.find(attrs={'http-equiv':'Content-Language'}):
             meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
             soup.head.insert(0,meta0)
         if not soup.find(attrs={'http-equiv':'Content-Type'}):
             meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
             soup.head.insert(0,meta1)
         return soup

     def postprocess_html(self, soup, first):
         #process all the images. assumes that the new html has the correct path
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
             iurl = tag['src']
             img = Image()
             img.open(iurl)
             width, height = img.size
             print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            if img < 0:
+                raise RuntimeError('Out of memory')
             pw = PixelWand()
             if( width > height and width > 590) :
                 print 'Rotate image'
                 img.rotate(pw, -90)
                 img.save(iurl)
         return soup

     def get_cover_url(self):
-        cover_url = self.CAPA
-        pedido = Request(self.CAPA)
-        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
-        pedido.add_header('Accept-Charset',self.ENCHTM)
-        pedido.add_header('Referer',self.SCREENSHOT)
-        try:
-            resposta = urlopen(pedido)
-            soup = BeautifulSoup(resposta)
-            cover_item = soup.find('body')
-            if cover_item:
-                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
-        except URLError:
-            cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
-            return cover_url
+        if self.THUMBALIZR_API:
+            cover_url = self.CAPA
+            pedido = Request(self.CAPA)
+            pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
+            pedido.add_header('Accept-Charset',self.ENCHTM)
+            pedido.add_header('Referer',self.SCREENSHOT)
+            try:
+                resposta = urlopen(pedido)
+                soup = BeautifulSoup(resposta)
+                cover_item = soup.find('body')
+                if cover_item:
+                    cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+                return cover_url
+            except URLError:
+                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
+                return cover_url
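
The CAPA address above is keyed to the print edition's date. A minimal standalone sketch of the same date arithmetic (Python 2, like the recipe), assuming, as the recipe's rollback implies, that a given day's cover scan only becomes available mid-morning:

# Standalone sketch of the CAPA date logic used by the Estadao recipe.
# Assumption carried over from the recipe: before 10:00 the newest
# available cover scan is one day older still, hence the extra rollback.
from datetime import datetime, timedelta

hoje = datetime.now() - timedelta(days=2)
if hoje.hour < 10:
    hoje = hoje - timedelta(days=1)
capa = ('http://www.estadao.com.br/estadaodehoje/'
        + hoje.strftime('%Y%m%d') + '/img/capadodia.jpg')
print capa

Running it prints the same capadodia.jpg address that get_cover_url would request.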

View File

@@ -1,149 +1,151 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import datetime, timedelta
 from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
 from calibre.utils.magick import Image, PixelWand
 from urllib2 import Request, urlopen, URLError

 class FolhaOnline(BasicNewsRecipe):
-    THUMBALIZR_API = "0123456789abcdef01234567890" # ---->Get your at http://www.thumbalizr.com/
+    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
     LANGUAGE = 'pt_br'
     language = 'pt'
     LANGHTM = 'pt-br'
     ENCODING = 'cp1252'
     ENCHTM = 'iso-8859-1'
     directionhtm = 'ltr'
-    requires_version = (0,8,47)
+    requires_version = (0,7,47)
     news = True
-    publication_type = 'newsportal'

     title = u'Folha de S\xE3o Paulo'
     __author__ = 'Euler Alves'
     description = u'Brazilian news from Folha de S\xE3o Paulo'
     publisher = u'Folha de S\xE3o Paulo'
     category = 'news, rss'

     oldest_article = 4
     max_articles_per_feed = 100
     summary_length = 1000

     remove_javascript = True
     no_stylesheets = True
     use_embedded_content = False
     remove_empty_feeds = True
     timefmt = ' [%d %b %Y (%a)]'

     html2lrf_options = [
         '--comment', description
         ,'--category', category
         ,'--publisher', publisher
     ]

     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

     hoje = datetime.now()
     pubdate = hoje.strftime('%a, %d %b')
     if hoje.hour<6:
         hoje = hoje-timedelta(days=1)
     CAPA = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg'
     SCREENSHOT = 'http://www1.folha.uol.com.br/'
     cover_margins = (0,0,'white')
     masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

     keep_only_tags = [dict(name='div', attrs={'id':'articleNew'})]
     remove_tags = [
         dict(name='div',
             attrs={'id':[
                 'articleButton'
                 ,'bookmarklets'
                 ,'ad-180x150-1'
                 ,'contextualAdsArticle'
                 ,'articleEnd'
                 ,'articleComments'
             ]})
         ,dict(name='div',
             attrs={'class':[
                 'openBox adslibraryArticle'
             ]})

         ,dict(name='a')
         ,dict(name='iframe')
         ,dict(name='link')
         ,dict(name='script')
     ]

     feeds = [
         (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
         ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml')
         ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml')
         ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml')
         ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml')
         ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml')
         ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml')
         ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml')
         ,(u'Pelo Mundo', u'http://feeds.folha.uol.com.br/pelomundo.folha.rssblog.uol.com.br/')
         ,(u'Circuito integrado', u'http://feeds.folha.uol.com.br/circuitointegrado.folha.rssblog.uol.com.br/')
         ,(u'Blog do Fred', u'http://feeds.folha.uol.com.br/blogdofred.folha.rssblog.uol.com.br/')
         ,(u'Maria In\xEAs Dolci', u'http://feeds.folha.uol.com.br/mariainesdolci.folha.blog.uol.com.br/')
         ,(u'Eduardo Ohata', u'http://feeds.folha.uol.com.br/folha/pensata/eduardoohata/rss091.xml')
         ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/folha/pensata/kennedyalencar/rss091.xml')
         ,(u'Eliane Catanh\xEAde', u'http://feeds.folha.uol.com.br/folha/pensata/elianecantanhede/rss091.xml')
         ,(u'Fernado Canzian', u'http://feeds.folha.uol.com.br/folha/pensata/fernandocanzian/rss091.xml')
         ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/folha/pensata/gilbertodimenstein/rss091.xml')
         ,(u'H\xE9lio Schwartsman', u'http://feeds.folha.uol.com.br/folha/pensata/helioschwartsman/rss091.xml')
         ,(u'Jo\xE3o Pereira Coutinho', u'http://http://feeds.folha.uol.com.br/folha/pensata/joaopereiracoutinho/rss091.xml')
         ,(u'Luiz Caversan', u'http://http://feeds.folha.uol.com.br/folha/pensata/luizcaversan/rss091.xml')
         ,(u'S\xE9rgio Malbergier', u'http://http://feeds.folha.uol.com.br/folha/pensata/sergiomalbergier/rss091.xml')
         ,(u'Valdo Cruz', u'http://http://feeds.folha.uol.com.br/folha/pensata/valdocruz/rss091.xml')
     ]

     conversion_options = {
         'title'       : title
         ,'comments'   : description
         ,'publisher'  : publisher
         ,'tags'       : category
         ,'language'   : LANGUAGE
         ,'linearize_tables': True
     }

     def preprocess_html(self, soup):
         for item in soup.findAll(style=True):
             del item['style']
         if not soup.find(attrs={'http-equiv':'Content-Language'}):
             meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
             soup.head.insert(0,meta0)
         if not soup.find(attrs={'http-equiv':'Content-Type'}):
             meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
             soup.head.insert(0,meta1)
         return soup

     def postprocess_html(self, soup, first):
         #process all the images. assumes that the new html has the correct path
         for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
             iurl = tag['src']
             img = Image()
             img.open(iurl)
             width, height = img.size
             print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            if img < 0:
+                raise RuntimeError('Out of memory')
             pw = PixelWand()
             if( width > height and width > 590) :
                 print 'Rotate image'
                 img.rotate(pw, -90)
                 img.save(iurl)
         return soup

     def get_cover_url(self):
         cover_url = self.CAPA
         pedido = Request(self.CAPA)
         pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
         pedido.add_header('Accept-Charset',self.ENCHTM)
         pedido.add_header('Referer',self.SCREENSHOT)
         try:
             resposta = urlopen(pedido)
             soup = BeautifulSoup(resposta)
             cover_item = soup.find('body')
             if cover_item:
                 cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
             return cover_url
         except URLError:
             cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
             return cover_url
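
Both Brazilian recipes fall back to the Thumbalizr screenshot API when the scanned front-page image cannot be fetched. A minimal sketch of that URL assembly; the key value below is a hypothetical placeholder, since this commit empties THUMBALIZR_API and leaves each user to paste in their own key from thumbalizr.com:

# Sketch of the Thumbalizr fallback URL built in get_cover_url.
# 'your-key-here' is a hypothetical placeholder, not a real key.
THUMBALIZR_API = 'your-key-here'
SCREENSHOT = 'http://www1.folha.uol.com.br/'
cover_url = ('http://api.thumbalizr.com/?api_key=' + THUMBALIZR_API
             + '&url=' + SCREENSHOT + '&width=600&quality=90')
print cover_url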

Binary file not shown.

Before: 1.7 KiB
After: 714 B

Binary file not shown.

After: 392 B

View File

@@ -1,37 +1,100 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, NA'
-'''
-lifehacker.com
-'''
-
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class Lifehacker(BasicNewsRecipe):
-    title = 'Lifehacker'
-    __author__ = 'Kovid Goyal'
-    description = "Computers make us more productive. Yeah, right. Lifehacker recommends the software downloads and web sites that actually save time. Don't live to geek; geek to live."
-    publisher = 'lifehacker.com'
-    category = 'news, IT, Internet, gadgets, tips and tricks, howto, diy'
-    oldest_article = 2
-    max_articles_per_feed = 100
-    no_stylesheets = True
-    encoding = 'utf-8'
-    use_embedded_content = True
-    language = 'en'
-    masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png'
-    conversion_options = {
-        'comment'   : description
-        , 'tags'     : category
-        , 'publisher' : publisher
-        , 'language'  : language
-    }
-
-    remove_tags = [
-        {'class': 'feedflare'},
-    ]
-
-    feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')]
-
-    def preprocess_html(self, soup):
-        return self.adeify_images(soup)
+from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import datetime
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.utils.magick import Image, PixelWand
+
+class LifeHacker(BasicNewsRecipe):
+    THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
+    LANGUAGE = 'en'
+    LANGHTM = 'en'
+    language = 'en'
+    ENCODING = 'utf'
+    ENCHTM = 'utf-8'
+    requires_version = (0,7,47)
+    news = True
+
+    title = u'LifeHacker'
+    __author__ = 'Euler Alves'
+    description = u'Tips, tricks, and downloads for getting things done.'
+    publisher = u'lifehacker.com'
+    author = u'Adam Pash & Kevin Purdy & Adam Dachis & Whitson Gordon & Gina Trapani'
+    category = 'news, rss'
+
+    oldest_article = 4
+    max_articles_per_feed = 20
+    summary_length = 1000
+
+    remove_javascript = True
+    no_stylesheets = True
+    use_embedded_content = True
+    remove_empty_feeds = True
+    timefmt = ' [%d %b %Y (%a)]'
+
+    hoje = datetime.now()
+    pubdate = hoje.strftime('%a, %d %b')
+    cover_url = 'http://api.thumbalizr.com/?api_key='+THUMBALIZR_API+'&url=http://lifehacker.com&width=600&quality=90'
+    cover_margins = (0,0,'white')
+    masthead_url = 'http://cache.gawkerassets.com/assets/lifehacker.com/img/logo.png'
+
+    remove_tags = [
+        {'class': 'feedflare'},
+        dict(name='div',
+            attrs={'class':[
+                'ad_container'
+                ,'ad_300x250'
+                ,'ad_interstitial'
+                ,'share-wrap'
+                ,'ad_300x600'
+                ,'ad_perma-footer-adsense'
+                ,'ad_perma-panorama'
+                ,'ad panorama'
+                ,'ad_container'
+            ]})
+        ,dict(name='div',
+            attrs={'id':[
+                'agegate_container'
+                ,'agegate_container_rejected'
+                ,'sharemenu-wrap'
+            ]})
+    ]
+
+    feeds = [(u'Articles', u'http://feeds.gawker.com/lifehacker/vip?format=xml')]
+
+    conversion_options = {
+        'title'       : title
+        ,'comments'   : description
+        ,'publisher'  : publisher
+        ,'tags'       : category
+        ,'language'   : LANGUAGE
+        ,'linearize_tables': True
+    }
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        if not soup.find(attrs={'http-equiv':'Content-Language'}):
+            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
+            soup.head.insert(0,meta0)
+        if not soup.find(attrs={'http-equiv':'Content-Type'}):
+            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
+            soup.head.insert(0,meta1)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        #process all the images. assumes that the new html has the correct path
+        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
+            iurl = tag['src']
+            img = Image()
+            img.open(iurl)
+            width, height = img.size
+            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
+            if img < 0:
+                raise RuntimeError('Out of memory')
+            pw = PixelWand()
+            if( width > height and width > 590) :
+                print 'Rotate image'
+                img.rotate(pw, -90)
+                img.save(iurl)
+        return soup