calibre/recipes/estadao.recipe
2013-05-28 11:42:53 +05:30

130 lines
5.2 KiB
Plaintext

from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError
class Estadao(BasicNewsRecipe):
THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
LANGUAGE = 'pt_br'
language = 'pt'
LANGHTM = 'pt-br'
ENCODING = 'utf'
ENCHTM = 'utf-8'
directionhtm = 'ltr'
requires_version = (0,7,47)
news = True
title = u'Estad\xe3o'
__author__ = 'Euler Alves'
description = u'Brazilian news from Estad\xe3o'
publisher = u'Estad\xe3o'
category = 'news, rss'
oldest_article = 4
max_articles_per_feed = 100
summary_length = 1000
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
timefmt = ' [%d %b %Y (%a)]'
hoje = datetime.now()-timedelta(days=2)
pubdate = hoje.strftime('%a, %d %b')
if hoje.hour<10:
hoje = hoje-timedelta(days=1)
CAPA = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
SCREENSHOT = 'http://estadao.com.br/'
cover_margins = (0,0,'white')
masthead_url = 'http://www.estadao.com.br/estadao/novo/img/logo.png'
keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
remove_tags = [
dict(name='div',
attrs={'id':[
'bb-md-noticia-tabs'
]})
,dict(name='div',
attrs={'class':[
'tags'
,'discussion'
,'bb-gg adsense_container'
]})
,dict(name='a')
,dict(name='iframe')
,dict(name='link')
,dict(name='script')
]
feeds = [
(u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml')
,(u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml')
,(u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml')
,(u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml')
,(u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/')
,(u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml')
,(u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml')
,(u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml')
,(u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml')
]
conversion_options = {
'title' : title
,'comments' : description
,'publisher' : publisher
,'tags' : category
,'language' : LANGUAGE
,'linearize_tables': True
}
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
if not soup.find(attrs={'http-equiv':'Content-Language'}):
meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
soup.head.insert(0,meta0)
if not soup.find(attrs={'http-equiv':'Content-Type'}):
meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
soup.head.insert(0,meta1)
return soup
def postprocess_html(self, soup, first):
#process all the images. assumes that the new html has the correct path
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
width, height = img.size
print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
if img < 0:
raise RuntimeError('Out of memory')
pw = PixelWand()
if( width > height and width > 590) :
print 'Rotate image'
img.rotate(pw, -90)
img.save(iurl)
return soup
def get_cover_url(self):
if self.THUMBALIZR_API:
cover_url = self.CAPA
pedido = Request(self.CAPA)
pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
pedido.add_header('Accept-Charset',self.ENCHTM)
pedido.add_header('Referer',self.SCREENSHOT)
try:
resposta = urlopen(pedido)
soup = BeautifulSoup(resposta)
cover_item = soup.find('body')
if cover_item:
cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
return cover_url
except URLError:
cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
return cover_url