calibre/recipes/o_globo.recipe
david weil 4432e3b837 Encoding in OGlobo recipe was wrong and unnecessar
Encoding in O'Globo recipe was wrong and unnecessary.

With the original recipe the article titles were OK, but the encoding of the article body text was wrong.

The fetched articles already include their encoding information (UTF-8 in the ones I tried), and changing the recipe to remove the `encoding` specification seems to work fine!
2014-01-05 12:26:22 -02:00

73 lines
3.0 KiB
Python

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
oglobo.globo.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class OGlobo(BasicNewsRecipe):
    """News recipe for the Brazilian newspaper O Globo (oglobo.globo.com).

    Declares the publication metadata, the RSS feeds to read, the tags to
    drop from fetched pages, and two hooks: ``print_version`` (rewrites an
    article URL to its ``?service=print`` form) and ``preprocess_html``
    (removes inline ``style`` attributes).
    """

    # --- Publication metadata ---------------------------------------------
    title = 'O Globo'
    __author__ = 'Darko Miletic and Carlos Laviola'
    description = 'News from Brasil'
    publisher = 'O Globo'
    category = 'news, politics, Brasil'
    language = 'pt_BR'
    cover_url = 'http://oglobo.globo.com/_img/o-globo.png'

    # --- Fetch / clean-up settings ------------------------------------------
    # NOTE(review): the exact meaning of these knobs (e.g. the unit of
    # ``oldest_article``) is defined by BasicNewsRecipe, not by this file.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    # --- Output options, derived from the metadata above --------------------
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]
    html2epub_options = (
        'publisher="' + publisher + '"'
        + '\ncomments="' + description + '"'
        + '\ntags="' + category + '"'
    )

    # Stylesheet text, one rule per line; the value starts with a newline and
    # every rule is newline-terminated.
    extra_css = (
        '\n'
        'cite{color:#007BB5; font-size:xx-small; font-style:italic;}\n'
        'body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}\n'
        'h3{font-size:large; color:#082963; font-weight:bold;}\n'
        '#ident{color:#0179B4; font-size:xx-small;}\n'
        'p{color:#000000;font-weight:normal;}\n'
        '.commentario p{color:#007BB5; font-style:italic;}\n'
    )

    # Tag matchers for page elements that should be removed.
    remove_tags = [
        {'name': 'script'},
        {'name': 'form'},
        {'name': 'div', 'attrs': {'id': 'header'}},
        {'name': 'p', 'attrs': {'id': 'info-date-press'}},
    ]

    # (feed title, feed URL) pairs: the all-channels feed plus one per section.
    feeds = [
        (u'Todos os canais', u'http://oglobo.globo.com/rss.xml?completo=true'),
        (u'Ciencia', u'http://oglobo.globo.com/rss.xml?secao=ciencia&completo=true'),
        (u'Educacao', u'http://oglobo.globo.com/rss.xml?secao=educacao&completo=true'),
        (u'Opiniao', u'http://oglobo.globo.com/rss.xml?secao=opiniao&completo=true'),
        (u'Cultura', u'http://oglobo.globo.com/rss.xml?secao=cultura&completo=true'),
        (u'Esportes', u'http://oglobo.globo.com/rss.xml?secao=esportes&completo=true'),
        (u'Mundo', u'http://oglobo.globo.com/rss.xml?secao=mundo&completo=true'),
        (u'Pais', u'http://oglobo.globo.com/rss.xml?secao=pais&completo=true'),
        (u'Rio', u'http://oglobo.globo.com/rss.xml?secao=rio&completo=true'),
        (u'Saude', u'http://oglobo.globo.com/rss.xml?secao=saude&completo=true'),
        (u'Economia', u'http://oglobo.globo.com/rss.xml?secao=economia&completo=true'),
        (u'Tecnologia', u'http://oglobo.globo.com/rss.xml?secao=tecnologia&completo=true'),
    ]

    def print_version(self, url):
        """Return *url* with ``?service=print`` appended."""
        printable_url = url + '?service=print'
        return printable_url

    def preprocess_html(self, soup):
        """Delete the ``style`` attribute from every tag in *soup* that has one.

        The same *soup* object is modified in place and returned.
        """
        for styled_tag in soup.findAll(style=True):
            del styled_tag['style']
        return soup