mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
The encoding in the O Globo recipe was wrong and unnecessary. With the original recipe the article titles were OK, but the encoding of the article body text was wrong. The fetched articles already include their own encoding information (UTF-8 in the one I tried), and changing the recipe to remove the `encoding` specification seems to work OK!
73 lines
3.0 KiB
Python
73 lines
3.0 KiB
Python
#!/usr/bin/env python
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
|
|
'''
|
|
oglobo.globo.com
|
|
'''
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
class OGlobo(BasicNewsRecipe):
    """News recipe for O Globo (oglobo.globo.com).

    Declares the publication metadata, the RSS feeds to read, the page
    elements to drop, and two hooks: ``print_version`` rewrites an article
    URL to request the ``service=print`` variant, and ``preprocess_html``
    removes inline ``style`` attributes from the parsed page.
    """

    # --- Publication metadata -------------------------------------------
    title = 'O Globo'
    __author__ = 'Darko Miletic and Carlos Laviola'
    description = 'News from Brasil'
    publisher = 'O Globo'
    category = 'news, politics, Brasil'
    language = 'pt_BR'
    cover_url = 'http://oglobo.globo.com/_img/o-globo.png'

    # --- Fetch / clean-up settings --------------------------------------
    # NOTE: no ``encoding`` attribute is set on purpose. Forcing one broke
    # the article body text; the fetched articles already declare their
    # own encoding.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    remove_javascript = True

    # --- Conversion options, assembled from the metadata above ----------
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher
        + '"\ncomments="' + description
        + '"\ntags="' + category + '"'
    )

    # Stylesheet supplied by the recipe (``no_stylesheets`` is True above).
    extra_css = '''
        cite{color:#007BB5; font-size:xx-small; font-style:italic;}
        body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        h3{font-size:large; color:#082963; font-weight:bold;}
        #ident{color:#0179B4; font-size:xx-small;}
        p{color:#000000;font-weight:normal;}
        .commentario p{color:#007BB5; font-style:italic;}
    '''

    # Elements stripped from every article page.
    remove_tags = [
        dict(name='script'),
        dict(name='form'),
        dict(name='div', attrs={'id': 'header'}),
        dict(name='p', attrs={'id': 'info-date-press'}),
    ]

    # (feed title, feed URL) pairs; every URL passes ``completo=true``.
    feeds = [
        (u'Todos os canais', u'http://oglobo.globo.com/rss.xml?completo=true'),
        (u'Ciencia', u'http://oglobo.globo.com/rss.xml?secao=ciencia&completo=true'),
        (u'Educacao', u'http://oglobo.globo.com/rss.xml?secao=educacao&completo=true'),
        (u'Opiniao', u'http://oglobo.globo.com/rss.xml?secao=opiniao&completo=true'),
        (u'Cultura', u'http://oglobo.globo.com/rss.xml?secao=cultura&completo=true'),
        (u'Esportes', u'http://oglobo.globo.com/rss.xml?secao=esportes&completo=true'),
        (u'Mundo', u'http://oglobo.globo.com/rss.xml?secao=mundo&completo=true'),
        (u'Pais', u'http://oglobo.globo.com/rss.xml?secao=pais&completo=true'),
        (u'Rio', u'http://oglobo.globo.com/rss.xml?secao=rio&completo=true'),
        (u'Saude', u'http://oglobo.globo.com/rss.xml?secao=saude&completo=true'),
        (u'Economia', u'http://oglobo.globo.com/rss.xml?secao=economia&completo=true'),
        (u'Tecnologia', u'http://oglobo.globo.com/rss.xml?secao=tecnologia&completo=true'),
    ]

    def print_version(self, url):
        """Return ``url`` with the ``service=print`` query parameter added.

        A URL without a query string or fragment gets ``?service=print``
        appended, exactly as before. If the URL already has a query string
        the parameter is joined with ``&`` instead of a second ``?``, and
        any ``#fragment`` is kept at the very end so the parameter stays
        in the query part of the URL.

        :param url: article URL as a string.
        :return: the URL requesting the print variant of the article.
        """
        # Split off the fragment first: a '?' that appears after '#' is
        # not a query delimiter and must not influence the separator.
        base, hash_mark, fragment = url.partition('#')
        separator = '&' if '?' in base else '?'
        return base + separator + 'service=print' + hash_mark + fragment

    def preprocess_html(self, soup):
        """Remove the ``style`` attribute from every tag that has one.

        :param soup: parsed article document; modified in place.
        :return: the same ``soup`` object.
        """
        for styled_tag in soup.findAll(style=True):
            del styled_tag['style']
        return soup
|