New recipes by Darko Miletic for Cuban, Brazilian, Chilean and Montenegrin news sources

This commit is contained in:
Kovid Goyal 2009-02-01 18:43:40 -08:00
parent 15107d2f71
commit 47e94f47cb
22 changed files with 546 additions and 1 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 685 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 942 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 534 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 393 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 701 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 636 B

View File

@ -24,7 +24,9 @@ recipe_modules = ['recipe_' + r for r in (
'joelonsoftware', 'telepolis', 'common_dreams', 'nin', 'tomshardware_de', 'joelonsoftware', 'telepolis', 'common_dreams', 'nin', 'tomshardware_de',
'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche', 'the_age', 'pagina12', 'infobae', 'ambito', 'elargentino', 'sueddeutsche', 'the_age',
'laprensa', 'amspec', 'freakonomics', 'criticadigital', 'elcronista', 'laprensa', 'amspec', 'freakonomics', 'criticadigital', 'elcronista',
'shacknews', 'teleread', 'shacknews', 'teleread', 'granma', 'juventudrebelde', 'juventudrebelde_english',
'la_tercera', 'el_mercurio_chile', 'la_cuarta', 'lanacion_chile', 'la_segunda',
'jb_online', 'estadao', 'o_globo', 'vijesti',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
emol.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ElMercurio(BasicNewsRecipe):
    """Recipe definition for El Mercurio online (emol.com).

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'El Mercurio online'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'
    cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Chile',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'despliegue-txt_750px'}),
        dict(name='div', attrs={'id': 'div_cuerpo_participa'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'contenedor_despliegue-col-left300'}),
        dict(name='div', attrs={'id': [
            'div_centro_dn_opc',
            'div_cabezera',
            'div_secciones',
            'div_contenidos',
            'div_pie',
            'nav',
        ]}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Noticias de ultima hora', u'http://www.emol.com/rss20/rss.asp?canal=0'),
        (u'Nacional', u'http://www.emol.com/rss20/rss.asp?canal=1'),
        (u'Mundo', u'http://www.emol.com/rss20/rss.asp?canal=2'),
        (u'Deportes', u'http://www.emol.com/rss20/rss.asp?canal=4'),
        (u'Magazine', u'http://www.emol.com/rss20/rss.asp?canal=6'),
        (u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5'),
        (u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7'),
    ]

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
estadao.com.br
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Estadao(BasicNewsRecipe):
    """Recipe definition for O Estado de S. Paulo (estadao.com.br).

    Declares the settings read by BasicNewsRecipe and overrides
    preprocess_html to drop the first iframe of each page.
    """

    # Publication metadata.
    title = 'O Estado de S. Paulo'
    __author__ = 'Darko Miletic'
    description = 'News from Brasil'
    cover_url = 'http://www.estadao.com.br/img/logo_estadao.png'

    # Fetch settings.
    encoding = 'utf8'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Brasil',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'c1'}),
    ]

    remove_tags = [
        dict(name=['script', 'object', 'form', 'ul']),
        dict(name='div', attrs={'id': ['votacao', 'estadaohoje']}),
        dict(name='p', attrs={'id': 'ctrl_texto'}),
        dict(name='p', attrs={'class': 'texto'}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Manchetes Estadao', u'http://www.estadao.com.br/rss/manchetes.xml'),
        (u'Ultimas noticias', u'http://www.estadao.com.br/rss/ultimas.xml'),
        (u'Nacional', u'http://www.estadao.com.br/rss/nacional.xml'),
        (u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml'),
        (u'Cidades', u'http://www.estadao.com.br/rss/cidades.xml'),
        (u'Esportes', u'http://www.estadao.com.br/rss/esportes.xml'),
        (u'Arte & Lazer', u'http://www.estadao.com.br/rss/arteelazer.xml'),
        (u'Economia', u'http://www.estadao.com.br/rss/economia.xml'),
        (u'Vida &', u'http://www.estadao.com.br/rss/vidae.xml'),
    ]

    def preprocess_html(self, soup):
        """Detach the first <iframe> in ``soup`` (if there is one).

        Returns the same ``soup`` object it was given.
        """
        first_iframe = soup.find('iframe')
        if not first_iframe:
            return soup
        first_iframe.extract()
        return soup

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
granma.cubaweb.cu
'''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe
class Granma(BasicNewsRecipe):
    """Recipe definition for Diario Granma (granma.cubaweb.cu).

    Declares the settings read by BasicNewsRecipe and overrides
    preprocess_html to strip inline ``style`` attributes from the layout
    table and one of its cells.
    """

    # Publication metadata.
    title = 'Diario Granma'
    __author__ = 'Darko Miletic'
    description = 'Organo oficial del Comite Central del Partido Comunista de Cuba'
    cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg'

    # Fetch settings.
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'

    # Flat list of options; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Cuba',
        '--publisher', title,
        '--ignore-tables',
    ]

    # The article table is selected by its fixed height attribute.
    keep_only_tags = [dict(name='table', attrs={'height': '466'})]

    # (section title, RSS URL) pairs.
    feeds = [(u'Noticias', u'http://www.granma.cubaweb.cu/noticias.xml')]

    def preprocess_html(self, soup):
        """Remove inline ``style`` attributes from the page layout.

        Clears ``style`` on the first table under ``<body>`` and on the
        ``<td height="458">`` cell. Either element may be missing (for
        example on an error page); missing elements are skipped instead of
        raising, matching the guard already used for the cell.

        Returns the same ``soup`` object it was given.
        """
        body = soup.body
        layout_table = body.table if body is not None else None
        if layout_table is not None:
            del layout_table['style']

        cell = soup.find('td', attrs={'height': '458'})
        if cell:
            del cell['style']
        return soup

View File

@ -0,0 +1,41 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
jbonline.terra.com.br
'''
from calibre.web.feeds.news import BasicNewsRecipe
class JBOnline(BasicNewsRecipe):
    """Recipe definition for JB Online (jbonline.terra.com.br).

    Declares the settings read by BasicNewsRecipe and overrides
    preprocess_html to drop the first iframe and the inline style of the
    article container.
    """

    # Publication metadata.
    title = 'Jornal Brasileiro Online'
    __author__ = 'Darko Miletic'
    description = 'News from Brasil'
    cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif'

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Brasil',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'corpoNoticia'}),
    ]
    remove_tags = [
        dict(name=['script', 'object', 'form']),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Todos as editorias', u'http://jbonline.terra.com.br/extra/rsstrjb.xml'),
    ]

    def preprocess_html(self, soup):
        """Clean up a downloaded page and return the same ``soup``.

        Detaches the first <iframe>, then deletes the ``style`` attribute of
        the ``corpoNoticia`` div; each step is skipped when its element is
        not found.
        """
        frame = soup.find('iframe')
        if frame:
            frame.extract()

        article_div = soup.find('div', attrs={'id': 'corpoNoticia'})
        if article_div:
            del article_div['style']

        return soup

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
juventudrebelde.cu
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Juventudrebelde(BasicNewsRecipe):
    """Recipe definition for Juventud Rebelde (juventudrebelde.cu).

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'Juventud Rebelde'
    __author__ = 'Darko Miletic'
    description = 'Diario de la Juventud Cubana'

    # NOTE: this is a class attribute, so strftime() runs once, when the
    # class body is executed, not each time the recipe is used.
    cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg')

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of options; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Cuba',
        '--publisher', title,
        '--ignore-tables',
    ]

    # Tag selector, expressed as a name/attrs dictionary.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'noticia'}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Generales', u'http://www.juventudrebelde.cu/rss/generales.php'),
        (u'Cuba', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=cuba'),
        (u'Internacionales', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=internacionales'),
        (u'Opinion', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=opinion'),
        (u'Cultura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=cultura'),
        (u'Deportes', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=deportes'),
        (u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura'),
    ]

View File

@ -0,0 +1,33 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
juventudrebelde.co.cu
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class Juventudrebelde_english(BasicNewsRecipe):
    """Recipe definition for the English edition of Juventud Rebelde.

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'Juventud Rebelde in english'
    __author__ = 'Darko Miletic'
    description = 'The newspaper of Cuban Youth'

    # Fetch settings.
    encoding = 'iso-8859-1'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of options; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Cuba',
        '--publisher', title,
        '--ignore-tables',
    ]

    # Tag selector, expressed as a name/attrs dictionary.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'read'}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/'),
    ]

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
lacuarta.cl
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LaCuarta(BasicNewsRecipe):
    """Recipe definition for La Cuarta (lacuarta.cl).

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'La Cuarta'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Chile',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs (or name/text) dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'articulo desplegado'}),
    ]

    remove_tags = [
        dict(name='script'),
        dict(name='ul'),
        dict(name='div', attrs={'id': ['toolbox', 'articleImageDisplayer', 'enviarAmigo']}),
        dict(name='div', attrs={'class': ['par ad-1', 'par ad-2']}),
        dict(name='input'),
        dict(name='p', attrs={'id': ['mensajeError', 'mensajeEnviandoNoticia', 'mensajeExito']}),
        dict(name='strong', text='PUBLICIDAD'),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE='),
    ]

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
lasegunda.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LaSegunda(BasicNewsRecipe):
    """Recipe definition for La Segunda (lasegunda.com).

    Declares the settings read by BasicNewsRecipe and overrides
    print_version to point article links at the site's printer-friendly
    page.
    """

    # Publication metadata.
    title = 'La Segunda'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'
    cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'

    # Fetch settings.
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'

    # Flat list of options; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Chile',
        '--publisher', title,
        '--ignore-tables',
    ]

    # Tag selector, expressed as a name dictionary.
    keep_only_tags = [dict(name='table')]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Noticias de ultima hora', u'http://www.lasegunda.com/rss20/index.asp?canal=0'),
        (u'Politica', u'http://www.lasegunda.com/rss20/index.asp?canal=21'),
        (u'Cronica', u'http://www.lasegunda.com/rss20/index.asp?canal=20'),
        (u'Internacional', u'http://www.lasegunda.com/rss20/index.asp?canal=23'),
        (u'Deportes', u'http://www.lasegunda.com/rss20/index.asp?canal=24'),
        (u'Epectaculos/Cultura', u'http://www.lasegunda.com/rss20/index.asp?canal=25'),
        (u'Educacion', u'http://www.lasegunda.com/rss20/index.asp?canal=26'),
        (u'Ciencia y Tecnologia', u'http://www.lasegunda.com/rss20/index.asp?canal=27'),
        (u'Solidaridad', u'http://www.lasegunda.com/rss20/index.asp?canal=28'),
        (u'Buena Vida', u'http://www.lasegunda.com/rss20/index.asp?canal=32'),
    ]

    def print_version(self, url):
        """Map an article URL to its printer-friendly URL.

        The article id is whatever follows ``index.asp?idnoticia=`` in
        ``url``. When that marker is absent, or nothing follows it, there is
        no id to build a print URL from, so ``url`` is returned unchanged
        rather than producing a print URL with an empty ``idnoticia``.
        """
        _, marker, article_id = url.partition('index.asp?idnoticia=')
        if not marker or not article_id:
            return url
        return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
latercera.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LaTercera(BasicNewsRecipe):
    """Recipe definition for La Tercera (latercera.com).

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'La Tercera'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Chile',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'span-16 articulo border'}),
    ]

    remove_tags = [
        dict(name='script'),
        dict(name='ul'),
        dict(name='div', attrs={'id': ['boxComentarios', 'shim', 'enviarAmigo']}),
        dict(name='div', attrs={'class': ['ad640', 'span-10 imgSet A', 'infoRelCol']}),
        dict(name='input'),
        dict(name='p', attrs={'id': ['mensajeError', 'mensajeEnviandoNoticia', 'mensajeExito']}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Noticias de ultima hora', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&ul=1'),
        (u'Pais', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=654'),
        (u'Mundo', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=678'),
        (u'Deportes', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=656'),
        (u'Negocios', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=655'),
        (u'Entretenimiento', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=661'),
        (u'Motores', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=665'),
        (u'Tendencias', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=659'),
        (u'Estilo', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=660'),
        (u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657'),
    ]

View File

@ -0,0 +1,44 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
lanacion.cl
'''
import urllib
from calibre.web.feeds.news import BasicNewsRecipe
class LaNacionChile(BasicNewsRecipe):
    """Recipe definition for La Nacion Chile (lanacion.cl).

    Declares the settings read by BasicNewsRecipe, maps article links to the
    site's print CGI, and cleans up the print page before conversion.
    """

    # Publication metadata.
    title = 'La Nacion Chile'
    __author__ = 'Darko Miletic'
    description = 'El sitio de noticias online de Chile'
    cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif'

    # Fetch settings.
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'cp1252'

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Chile',
        '--publisher', title,
    ]

    # Tag selector, expressed as a name/attrs dictionary.
    keep_only_tags = [dict(name='div', attrs={'class': 'bloque'})]

    # (section title, RSS URL) pairs.
    feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')]

    def print_version(self, url):
        """Return the print-CGI URL for ``url``.

        ``url`` is percent-quoted (leaving ``:`` and ``/`` intact) and passed
        as the ``_URL`` query parameter of ``imprimir.cgi``.
        """
        toprint = urllib.quote(url, ':/')
        return u'http://www.lanacion.cl/cgi-bx/imprimir.cgi?_URL=' + toprint

    def preprocess_html(self, soup):
        """Strip print-page scaffolding from ``soup`` and return it.

        Removes the ``onload`` attribute of ``<body>``, the ``<base>`` tag in
        ``<head>`` and the "close window" link. Any of these may be missing
        (for example on an error page); missing elements are skipped instead
        of raising, matching the guard already used for the link.
        """
        body = soup.body
        if body is not None:
            del body['onload']

        head = soup.head
        base_tag = head.base if head is not None else None
        if base_tag is not None:
            base_tag.extract()

        close_link = soup.find('a', attrs={'href': 'javascript:window.close()'})
        if close_link:
            close_link.extract()
        return soup

View File

@ -0,0 +1,57 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
oglobo.globo.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class OGlobo(BasicNewsRecipe):
    """Recipe definition for O Globo (oglobo.globo.com).

    Purely declarative: every attribute below is a setting read by the
    BasicNewsRecipe base class (exact semantics are defined there).
    """

    # Publication metadata.
    title = 'O Globo'
    __author__ = 'Darko Miletic'
    description = 'News from Brasil'
    cover_url = 'http://oglobo.globo.com/_img/o-globo.png'

    # Fetch settings.
    encoding = 'cp1252'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Flat list of option/value pairs; reuses description and title above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Brasil',
        '--publisher', title,
    ]

    # Tag selectors, expressed as name/attrs dictionaries.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'ltintb'}),
    ]

    remove_tags = [
        dict(name='script'),
        dict(name='object'),
        dict(name='form'),
        dict(name='div', attrs={'id': ['linksPatGoogle', 'rdpm', 'cor', 'com', 'env', 'rcm_st']}),
        dict(name='div', attrs={'class': 'box-zap-anu2'}),
        dict(name='a'),
        dict(name='link'),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Todos os canais', u'http://oglobo.globo.com/rss/plantao.xml'),
        (u'Ciencia', u'http://oglobo.globo.com/rss/plantaociencia.xml'),
        (u'Educacao', u'http://oglobo.globo.com/rss/plantaoeducacao.xml'),
        (u'Opiniao', u'http://oglobo.globo.com/rss/plantaoopiniao.xml'),
        (u'Sao Paulo', u'http://oglobo.globo.com/rss/plantaosaopaulo.xml'),
        (u'Viagem', u'http://oglobo.globo.com/rss/plantaoviagem.xml'),
        (u'Cultura', u'http://oglobo.globo.com/rss/plantaocultura.xml'),
        (u'Esportes', u'http://oglobo.globo.com/rss/plantaoesportes.xml'),
        (u'Mundo', u'http://oglobo.globo.com/rss/plantaomundo.xml'),
        (u'Pais', u'http://oglobo.globo.com/rss/plantaopais.xml'),
        (u'Rio', u'http://oglobo.globo.com/rss/plantaorio.xml'),
        (u'Saude', u'http://oglobo.globo.com/rss/plantaosaude.xml'),
        (u'Viver Melhor', u'http://oglobo.globo.com/rss/plantaovivermelhor.xml'),
        (u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml'),
        (u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml'),
    ]

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
vijesti.cg.yu
'''
import string,re
from calibre.web.feeds.news import BasicNewsRecipe
class Vijesti(BasicNewsRecipe):
    """Recipe definition for Vijesti (vijesti.cg.yu).

    Declares the settings read by BasicNewsRecipe and overrides
    preprocess_html to tag every page with the sr-Latn-ME language.
    """

    # Publication metadata.
    title = 'Vijesti'
    __author__ = 'Darko Miletic'
    description = 'News from Montenegro'
    cover_url = 'http://www.vijesti.cg.yu/img/logo.gif'

    # Fetch settings.
    encoding = 'cp1250'
    oldest_article = 2
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    # Each entry pairs a compiled pattern with a replacement callable;
    # here every U+0110 is replaced by U+00D0.
    preprocess_regexps = [
        (re.compile(u'\u0110'), lambda match: u'\u00D0'),
    ]

    # Flat list of option/value pairs; reuses description above.
    html2lrf_options = [
        '--comment', description,
        '--category', 'news, Montenegro',
        '--publisher', 'Daily Press Vijesti',
    ]

    # Tag selector, expressed as a name/attrs dictionary.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'mainnews'}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php'),
    ]

    def preprocess_html(self, soup):
        """Mark ``soup`` as sr-Latn-ME content and return it.

        Sets ``xml:lang`` and ``lang`` on the <html> element, then inserts a
        Content-Language <meta> markup string as the first child of <head>.
        """
        for attribute in ('xml:lang', 'lang'):
            soup.html[attribute] = 'sr-Latn-ME'
        soup.head.insert(0, '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>')
        return soup