#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ---------------------------------------------------------------------------
# File added by the patch: resources/recipes/el_periodico.recipe
# ---------------------------------------------------------------------------

__license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.05'
__date__ = '07, December 2010'
'''
elperiodicodearagon.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe


class elperiodicodearagon(BasicNewsRecipe):
    """Recipe for the daily newspaper El Periodico de Aragon.

    Articles come from the site's per-section RSS feeds; each article page
    is reduced to the ``contenidos`` div and then stripped of sharing,
    navigation and comment widgets.
    """

    title = u'El Periodico de Aragon'
    __author__ = u'desUBIKado'
    description = u'Noticias desde Aragon'
    publisher = u'elperiodicodearagon.com'
    category = u'news, politics, Spain, Aragon'
    oldest_article = 2
    delay = 0
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    encoding = 'utf8'
    remove_empty_feeds = True
    remove_javascript = True

    # Metadata forwarded to the conversion step; reuses the values above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # (section title, RSS URL) pairs, one per section of the paper.
    feeds = [
        (u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
        (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
        (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
        (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
        (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
        (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
        (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
        (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
        (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
        (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml'),
    ]

    extra_css = '''
                    h3{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    dd{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                '''

    remove_attributes = ['height', 'width']

    keep_only_tags = [dict(name='div', attrs={'id': 'contenidos'})]

    # Strip all the clutter (toolbars, related links, sharing and comment
    # widgets). 'MasInformacion ' (with a trailing space) and
    # 'MasInformacion' are listed separately on purpose-looking grounds:
    # both spellings are present in the original list.
    remove_tags = [
        dict(name='ul', attrs={'class': 'herramientasDeNoticia'}),
        dict(name='span', attrs={'class': 'MasInformacion '}),
        dict(name='span', attrs={'class': 'MasInformacion'}),
        dict(name='div', attrs={'class': 'Middle'}),
        dict(name='div', attrs={'class': 'MenuCabeceraRZaragoza'}),
        dict(name='div', attrs={'id': 'MenuCabeceraRZaragoza'}),
        dict(name='div', attrs={'class': 'MenuEquipo'}),
        dict(name='div', attrs={'class': 'TemasRelacionados'}),
        dict(name='div', attrs={'class': 'GaleriaEnNoticia'}),
        dict(name='div', attrs={'class': 'Recorte'}),
        dict(name='div', attrs={'id': 'NoticiasenRecursos'}),
        dict(name='div', attrs={'id': 'NoticiaEnPapel'}),
        dict(name='p', attrs={'class': 'RecorteEnNoticias'}),
        dict(name='div', attrs={'id': 'Comparte'}),
        dict(name='div', attrs={'id': 'CajaComparte'}),
        dict(name='a', attrs={'class': 'EscribirComentario'}),
        dict(name='a', attrs={'class': 'AvisoComentario'}),
        dict(name='div', attrs={'class': 'CajaAvisoComentario'}),
        dict(name='div', attrs={'class': 'navegaNoticias'}),
        dict(name='div', attrs={'id': 'PaginadorDiCom'}),
        dict(name='div', attrs={'id': 'CajaAccesoCuentaUsuario'}),
        dict(name='div', attrs={'id': 'CintilloComentario'}),
        dict(name='div', attrs={'id': 'EscribeComentario'}),
        dict(name='div', attrs={'id': 'FormularioComentario'}),
        dict(name='div', attrs={'id': 'FormularioNormas'}),
    ]

    def get_cover_url(self):
        """Return the URL of the print-edition front page, or None.

        Scans the images on the PDF-edition index page for the first one
        whose ``src`` is a front-page preview and returns its ``format=1``
        variant (the format=1 image has the higher resolution).

        Returns:
            The cover image URL, or None when no preview image is found.
            A preview URL that does not end in ``format=2`` is returned
            unchanged rather than having ``format=1`` blindly appended.
        """
        index = 'http://pdf.elperiodicodearagon.com/'
        preview_prefix = 'http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='
        low_res = 'format=2'
        high_res = 'format=1'

        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            src = image['src']
            if not src.startswith(preview_prefix):
                continue
            # str.rstrip() takes a *set of characters*, not a suffix, so
            # rstrip('format=2') only worked as long as the character just
            # before the suffix was not one of "format=2". Replace the
            # exact suffix instead.
            if src.endswith(low_res):
                return src[:-len(low_res)] + high_res
            return src
        return None

    # Entries 1 and 2: remove blank space between the article and the
    # comments.
    # Entry 3: the index did not point correctly at the start of the article.
    #
    # NOTE(review): in the reviewed text these three patterns (and the
    # replacement of entry 3) contain only line breaks and a space; the
    # comments above suggest they originally matched HTML markup that was
    # lost before review. They are reproduced as seen -- restore the real
    # patterns from the upstream recipe before relying on them.
    preprocess_regexps = [
        (re.compile(r'\n\n \n\n', re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'\n\n', re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'\n\n', re.DOTALL | re.IGNORECASE), lambda match: '\n\n'),
    ]


# ---------------------------------------------------------------------------
# File added by the patch: resources/recipes/red_aragon.recipe
# ---------------------------------------------------------------------------

__license__ = 'GPL v3'
__copyright__ = '11 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Entertainment guide from Aragon'
__version__ = 'v0.01'
__date__ = '11, December 2010'
'''
[url]http://www.redaragon.es/[/url]
'''

from calibre.web.feeds.news import BasicNewsRecipe


# NOTE(review): the class is named ``heraldo`` although the recipe is for
# RedAragon -- presumably copied from another recipe. Left unchanged here
# because it is the block's public name; confirm whether it can be renamed.
class heraldo(BasicNewsRecipe):
    """Recipe for the RedAragon entertainment guide.

    Each feed is one category of the site's events agenda; article pages are
    reduced to the ``FichaEventoAgenda`` div.
    """

    __author__ = 'desUBIKado'
    description = u'Guia de ocio desde Aragon'
    title = u'RedAragon'
    publisher = 'Grupo Z'
    category = 'Concerts, Movies, Entertainment news'
    cover_url = 'http://www.redaragon.com/2008_img/logotipo.gif'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 15
    max_articles_per_feed = 100
    encoding = 'iso-8859-1'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True

    # (category title, RSS URL) pairs; ``tid`` selects the agenda category.
    feeds = [
        (u'Conciertos', u'http://redaragon.com/rss/agenda.asp?tid=1'),
        (u'Exposiciones', u'http://redaragon.com/rss/agenda.asp?tid=5'),
        (u'Teatro', u'http://redaragon.com/rss/agenda.asp?tid=10'),
        (u'Conferencias', u'http://redaragon.com/rss/agenda.asp?tid=2'),
        (u'Ferias', u'http://redaragon.com/rss/agenda.asp?tid=6'),
        (u'Filmotecas/Cineclubs', u'http://redaragon.com/rss/agenda.asp?tid=7'),
        (u'Presentaciones', u'http://redaragon.com/rss/agenda.asp?tid=9'),
        (u'Fiestas', u'http://redaragon.com/rss/agenda.asp?tid=11'),
        (u'Infantil', u'http://redaragon.com/rss/agenda.asp?tid=13'),
        (u'Otros', u'http://redaragon.com/rss/agenda.asp?tid=8'),
    ]

    keep_only_tags = [dict(name='div', attrs={'id': 'FichaEventoAgenda'})]

    remove_tags = [
        dict(name='div',
             attrs={'class': ['Comparte', 'CajaAgenda', 'Caja', 'Cintillo']}),
    ]

    remove_tags_before = dict(name='div', attrs={'id': 'FichaEventoAgenda'})

    remove_tags_after = dict(name='div', attrs={'class': 'Cintillo'})