'
'''
www.businessworld.in
'''
@@ -22,7 +20,11 @@ class BusinessWorldMagazine(BasicNewsRecipe):
use_embedded_content = False
encoding = 'utf-8'
language = 'en_IN'
-
+ extra_css = """
+ img{display: block; margin-bottom: 0.5em}
+ body{font-family: Arial,Helvetica,sans-serif}
+ h2{color: gray; display: block}
+ """
conversion_options = {
'comment' : description
@@ -42,7 +44,26 @@ class BusinessWorldMagazine(BasicNewsRecipe):
articles = []
linklist = []
soup = self.index_to_soup(self.INDEX)
-
+
+ tough = soup.find('div', attrs={'id':'tough'})
+ if tough:
+ for item in tough.findAll('h1'):
+ description = ''
+ title_prefix = ''
+ feed_link = item.find('a')
+ if feed_link and feed_link.has_key('href'):
+ url = self.ROOT + feed_link['href']
+ if not self.is_in_list(linklist,url):
+ title = title_prefix + self.tag_to_string(feed_link)
+ date = strftime(self.timefmt)
+ articles.append({
+ 'title' :title
+ ,'date' :date
+ ,'url' :url
+ ,'description':description
+ })
+ linklist.append(url)
+
for item in soup.findAll('div', attrs={'class':'nametitle'}):
description = ''
title_prefix = ''
@@ -62,8 +83,8 @@ class BusinessWorldMagazine(BasicNewsRecipe):
return [(soup.head.title.string, articles)]
- keep_only_tags = [dict(name='div', attrs={'id':['register-panel','printwrapper']})]
- remove_tags = [dict(name=['object','link'])]
+ keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})]
+ remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]
def print_version(self, url):
return url.replace('/bw/','/bw/storyContent/')
diff --git a/resources/recipes/el_periodico.recipe b/resources/recipes/el_periodico.recipe
new file mode 100644
index 0000000000..2c3ed456fb
--- /dev/null
+++ b/resources/recipes/el_periodico.recipe
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = '04 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Daily newspaper from Aragon'
+__version__ = 'v0.05'
+__date__ = '07, December 2010'
+'''
+elperiodicodearagon.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class elperiodicodearagon(BasicNewsRecipe):
+ title = u'El Periodico de Aragon'
+ __author__ = u'desUBIKado'
+ description = u'Noticias desde Aragon'
+ publisher = u'elperiodicodearagon.com'
+ category = u'news, politics, Spain, Aragon'
+ oldest_article = 2
+ delay = 0
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'es'
+ encoding = 'utf8'
+ remove_empty_feeds = True
+ remove_javascript = True
+
+
+ conversion_options = {
+ 'comments' : description
+ ,'tags' : category
+ ,'language' : language
+ ,'publisher' : publisher
+ }
+
+ feeds = [(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
+ (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
+ (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
+ (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
+ (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
+ (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
+ (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
+ (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
+ (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
+ (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')]
+
+
+ extra_css = '''
+ h3{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ dd{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ '''
+
+ remove_attributes = ['height','width']
+
+ keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
+
+
+ # Quitar toda la morralla
+
+ remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
+ dict(name='span', attrs={'class':'MasInformacion '}),
+ dict(name='span', attrs={'class':'MasInformacion'}),
+ dict(name='div', attrs={'class':'Middle'}),
+ dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
+ dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
+ dict(name='div', attrs={'class':'MenuEquipo'}),
+ dict(name='div', attrs={'class':'TemasRelacionados'}),
+ dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
+ dict(name='div', attrs={'class':'Recorte'}),
+ dict(name='div', attrs={'id':'NoticiasenRecursos'}),
+ dict(name='div', attrs={'id':'NoticiaEnPapel'}),
+ dict(name='p', attrs={'class':'RecorteEnNoticias'}),
+ dict(name='div', attrs={'id':'Comparte'}),
+ dict(name='div', attrs={'id':'CajaComparte'}),
+ dict(name='a', attrs={'class':'EscribirComentario'}),
+ dict(name='a', attrs={'class':'AvisoComentario'}),
+ dict(name='div', attrs={'class':'CajaAvisoComentario'}),
+ dict(name='div', attrs={'class':'navegaNoticias'}),
+ dict(name='div', attrs={'id':'PaginadorDiCom'}),
+ dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
+ dict(name='div', attrs={'id':'CintilloComentario'}),
+ dict(name='div', attrs={'id':'EscribeComentario'}),
+ dict(name='div', attrs={'id':'FormularioComentario'}),
+ dict(name='div', attrs={'id':'FormularioNormas'})]
+
+ # Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion)
+
+    def get_cover_url(self):
+        """Return the URL of the high-resolution print front page, or None.
+
+        Scans the <img> tags of the PDF index page for the front-page
+        preview image and swaps its trailing ``format=2`` for ``format=1``
+        (per the original author's note, the format=1 image has the higher
+        resolution). Returns None when no preview image is found.
+        """
+        index = 'http://pdf.elperiodicodearagon.com/'
+        preview = 'http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='
+        low_res = 'format=2'
+        soup = self.index_to_soup(index)
+        for image in soup.findAll('img', src=True):
+            src = image['src']
+            if src.startswith(preview):
+                # str.rstrip() strips a *set of characters*, not a suffix, so
+                # it could also eat trailing characters of the edition id.
+                # Remove the literal suffix instead.
+                if src.endswith(low_res):
+                    src = src[:-len(low_res)]
+                return src + 'format=1'
+        return None
+
+ # Para quitar espacios entre la noticia y los comentarios (lineas 1 y 2)
+ # El indice no apuntaba correctamente al empiece de la noticia (linea 3)
+
+ preprocess_regexps = [
+ (re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: ''),
+ (re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: ''),
+ (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: '
')
+ ]
diff --git a/resources/recipes/el_universal.recipe b/resources/recipes/el_universal.recipe
index 1995d0f932..f053812c05 100644
--- a/resources/recipes/el_universal.recipe
+++ b/resources/recipes/el_universal.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic '
+__copyright__ = '2009-2010, Darko Miletic '
'''
eluniversal.com.mx
'''
@@ -18,75 +16,25 @@ class ElUniversal(BasicNewsRecipe):
category = 'news, politics, Mexico'
no_stylesheets = True
use_embedded_content = False
- encoding = 'cp1252'
+ encoding = 'utf8'
remove_javascript = True
- language = 'es'
+ remove_empty_feeds = True
+ publication_type = 'newspaper'
+ language = 'es'
extra_css = '''
- body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
- .geoGris30{font-family:Georgia,"Times New Roman",Times,serif; font-size:large; color:#003366; font-weight:bold;}
- .arnegro16{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;}
- .tbazull2{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color:#336699; font-size:xx-small;}
- .tbgrisf11{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #666666; font-size:xx-small;}
- .verrojo13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #CC0033; font-size:xx-small;}
- .trnegro13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
- .txt-fotogaleria{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
+ body{font-family:Arial,Helvetica,sans-serif}
+ .noteTitle{font-family: Georgia,"Times New Roman",Times,serif; color: #336699; font-size: xx-large; font-weight: bold}
+ .noteInfo{display: block; color: gray}
'''
- keep_only_tags = [ dict(name='table', attrs={'width':"633"}),dict(name='table', attrs={'width':"629"}),]
-
+ keep_only_tags = [ dict(name='div', attrs={'id':'noteContent'})]
+ remove_tags_after = dict(attrs={'class':'noteText'})
remove_tags = [
- dict(name='table', attrs={'bgcolor':"#f5f5f5"}),
- dict(name='td', attrs={'bgcolor':"#f7f8f9"}),
- dict(name='td', attrs={'bgcolor':"#f5f5f5"}),
- dict(name='table', attrs={'width':"302"}),
- dict(name='table', attrs={'width':"214"}),
- dict(name='table', attrs={'width':"112"}),
- dict(name='table', attrs={'width':"980"}),
- dict(name='td', attrs={'height':"1"}),
- dict(name='td', attrs={'height':"4"}),
- dict(name='td', attrs={'height':"20"}),
- dict(name='td', attrs={'height':"10"}),
- dict(name='td', attrs={'class':["trrojo11","trbris11","trrojo12","arrojo12s","tbazul13"]}),
- dict(name='div', attrs={'id':["mapg","ver_off_todosloscom","todosloscom"]}),
- dict(name='span', attrs={'class':["trazul18b","trrojo11","trnaranja11","trbris11","georojo18b","geogris18"]}),
- dict(name='span', attrs={'class':["detalles-opinion"]}),
- dict(name='a', attrs={'class':["arnaranja12b","trbris11","arazul12rel","trrojo10"]}),
- dict(name='img', src = "/img/icono_imprimir.gif"),
- dict(name='img', src = "/img/icono_enviar_mail.gif"),
- dict(name='img', src = "/img/icono_fuente_g.gif"),
- dict(name='img', src = "/img/icono_fuente_m.gif"),
- dict(name='img', src = "/img/icono_fuente_c.gif"),
- dict(name='img', src = "/img/icono_compartir.gif"),
- dict(name='img', src = "/img/icono_enviar_coment.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-notasrel.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/fr.gif"),
- dict(name='img', src = "/img/espiral2.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/b"),
- dict(name='img', src = "/img/icono_enviar_coment.gifot-notasrel.gif"),
- dict(name='img', src = "/n_img/icono_tipo3.gif"),
- dict(name='img', src = "/n_img/icono_tipo2.gif"),
- dict(name='img', src = "/n_img/icono_print.gif"),
- dict(name='img', src = "/n_img/icono_mail2.gif"),
- dict(name='img', src = "/n_img/im-comentarios-2a.gif"),
- dict(name='img', src = "/n_img/im-comentarios-1a.gif"),
- dict(name='img', src = "/img/icono_coment.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-sitiosrel.gif"),
- dict(name='img', src = "/n_img/icono_tipomenos.gif"),
- dict(name='img', src = "/img/futbol/19.jpg"),
- dict(name='img', alt = "Facebook"),
- dict(name='img', alt = "Twitter"),
- dict(name='img', alt = "Google"),
- dict(name='img', alt = "LinkedIn"),
- dict(name='img', alt = "Viadeo"),
- dict(name='img', alt = "Digg"),
- dict(name='img', alt = "Delicious"),
- dict(name='img', alt = "Meneame"),
- dict(name='img', alt = "Yahoo"),
- dict(name='img', alt = "Technorati"),
- dict(name='a',text =["Compartir","Facebook","Twitter","Google","LinkedIn","Viadeo","Digg","Delicious","Meneame","Yahoo","Technorati"]),
- dict(name='select'),
- dict(name='a', attrs={'class':"tbgriscompartir"}),
- ]
+ dict(attrs={'class':'noteExtras'}),
+ dict(name=['meta','iframe','base','embed','object']),
+ dict(attrs={'id':'tm_box'})
+ ]
+ remove_attributes=['lang','onclick']
feeds = [
(u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
@@ -101,25 +49,3 @@ class ElUniversal(BasicNewsRecipe):
,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' )
,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' )
]
-
- # def print_version(self, url):
- # return url.replace('/notas/','/notas/vi_')
-
- def preprocess_html(self, soup):
- mtag = ''
- soup.head.insert(0,mtag)
- for tag in soup.findAll(name='td',attrs={'class': 'arazul50'}):
- tag.insert(0,"")
- tag.insert(2,"
")
-
- return soup
-
- def postprocess_html(self, soup,first):
-
- for tag in soup.findAll(name=['table', 'span','i']):
- tag.name = 'div'
- for item in soup.findAll(align = "right"):
- del item['align']
-
- return soup
-
diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe
index bba3bda217..130013286c 100644
--- a/resources/recipes/elpais_impreso.recipe
+++ b/resources/recipes/elpais_impreso.recipe
@@ -1,86 +1,95 @@
-# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
-www.elpais.com/diario/
+www.elpais.com
'''
-from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class ElPaisImpresa(BasicNewsRecipe):
- title = u'El Pa\xeds - edicion impresa'
+class ElPais_RSS(BasicNewsRecipe):
+ title = 'El Pais'
__author__ = 'Darko Miletic'
- description = u'el periodico global en Espa\xf1ol'
+ description = 'el periodico global en Castellano'
publisher = 'EDICIONES EL PAIS, S.L.'
- category = 'news, politics,Spain,actualidad,noticias,informacion,videos,fotografias,audios,graficos,nacional,internacional,deportes,economia,tecnologia,cultura,gente,television,sociedad,opinion,blogs,foros,chats,encuestas,entrevistas,participacion'
+ category = 'news, politics, finances, world, spain'
+ oldest_article = 2
+ max_articles_per_feed = 200
no_stylesheets = True
- encoding = 'latin1'
+ encoding = 'cp1252'
use_embedded_content = False
- language = 'es'
+ language = 'es_ES'
+ remove_empty_feeds = True
publication_type = 'newspaper'
- masthead_url = 'http://www.elpais.com/im/tit_logo_global.gif'
- index = 'http://www.elpais.com/diario/'
- extra_css = ' p{text-align: justify} body{ text-align: left; font-family: Georgia,"Times New Roman",Times,serif } h2{font-family: Arial,Helvetica,sans-serif} img{margin-bottom: 0.4em} '
+ masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
+ extra_css = """
+ body{font-family: Georgia,"Times New Roman",Times,serif }
+ h3{font-family: Arial,Helvetica,sans-serif}
+ img{margin-bottom: 0.4em; display:block}
+ """
conversion_options = {
- 'comment' : description
- , 'tags' : category
- , 'publisher' : publisher
- , 'language' : language
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
}
- feeds = [
- (u'Internacional' , index + u'internacional/' )
- ,(u'Espa\xf1a' , index + u'espana/' )
- ,(u'Economia' , index + u'economia/' )
- ,(u'Opinion' , index + u'opinion/' )
- ,(u'Vi\xf1etas' , index + u'vineta/' )
- ,(u'Sociedad' , index + u'sociedad/' )
- ,(u'Cultura' , index + u'cultura/' )
- ,(u'Tendencias' , index + u'tendencias/' )
- ,(u'Gente' , index + u'gente/' )
- ,(u'Obituarios' , index + u'obituarios/' )
- ,(u'Deportes' , index + u'deportes/' )
- ,(u'Pantallas' , index + u'radioytv/' )
- ,(u'Ultima' , index + u'ultima/' )
- ,(u'Educacion' , index + u'educacion/' )
- ,(u'Saludo' , index + u'salud/' )
- ,(u'Ciberpais' , index + u'ciberpais/' )
- ,(u'EP3' , index + u'ep3/' )
- ,(u'Cine' , index + u'cine/' )
- ,(u'Babelia' , index + u'babelia/' )
- ,(u'El viajero' , index + u'viajero/' )
- ,(u'Negocios' , index + u'negocios/' )
- ,(u'Domingo' , index + u'domingo/' )
- ,(u'El Pais semanal' , index + u'eps/' )
- ,(u'Quadern Catalunya' , index + u'quadern-catalunya/' )
- ]
+ keep_only_tags = [dict(attrs={'class':['cabecera_noticia estirar','cabecera_noticia','','contenido_noticia']})]
+ remove_tags = [
+ dict(name=['meta','link','base','iframe','embed','object'])
+ ,dict(attrs={'class':['info_complementa','estructura_2col_der','votos estirar','votos']})
+ ,dict(attrs={'id':'utilidades'})
+ ]
+ remove_tags_after = dict(attrs={'id':'utilidades'})
+ remove_attributes = ['lang','border','width','height']
- keep_only_tags=[dict(attrs={'class':['cabecera_noticia','contenido_noticia']})]
- remove_attributes=['width','height']
- remove_tags=[dict(name='link')]
-
- def parse_index(self):
- totalfeeds = []
- lfeeds = self.get_feeds()
- for feedobj in lfeeds:
- feedtitle, feedurl = feedobj
- self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
- articles = []
- soup = self.index_to_soup(feedurl)
- for item in soup.findAll('a',attrs={'class':['g19r003','g19i003','g17r003','g17i003']}):
- url = 'http://www.elpais.com' + item['href'].rpartition('/')[0]
- title = self.tag_to_string(item)
- date = strftime(self.timefmt)
- articles.append({
- 'title' :title
- ,'date' :date
- ,'url' :url
- ,'description':''
- })
- totalfeeds.append((feedtitle, articles))
- return totalfeeds
+ feeds = [
+ (u'Lo ultimo' , u'http://www.elpais.com/rss/feed.html?feedId=17046')
+ ,(u'America Latina' , u'http://www.elpais.com/rss/feed.html?feedId=17041')
+ ,(u'Mexico' , u'http://www.elpais.com/rss/feed.html?feedId=17042')
+ ,(u'Europa' , u'http://www.elpais.com/rss/feed.html?feedId=17043')
+ ,(u'Estados Unidos' , u'http://www.elpais.com/rss/feed.html?feedId=17044')
+ ,(u'Oriente proximo' , u'http://www.elpais.com/rss/feed.html?feedId=17045')
+ ,(u'Espana' , u'http://www.elpais.com/rss/feed.html?feedId=1002' )
+ ,(u'Andalucia' , u'http://www.elpais.com/rss/feed.html?feedId=17057')
+ ,(u'Catalunia' , u'http://www.elpais.com/rss/feed.html?feedId=17059')
+ ,(u'Comunidad Valenciana' , u'http://www.elpais.com/rss/feed.html?feedId=17061')
+ ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
+ ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
+ ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
+ ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
+ ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
+ ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
+ ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
+ ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
+ ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
+ ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
+ ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
+ ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
+ ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
+ ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')
+ ,(u'Salud' , u'http://www.elpais.com/rss/feed.html?feedId=17074')
+ ,(u'Ocio' , u'http://www.elpais.com/rss/feed.html?feedId=17075')
+ ,(u'Justicia y Leyes' , u'http://www.elpais.com/rss/feed.html?feedId=17069')
+ ,(u'Guerras y conflictos' , u'http://www.elpais.com/rss/feed.html?feedId=17070')
+ ,(u'Politica' , u'http://www.elpais.com/rss/feed.html?feedId=17073')
+ ]
def print_version(self, url):
return url + '?print=1'
+
+ def preprocess_html(self, soup):
+ """Clean up the parsed article: drop inline styles, neutralise links, set image alt text."""
+ # Remove the inline style attribute from every tag that carries one.
+ for item in soup.findAll(style=True):
+ del item['style']
+ # Neutralise hyperlinks: an <a> with a single string child is replaced by
+ # that string; any other <a> is renamed to <span>.
+ for item in soup.findAll('a'):
+ if item.string is not None:
+ tstr = item.string
+ item.replaceWith(tstr)
+ else:
+ item.name='span'
+ # NOTE(review): indentation was lost in this patch; this loop presumably
+ # belongs to the else-branch and strips link-only attributes from the
+ # renamed tag - confirm against the real file.
+ for atrs in ['href','target','alt','title']:
+ if item.has_key(atrs):
+ del item[atrs]
+ # Presumably targets images lacking an alt attribute and gives them a
+ # placeholder; verify how findAll treats alt=False in the bundled BeautifulSoup.
+ for item in soup.findAll('img',alt=False):
+ item['alt'] = 'image'
+ return soup
diff --git a/resources/recipes/heraldo.recipe b/resources/recipes/heraldo.recipe
index 381e97b9ce..c5669e116b 100644
--- a/resources/recipes/heraldo.recipe
+++ b/resources/recipes/heraldo.recipe
@@ -1,50 +1,65 @@
#!/usr/bin/env python
-__license__ = 'GPL v3'
-__author__ = 'Lorenzo Vigentini'
-__copyright__ = '2009, Lorenzo Vigentini '
+__license__ = 'GPL v3'
+__copyright__ = '04 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
-__version__ = 'v1.01'
-__date__ = '30, January 2010'
-
+__version__ = 'v0.03'
+__date__ = '11, December 2010'
'''
-http://www.heraldo.es/
+http://www.heraldo.es/
'''
+import time
from calibre.web.feeds.news import BasicNewsRecipe
class heraldo(BasicNewsRecipe):
- author = 'Lorenzo Vigentini'
+ __author__ = 'desUBIKado'
description = 'Daily newspaper from Aragon'
-
- cover_url = 'http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo.gif'
title = u'Heraldo de Aragon'
publisher = 'OJD Nielsen'
category = 'News, politics, culture, economy, general interest'
-
language = 'es'
timefmt = '[%a, %d %b, %Y]'
-
oldest_article = 1
- max_articles_per_feed = 25
-
+ max_articles_per_feed = 100
use_embedded_content = False
- recursion = 10
-
remove_javascript = True
no_stylesheets = True
-
- keep_only_tags = [
- dict(name='div', attrs={'class':['titularNoticiaNN','textoGrisVerdanaContenidos']})
- ]
+ recursion = 10
feeds = [
- (u'Portadas ', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
- ]
+ (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
+ ]
+
+
+
+ keep_only_tags = [dict(name='div', attrs={'id':['dts','com']})]
+
+ remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
+ dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
+ dict(name='form', attrs={'class':'form'})]
+
+ remove_tags_before = dict(name='div' , attrs={'id':'dts'})
+ remove_tags_after = dict(name='div' , attrs={'id':'com'})
+
+    def get_cover_url(self):
+        """Return the URL of today's front-page PDF, or the site logo as fallback.
+
+        The PDF URL is built from the local date (YYYYMMDD) and probed by
+        opening it; if that fails for any reason the failure is logged and
+        the logo image URL is returned instead.
+        """
+        # e.g. http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf
+        cover = ('http://oldorigin-www.heraldo.es/' + time.strftime('%Y%m%d')
+                 + '/primeras/portada_aragon.pdf')
+        # Calling through self works whether get_browser is a classmethod or
+        # an instance method, and honours subclass overrides.
+        br = self.get_browser()
+        try:
+            br.open(cover)
+        except Exception:
+            # Best effort: any fetch error means "no front page today". A bare
+            # except would also swallow KeyboardInterrupt/SystemExit.
+            self.log("\nPortada no disponible")
+            cover = 'http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
+        return cover
+
+
+
extra_css = '''
- .articledate {color: gray;font-family: monospace;}
- .articledescription {display: block;font-family: sans;font-size: 0.7em; text-indent: 0;}
- .firma {color: #666;display: block;font-family: verdana, arial, helvetica;font-size: 1em;margin-bottom: 8px;}
- .textoGrisVerdanaContenidos {color: #56595c;display: block;font-family: Verdana;font-size: 1.28571em;padding-bottom: 10px}
- .titularNoticiaNN {display: block;padding-bottom: 10px;padding-left: 0;padding-right: 0;padding-top: 4px}
- .titulo {color: #003066;font-family: Tahoma;font-size: 1.92857em;font-weight: bold;line-height: 1.2em}
- '''
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+ '''
diff --git a/resources/recipes/karlsruhe.recipe b/resources/recipes/karlsruhe.recipe
new file mode 100644
index 0000000000..c0bc5369f1
--- /dev/null
+++ b/resources/recipes/karlsruhe.recipe
@@ -0,0 +1,52 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class KANewsRecipe(BasicNewsRecipe):
+    """Recipe for ka-news.de: news from Karlsruhe, fetched from its RSS feeds."""
+
+    title = u'KA-News.de'
+    description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
+    __author__ = 'tfeld'
+    # Was ``lang='de'``; the other recipes set ``language``, which is the
+    # attribute the recipe framework uses for the publication language.
+    language = 'de'
+    no_stylesheets = True
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    feeds = [
+        (u'News aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/karlsruhe.xml'),
+        (u'Kulturnachrichten aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/kultur.xml'),
+        (u'Durlach: News aus Durlach', 'http://www.ka-news.de/storage/rss/rss/durlach.xml'),
+        (u'Stutensee: News aus Stutensee Blankenloch, Büchig, Friedrichstal, Staffort, Spöck', 'http://www.ka-news.de/storage/rss/rss/stutensee.xml'),
+        (u'Bruchsal: News aus Bruchsal', 'http://www.ka-news.de/storage/rss/rss/bruchsal.xml'),
+        (u'Wirtschaftsnews aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/wirtschaft.xml'),
+        (u'ka-news.de - Sport', 'http://www.ka-news.de/storage/rss/rss/sport.xml'),
+        (u'KSC-News - News rund um den KSC', 'http://www.ka-news.de/storage/rss/rss/ksc.xml'),
+        (u'ka-news.de - BG Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/basketball.xml')
+    ]
+
+    # Drop fixed pixel widths from the raw HTML before it is parsed.
+    preprocess_regexps = [
+        (re.compile(r'width:[0-9]*?px', re.DOTALL|re.IGNORECASE), lambda match: ''),
+    ]
+
+    # Keep only the span from the article headline to its sub-line element.
+    remove_tags_before = dict(id='artdetail_ueberschrift')
+    remove_tags_after = dict(id='artdetail_unterzeile')
+    remove_tags = [
+        dict(name=['div'], attrs={'class': 'lbx_table'}),
+        dict(name=['div'], attrs={'class': 'lk_zumthema'}),
+        dict(name=['div'], attrs={'class': 'lk_thumb'}),
+        dict(name=['div'], attrs={'class': 'lk_trenner'}),
+        dict(name=['div'], attrs={'class': 'lupen_container'}),
+        dict(name=['script']),
+        dict(name=['span'], attrs={'style': 'display:none;'}),
+        dict(name=['span'], attrs={'class': 'comm_info'}),
+        dict(name=['h3'], attrs={'id': 'artdetail_unterzeile'}),
+    ]
+
+    # The style attribute is removed _after_ the specific tags above, since
+    # one of those rules matches on style="display:none;".
+    remove_attributes = ['width', 'height', 'style']
+
+    extra_css = '''
+        h1{ font-size:large; font-weight:bold; }
+        h2{ font-size:medium; font-weight:bold; }
+    '''
+
+    def get_cover_url(self):
+        """Return the fixed URL of the ka-news logo, used as the cover image."""
+        return 'http://www.ka-news.de/storage/scl/techkanews/logos/434447_m1t1w250q75s1v29681_ka-news-Logo_mit_Schatten_transparent.png'
+
diff --git a/resources/recipes/red_aragon.recipe b/resources/recipes/red_aragon.recipe
new file mode 100644
index 0000000000..4681e6660b
--- /dev/null
+++ b/resources/recipes/red_aragon.recipe
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '11 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Entertainment guide from Aragon'
+__version__ = 'v0.01'
+__date__ = '11, December 2010'
+'''
+http://www.redaragon.es/
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+# Recipe for the RedAragon entertainment guide: one RSS feed per event category.
+# NOTE(review): the class name `heraldo` looks copy-pasted from heraldo.recipe;
+# it is left unchanged here because renaming would alter the public name.
+class heraldo(BasicNewsRecipe):
+ __author__ = 'desUBIKado'
+ description = u'Guia de ocio desde Aragon'
+ title = u'RedAragon'
+ publisher = 'Grupo Z'
+ category = 'Concerts, Movies, Entertainment news'
+ cover_url = 'http://www.redaragon.com/2008_img/logotipo.gif'
+ language = 'es'
+ timefmt = '[%a, %d %b, %Y]'
+ oldest_article = 15
+ max_articles_per_feed = 100
+ encoding = 'iso-8859-1'
+ use_embedded_content = False
+ remove_javascript = True
+ no_stylesheets = True
+
+ # (section title, RSS URL) pairs; each feed is the agenda for one event type (tid).
+ feeds = [(u'Conciertos', u'http://redaragon.com/rss/agenda.asp?tid=1'),
+ (u'Exposiciones', u'http://redaragon.com/rss/agenda.asp?tid=5'),
+ (u'Teatro', u'http://redaragon.com/rss/agenda.asp?tid=10'),
+ (u'Conferencias', u'http://redaragon.com/rss/agenda.asp?tid=2'),
+ (u'Ferias', u'http://redaragon.com/rss/agenda.asp?tid=6'),
+ (u'Filmotecas/Cineclubs', u'http://redaragon.com/rss/agenda.asp?tid=7'),
+ (u'Presentaciones', u'http://redaragon.com/rss/agenda.asp?tid=9'),
+ (u'Fiestas', u'http://redaragon.com/rss/agenda.asp?tid=11'),
+ (u'Infantil', u'http://redaragon.com/rss/agenda.asp?tid=13'),
+ (u'Otros', u'http://redaragon.com/rss/agenda.asp?tid=8')]
+
+ # Keep only the event-detail container ...
+ keep_only_tags = [dict(name='div', attrs={'id':'FichaEventoAgenda'})]
+
+ # ... and drop the divs with these classes from it.
+ remove_tags = [dict(name='div', attrs={'class':['Comparte','CajaAgenda','Caja','Cintillo']})]
+
+ # Trim everything before the event-detail div ...
+ remove_tags_before = dict(name='div' , attrs={'id':'FichaEventoAgenda'})
+
+ # ... and everything after the first div with class 'Cintillo'.
+ remove_tags_after = dict(name='div' , attrs={'class':'Cintillo'})
diff --git a/resources/recipes/salon.recipe b/resources/recipes/salon.recipe
index ed7ec98f10..c421ab094d 100644
--- a/resources/recipes/salon.recipe
+++ b/resources/recipes/salon.recipe
@@ -25,22 +25,20 @@ class Salon_com(BasicNewsRecipe):
feeds = [
('News & Politics', 'http://feeds.salon.com/salon/news'),
- ('War Room', 'http://feeds.salon.com/salon/war_room'),
- ('Arts & Entertainment', 'http://feeds.salon.com/salon/ent'),
- ('I Like to Watch', 'http://feeds.salon.com/salon/iltw'),
- ('Beyond Multiplex', 'http://feeds.salon.com/salon/btm'),
- ('Book Reviews', 'http://feeds.salon.com/salon/books'),
- ('All Life', 'http://feeds.salon.com/salon/mwt'),
- ('All Opinion', 'http://feeds.salon.com/salon/opinion'),
- ('Glenn Greenwald', 'http://feeds.salon.com/salon/greenwald'),
- ('Garrison Keillor', 'http://dir.salon.com/topics/garrison_keillor/index.rss'),
- ('Joan Walsh', 'http://www.salon.com/rss/walsh.rss'),
- ('All Sports', 'http://feeds.salon.com/salon/sports'),
+ ('War Room', 'http://feeds.feedburner.com/salon/war_room'),
+ ('Joan Walsh', 'http://feeds.feedburner.com/Salon_Joan_Walsh'),
+ ('Glenn Greenwald', 'http://feeds.feedburner.com/salon/greenwald'),
('Tech & Business', 'http://feeds.salon.com/salon/tech'),
- ('How World Works', 'http://feeds.salon.com/salon/htww')
+ ('Ask the Pilot', 'http://feeds.feedburner.com/salon/ask_the_pilot'),
+ ('How World Works', 'http://feeds.feedburner.com/salon/htww'),
+ ('Life', 'http://feeds.feedburner.com/salon/mwt'),
+ ('Broadsheet', 'http://feeds.feedburner.com/salon/broadsheet'),
+ ('Movie Reviews', 'http://feeds.feedburner.com/salon/movie_reviews'),
+ ('Film Salon', 'http://feeds.feedburner.com/Salon/Film_Salon'),
+ ('TV', 'http://feeds.feedburner.com/salon/tv'),
+ ('Books', 'http://feeds.feedburner.com/salon/books')
]
def print_version(self, url):
return url.replace('/index.html', '/print.html')
-
diff --git a/resources/recipes/smith.recipe b/resources/recipes/smith.recipe
index e52b2ee709..98f7d98517 100644
--- a/resources/recipes/smith.recipe
+++ b/resources/recipes/smith.recipe
@@ -17,8 +17,8 @@ class SmithsonianMagazine(BasicNewsRecipe):
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':'article_sidebar_border'}),
- dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
- #dict(name='ul', attrs={'class':'article-tools'}),
+ dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
+ ##dict(name='ul', attrs={'class':'article-tools'}),
dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
]
@@ -37,16 +37,16 @@ class SmithsonianMagazine(BasicNewsRecipe):
]
def preprocess_html(self, soup):
- story = soup.find(name='div', attrs={'id':'article-left'})
- #td = heading.findParent(name='td')
- #td.extract()
+ story = soup.find(name='div', attrs={'id':'article-body'})
+ ##td = heading.findParent(name='td')
+ ##td.extract()
soup = BeautifulSoup('t')
body = soup.find(name='body')
body.insert(0, story)
return soup
- def postprocess_html(self, soup, first):
- for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
- if not first:
- for div in soup.findAll(id='article-head'): div.extract()
- return soup
+ #def postprocess_html(self, soup, first):
+ #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
+ #if not first:
+ #for div in soup.findAll(id='article-head'): div.extract()
+ #return soup
diff --git a/resources/recipes/the_week_magazine_free.recipe b/resources/recipes/the_week_magazine_free.recipe
index 1bac4133e7..6e033eaf82 100644
--- a/resources/recipes/the_week_magazine_free.recipe
+++ b/resources/recipes/the_week_magazine_free.recipe
@@ -1,17 +1,19 @@
-
__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
+__copyright__ = '2010, JOlo'
'''
www.theweek.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
+import re
-class TheWeekFree(BasicNewsRecipe):
- title = 'The Week Magazine - Free content'
- __author__ = 'Darko Miletic'
+class TheWeek(BasicNewsRecipe):
+ title = 'The Week Magazine'
+ __author__ = 'Jim Olo'
description = "The best of the US and international media. Daily coverage of commentary and analysis of the day's events, as well as arts, entertainment, people and gossip, and political cartoons."
publisher = 'The Week Publications, Inc.'
+ masthead_url = 'http://test.theweek.com/images/logo_theweek.gif'
+ cover_url = masthead_url
category = 'news, politics, USA'
oldest_article = 7
max_articles_per_feed = 100
@@ -19,31 +21,27 @@ class TheWeekFree(BasicNewsRecipe):
encoding = 'utf-8'
use_embedded_content = False
language = 'en'
+ preprocess_regexps = [(re.compile(r'