'
+'''
+globaleconomicanalysis.blogspot.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class GlobalEconomicAnalysis(BasicNewsRecipe):
+    """Recipe for Mish's Global Economic Trend Analysis blog; article bodies are taken from the feed itself."""
+    title                 = "Mish's Global Economic Trend Analysis"
+    __author__            = 'Darko Miletic'
+    description           = 'Thoughts on the global economy, housing, gold, silver, interest rates, oil, energy, China, commodities, the dollar, Euro, Renminbi, Yen, inflation, deflation, stagflation, precious metals, emerging markets, and policy decisions that affect the global markets.'
+    publisher             = 'Mike Shedlock'
+    category              = 'news, politics, economy, banking'
+    oldest_article        = 7
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'utf8'
+    use_embedded_content  = True
+    language              = 'en'
+    remove_empty_feeds    = True
+    publication_type      = 'blog'
+    masthead_url          = 'http://www.pagina12.com.ar/commons/imgs/logo-home.gif'  # NOTE(review): pagina12.com.ar logo, looks like a copy-paste leftover - confirm
+    extra_css             = """
+        body{font-family: Arial,Helvetica,sans-serif }
+        img{margin-bottom: 0.4em; display:block}
+    """
+
+    conversion_options = {
+        'comments'    : description  # the metadata option is 'comments'; 'comment' is not a conversion option
+        , 'tags'      : category
+        , 'publisher' : publisher
+        , 'language'  : language
+    }
+
+    remove_tags = [
+        dict(name=['meta','link','iframe','object','embed'])
+        ,dict(attrs={'class':'blogger-post-footer'})
+    ]
+    remove_attributes=['border']
+    feeds = [(u'Articles', u'http://feeds2.feedburner.com/MishsGlobalEconomicTrendAnalysis')]
diff --git a/resources/recipes/el_periodico.recipe b/resources/recipes/el_periodico.recipe
new file mode 100644
index 0000000000..2c3ed456fb
--- /dev/null
+++ b/resources/recipes/el_periodico.recipe
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = '04 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Daily newspaper from Aragon'
+__version__ = 'v0.05'
+__date__ = '07, December 2010'
+'''
+elperiodicodearagon.com
+'''
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class elperiodicodearagon(BasicNewsRecipe):
+ title = u'El Periodico de Aragon'
+ __author__ = u'desUBIKado'
+ description = u'Noticias desde Aragon'
+ publisher = u'elperiodicodearagon.com'
+ category = u'news, politics, Spain, Aragon'
+ oldest_article = 2
+ delay = 0
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'es'
+ encoding = 'utf8'
+ remove_empty_feeds = True
+ remove_javascript = True
+
+
+ conversion_options = {
+ 'comments' : description
+ ,'tags' : category
+ ,'language' : language
+ ,'publisher' : publisher
+ }
+
+ feeds = [(u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
+ (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
+ (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
+ (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
+ (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
+ (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
+ (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
+ (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
+ (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
+ (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')]
+
+
+ extra_css = '''
+ h3{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ dd{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
+ '''
+
+ remove_attributes = ['height','width']
+
+ keep_only_tags = [dict(name='div', attrs={'id':'contenidos'})]
+
+
+ # Strip all the page clutter
+
+ remove_tags = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
+ dict(name='span', attrs={'class':'MasInformacion '}),
+ dict(name='span', attrs={'class':'MasInformacion'}),
+ dict(name='div', attrs={'class':'Middle'}),
+ dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
+ dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
+ dict(name='div', attrs={'class':'MenuEquipo'}),
+ dict(name='div', attrs={'class':'TemasRelacionados'}),
+ dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
+ dict(name='div', attrs={'class':'Recorte'}),
+ dict(name='div', attrs={'id':'NoticiasenRecursos'}),
+ dict(name='div', attrs={'id':'NoticiaEnPapel'}),
+ dict(name='p', attrs={'class':'RecorteEnNoticias'}),
+ dict(name='div', attrs={'id':'Comparte'}),
+ dict(name='div', attrs={'id':'CajaComparte'}),
+ dict(name='a', attrs={'class':'EscribirComentario'}),
+ dict(name='a', attrs={'class':'AvisoComentario'}),
+ dict(name='div', attrs={'class':'CajaAvisoComentario'}),
+ dict(name='div', attrs={'class':'navegaNoticias'}),
+ dict(name='div', attrs={'id':'PaginadorDiCom'}),
+ dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
+ dict(name='div', attrs={'id':'CintilloComentario'}),
+ dict(name='div', attrs={'id':'EscribeComentario'}),
+ dict(name='div', attrs={'id':'FormularioComentario'}),
+ dict(name='div', attrs={'id':'FormularioNormas'})]
+
+ # Fetch the print-edition front page (the format=1 image has the higher resolution)
+
+ def get_cover_url(self):
+     """Return the format=1 front-page preview URL found on the PDF edition page, or None if there is none."""
+     for image in self.index_to_soup('http://pdf.elperiodicodearagon.com/').findAll('img',src=True):
+         src = image['src']
+         if src.startswith('http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='):
+             return (src[:-len('format=2')] if src.endswith('format=2') else src) + 'format=1'  # not rstrip(): it strips a character set, not a suffix
+     return None
+
+ # Remove the gap between the article and the comments (lines 1 and 2)
+ # The index did not point at the real start of the article (line 3)
+
+ preprocess_regexps = [
+ (re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: ''),
+ (re.compile(r'
', re.DOTALL|re.IGNORECASE), lambda match: ''),
+ (re.compile(r'', re.DOTALL|re.IGNORECASE), lambda match: '
')
+ ]
diff --git a/resources/recipes/el_universal.recipe b/resources/recipes/el_universal.recipe
index 1995d0f932..f053812c05 100644
--- a/resources/recipes/el_universal.recipe
+++ b/resources/recipes/el_universal.recipe
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
__license__ = 'GPL v3'
-__copyright__ = '2009, Darko Miletic '
+__copyright__ = '2009-2010, Darko Miletic '
'''
eluniversal.com.mx
'''
@@ -18,75 +16,25 @@ class ElUniversal(BasicNewsRecipe):
category = 'news, politics, Mexico'
no_stylesheets = True
use_embedded_content = False
- encoding = 'cp1252'
+ encoding = 'utf8'
remove_javascript = True
- language = 'es'
+ remove_empty_feeds = True
+ publication_type = 'newspaper'
+ language = 'es'
extra_css = '''
- body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
- .geoGris30{font-family:Georgia,"Times New Roman",Times,serif; font-size:large; color:#003366; font-weight:bold;}
- .arnegro16{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;}
- .tbazull2{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color:#336699; font-size:xx-small;}
- .tbgrisf11{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #666666; font-size:xx-small;}
- .verrojo13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; color: #CC0033; font-size:xx-small;}
- .trnegro13{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
- .txt-fotogaleria{font-family:"trebuchet ms",Arial,Helvetica,sans-serif; font-size:xx-small;}
+ body{font-family:Arial,Helvetica,sans-serif}
+ .noteTitle{font-family: Georgia,"Times New Roman",Times,serif; color: #336699; font-size: xx-large; font-weight: bold}
+ .noteInfo{display: block; color: gray}
'''
- keep_only_tags = [ dict(name='table', attrs={'width':"633"}),dict(name='table', attrs={'width':"629"}),]
-
+ keep_only_tags = [ dict(name='div', attrs={'id':'noteContent'})]
+ remove_tags_after = dict(attrs={'class':'noteText'})
remove_tags = [
- dict(name='table', attrs={'bgcolor':"#f5f5f5"}),
- dict(name='td', attrs={'bgcolor':"#f7f8f9"}),
- dict(name='td', attrs={'bgcolor':"#f5f5f5"}),
- dict(name='table', attrs={'width':"302"}),
- dict(name='table', attrs={'width':"214"}),
- dict(name='table', attrs={'width':"112"}),
- dict(name='table', attrs={'width':"980"}),
- dict(name='td', attrs={'height':"1"}),
- dict(name='td', attrs={'height':"4"}),
- dict(name='td', attrs={'height':"20"}),
- dict(name='td', attrs={'height':"10"}),
- dict(name='td', attrs={'class':["trrojo11","trbris11","trrojo12","arrojo12s","tbazul13"]}),
- dict(name='div', attrs={'id':["mapg","ver_off_todosloscom","todosloscom"]}),
- dict(name='span', attrs={'class':["trazul18b","trrojo11","trnaranja11","trbris11","georojo18b","geogris18"]}),
- dict(name='span', attrs={'class':["detalles-opinion"]}),
- dict(name='a', attrs={'class':["arnaranja12b","trbris11","arazul12rel","trrojo10"]}),
- dict(name='img', src = "/img/icono_imprimir.gif"),
- dict(name='img', src = "/img/icono_enviar_mail.gif"),
- dict(name='img', src = "/img/icono_fuente_g.gif"),
- dict(name='img', src = "/img/icono_fuente_m.gif"),
- dict(name='img', src = "/img/icono_fuente_c.gif"),
- dict(name='img', src = "/img/icono_compartir.gif"),
- dict(name='img', src = "/img/icono_enviar_coment.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-notasrel.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/fr.gif"),
- dict(name='img', src = "/img/espiral2.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/b"),
- dict(name='img', src = "/img/icono_enviar_coment.gifot-notasrel.gif"),
- dict(name='img', src = "/n_img/icono_tipo3.gif"),
- dict(name='img', src = "/n_img/icono_tipo2.gif"),
- dict(name='img', src = "/n_img/icono_print.gif"),
- dict(name='img', src = "/n_img/icono_mail2.gif"),
- dict(name='img', src = "/n_img/im-comentarios-2a.gif"),
- dict(name='img', src = "/n_img/im-comentarios-1a.gif"),
- dict(name='img', src = "/img/icono_coment.gif"),
- dict(name='img', src = "http://www.eluniversal.com.mx/n_img/bot-sitiosrel.gif"),
- dict(name='img', src = "/n_img/icono_tipomenos.gif"),
- dict(name='img', src = "/img/futbol/19.jpg"),
- dict(name='img', alt = "Facebook"),
- dict(name='img', alt = "Twitter"),
- dict(name='img', alt = "Google"),
- dict(name='img', alt = "LinkedIn"),
- dict(name='img', alt = "Viadeo"),
- dict(name='img', alt = "Digg"),
- dict(name='img', alt = "Delicious"),
- dict(name='img', alt = "Meneame"),
- dict(name='img', alt = "Yahoo"),
- dict(name='img', alt = "Technorati"),
- dict(name='a',text =["Compartir","Facebook","Twitter","Google","LinkedIn","Viadeo","Digg","Delicious","Meneame","Yahoo","Technorati"]),
- dict(name='select'),
- dict(name='a', attrs={'class':"tbgriscompartir"}),
- ]
+ dict(attrs={'class':'noteExtras'}),
+ dict(name=['meta','iframe','base','embed','object']),
+ dict(attrs={'id':'tm_box'})
+ ]
+ remove_attributes=['lang','onclick']
feeds = [
(u'Minuto por Minuto', u'http://www.eluniversal.com.mx/rss/universalmxm.xml' )
@@ -101,25 +49,3 @@ class ElUniversal(BasicNewsRecipe):
,(u'Computacion' , u'http://www.eluniversal.com.mx/rss/computo.xml' )
,(u'Sociedad' , u'http://www.eluniversal.com.mx/rss/sociedad.xml' )
]
-
- # def print_version(self, url):
- # return url.replace('/notas/','/notas/vi_')
-
- def preprocess_html(self, soup):
- mtag = ''
- soup.head.insert(0,mtag)
- for tag in soup.findAll(name='td',attrs={'class': 'arazul50'}):
- tag.insert(0,"")
- tag.insert(2,"
")
-
- return soup
-
- def postprocess_html(self, soup,first):
-
- for tag in soup.findAll(name=['table', 'span','i']):
- tag.name = 'div'
- for item in soup.findAll(align = "right"):
- del item['align']
-
- return soup
-
diff --git a/resources/recipes/elpais_impreso.recipe b/resources/recipes/elpais_impreso.recipe
index bba3bda217..130013286c 100644
--- a/resources/recipes/elpais_impreso.recipe
+++ b/resources/recipes/elpais_impreso.recipe
@@ -1,86 +1,95 @@
-# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
-www.elpais.com/diario/
+www.elpais.com
'''
-from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class ElPaisImpresa(BasicNewsRecipe):
- title = u'El Pa\xeds - edicion impresa'
+class ElPais_RSS(BasicNewsRecipe):
+ title = 'El Pais'
__author__ = 'Darko Miletic'
- description = u'el periodico global en Espa\xf1ol'
+ description = 'el periodico global en Castellano'
publisher = 'EDICIONES EL PAIS, S.L.'
- category = 'news, politics,Spain,actualidad,noticias,informacion,videos,fotografias,audios,graficos,nacional,internacional,deportes,economia,tecnologia,cultura,gente,television,sociedad,opinion,blogs,foros,chats,encuestas,entrevistas,participacion'
+ category = 'news, politics, finances, world, spain'
+ oldest_article = 2
+ max_articles_per_feed = 200
no_stylesheets = True
- encoding = 'latin1'
+ encoding = 'cp1252'
use_embedded_content = False
- language = 'es'
+ language = 'es_ES'
+ remove_empty_feeds = True
publication_type = 'newspaper'
- masthead_url = 'http://www.elpais.com/im/tit_logo_global.gif'
- index = 'http://www.elpais.com/diario/'
- extra_css = ' p{text-align: justify} body{ text-align: left; font-family: Georgia,"Times New Roman",Times,serif } h2{font-family: Arial,Helvetica,sans-serif} img{margin-bottom: 0.4em} '
+ masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
+ extra_css = """
+ body{font-family: Georgia,"Times New Roman",Times,serif }
+ h3{font-family: Arial,Helvetica,sans-serif}
+ img{margin-bottom: 0.4em; display:block}
+ """
conversion_options = {
- 'comment' : description
- , 'tags' : category
- , 'publisher' : publisher
- , 'language' : language
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
}
- feeds = [
- (u'Internacional' , index + u'internacional/' )
- ,(u'Espa\xf1a' , index + u'espana/' )
- ,(u'Economia' , index + u'economia/' )
- ,(u'Opinion' , index + u'opinion/' )
- ,(u'Vi\xf1etas' , index + u'vineta/' )
- ,(u'Sociedad' , index + u'sociedad/' )
- ,(u'Cultura' , index + u'cultura/' )
- ,(u'Tendencias' , index + u'tendencias/' )
- ,(u'Gente' , index + u'gente/' )
- ,(u'Obituarios' , index + u'obituarios/' )
- ,(u'Deportes' , index + u'deportes/' )
- ,(u'Pantallas' , index + u'radioytv/' )
- ,(u'Ultima' , index + u'ultima/' )
- ,(u'Educacion' , index + u'educacion/' )
- ,(u'Saludo' , index + u'salud/' )
- ,(u'Ciberpais' , index + u'ciberpais/' )
- ,(u'EP3' , index + u'ep3/' )
- ,(u'Cine' , index + u'cine/' )
- ,(u'Babelia' , index + u'babelia/' )
- ,(u'El viajero' , index + u'viajero/' )
- ,(u'Negocios' , index + u'negocios/' )
- ,(u'Domingo' , index + u'domingo/' )
- ,(u'El Pais semanal' , index + u'eps/' )
- ,(u'Quadern Catalunya' , index + u'quadern-catalunya/' )
- ]
+ keep_only_tags = [dict(attrs={'class':['cabecera_noticia estirar','cabecera_noticia','','contenido_noticia']})]
+ remove_tags = [
+ dict(name=['meta','link','base','iframe','embed','object'])
+ ,dict(attrs={'class':['info_complementa','estructura_2col_der','votos estirar','votos']})
+ ,dict(attrs={'id':'utilidades'})
+ ]
+ remove_tags_after = dict(attrs={'id':'utilidades'})
+ remove_attributes = ['lang','border','width','height']
- keep_only_tags=[dict(attrs={'class':['cabecera_noticia','contenido_noticia']})]
- remove_attributes=['width','height']
- remove_tags=[dict(name='link')]
-
- def parse_index(self):
- totalfeeds = []
- lfeeds = self.get_feeds()
- for feedobj in lfeeds:
- feedtitle, feedurl = feedobj
- self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
- articles = []
- soup = self.index_to_soup(feedurl)
- for item in soup.findAll('a',attrs={'class':['g19r003','g19i003','g17r003','g17i003']}):
- url = 'http://www.elpais.com' + item['href'].rpartition('/')[0]
- title = self.tag_to_string(item)
- date = strftime(self.timefmt)
- articles.append({
- 'title' :title
- ,'date' :date
- ,'url' :url
- ,'description':''
- })
- totalfeeds.append((feedtitle, articles))
- return totalfeeds
+ feeds = [
+ (u'Lo ultimo' , u'http://www.elpais.com/rss/feed.html?feedId=17046')
+ ,(u'America Latina' , u'http://www.elpais.com/rss/feed.html?feedId=17041')
+ ,(u'Mexico' , u'http://www.elpais.com/rss/feed.html?feedId=17042')
+ ,(u'Europa' , u'http://www.elpais.com/rss/feed.html?feedId=17043')
+ ,(u'Estados Unidos' , u'http://www.elpais.com/rss/feed.html?feedId=17044')
+ ,(u'Oriente proximo' , u'http://www.elpais.com/rss/feed.html?feedId=17045')
+ ,(u'Espana' , u'http://www.elpais.com/rss/feed.html?feedId=1002' )
+ ,(u'Andalucia' , u'http://www.elpais.com/rss/feed.html?feedId=17057')
+ ,(u'Catalunia' , u'http://www.elpais.com/rss/feed.html?feedId=17059')
+ ,(u'Comunidad Valenciana' , u'http://www.elpais.com/rss/feed.html?feedId=17061')
+ ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
+ ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
+ ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
+ ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
+ ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
+ ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
+ ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
+ ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
+ ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
+ ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
+ ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
+ ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
+ ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
+ ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')
+ ,(u'Salud' , u'http://www.elpais.com/rss/feed.html?feedId=17074')
+ ,(u'Ocio' , u'http://www.elpais.com/rss/feed.html?feedId=17075')
+ ,(u'Justicia y Leyes' , u'http://www.elpais.com/rss/feed.html?feedId=17069')
+ ,(u'Guerras y conflictos' , u'http://www.elpais.com/rss/feed.html?feedId=17070')
+ ,(u'Politica' , u'http://www.elpais.com/rss/feed.html?feedId=17073')
+ ]
def print_version(self, url):
return url + '?print=1'
+
+ def preprocess_html(self, soup):
+     """Drop inline styles, replace links by their text (or a bare span) and set alt='image' on imgs matched by alt=False."""
+     for styled in soup.findAll(style=True):
+         del styled['style']
+     for link in soup.findAll('a'):
+         if item_is_plain(link) if False else link.string is None:
+             link.name='span'
+             for attr in ('href','target','alt','title'):
+                 if link.has_key(attr):
+                     del link[attr]
+         else:
+             link.replaceWith(link.string)
+     for picture in soup.findAll('img',alt=False):
+         picture['alt'] = 'image'
+     return soup
diff --git a/resources/recipes/gva_be.recipe b/resources/recipes/gva_be.recipe
index 34c4122394..f42bd23417 100644
--- a/resources/recipes/gva_be.recipe
+++ b/resources/recipes/gva_be.recipe
@@ -40,13 +40,12 @@ class GazetvanAntwerpen(BasicNewsRecipe):
remove_tags_after = dict(name='span', attrs={'class':'author'})
feeds = [
- (u'Overzicht & Blikvanger', u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/overview/overzicht' )
+ (u'Binnenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland' )
+ ,(u'Buitenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland' )
,(u'Stad & Regio' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/stadenregio' )
,(u'Economie' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/economie' )
- ,(u'Binnenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/binnenland' )
- ,(u'Buitenland' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/buitenland' )
,(u'Media & Cultur' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
- ,(u'Wetenschap' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/mediaencultuur')
+ ,(u'Wetenschap' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/wetenschap' )
,(u'Sport' , u'http://www.gva.be/syndicationservices/artfeedservice.svc/rss/mostrecent/sport' )
]
diff --git a/resources/recipes/heraldo.recipe b/resources/recipes/heraldo.recipe
index 381e97b9ce..c5669e116b 100644
--- a/resources/recipes/heraldo.recipe
+++ b/resources/recipes/heraldo.recipe
@@ -1,50 +1,65 @@
#!/usr/bin/env python
-__license__ = 'GPL v3'
-__author__ = 'Lorenzo Vigentini'
-__copyright__ = '2009, Lorenzo Vigentini '
+__license__ = 'GPL v3'
+__copyright__ = '04 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
-__version__ = 'v1.01'
-__date__ = '30, January 2010'
-
+__version__ = 'v0.03'
+__date__ = '11, December 2010'
'''
-http://www.heraldo.es/
+[url]http://www.heraldo.es/[/url]
'''
+import time
from calibre.web.feeds.news import BasicNewsRecipe
class heraldo(BasicNewsRecipe):
- author = 'Lorenzo Vigentini'
+ __author__ = 'desUBIKado'
description = 'Daily newspaper from Aragon'
-
- cover_url = 'http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo.gif'
title = u'Heraldo de Aragon'
publisher = 'OJD Nielsen'
category = 'News, politics, culture, economy, general interest'
-
language = 'es'
timefmt = '[%a, %d %b, %Y]'
-
oldest_article = 1
- max_articles_per_feed = 25
-
+ max_articles_per_feed = 100
use_embedded_content = False
- recursion = 10
-
remove_javascript = True
no_stylesheets = True
-
- keep_only_tags = [
- dict(name='div', attrs={'class':['titularNoticiaNN','textoGrisVerdanaContenidos']})
- ]
+ recursion = 10
feeds = [
- (u'Portadas ', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
- ]
+ (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')
+ ]
+
+
+
+ keep_only_tags = [dict(name='div', attrs={'id':['dts','com']})]
+
+ remove_tags = [dict(name='a', attrs={'class':['com flo-r','enl-if','enl-df']}),
+ dict(name='div', attrs={'class':['brb-b-s con marg-btt','cnt-rel con']}),
+ dict(name='form', attrs={'class':'form'})]
+
+ remove_tags_before = dict(name='div' , attrs={'id':'dts'})
+ remove_tags_after = dict(name='div' , attrs={'id':'com'})
+
+ def get_cover_url(self):
+     """Return the URL of today's front-page PDF, or of the site logo when that PDF cannot be opened."""
+     # The front page is looked up under a YYYYMMDD directory, e.g.
+     # http://oldorigin-www.heraldo.es/20101211/primeras/portada_aragon.pdf
+     today = time.strftime('%Y%m%d')
+     cover = ('http://oldorigin-www.heraldo.es/' + today +
+              '/primeras/portada_aragon.pdf')
+     br = BasicNewsRecipe.get_browser()
+     try:
+         # Probe the URL; any failure to open it triggers the logo fallback below.
+         br.open(cover)
+     except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit propagate
+         self.log("\nPortada no disponible")
+         cover ='http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
+     return cover
+
+
+
extra_css = '''
- .articledate {color: gray;font-family: monospace;}
- .articledescription {display: block;font-family: sans;font-size: 0.7em; text-indent: 0;}
- .firma {color: #666;display: block;font-family: verdana, arial, helvetica;font-size: 1em;margin-bottom: 8px;}
- .textoGrisVerdanaContenidos {color: #56595c;display: block;font-family: Verdana;font-size: 1.28571em;padding-bottom: 10px}
- .titularNoticiaNN {display: block;padding-bottom: 10px;padding-left: 0;padding-right: 0;padding-top: 4px}
- .titulo {color: #003066;font-family: Tahoma;font-size: 1.92857em;font-weight: bold;line-height: 1.2em}
- '''
+ h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:xx-large;}
+ '''
diff --git a/resources/recipes/karlsruhe.recipe b/resources/recipes/karlsruhe.recipe
new file mode 100644
index 0000000000..c0bc5369f1
--- /dev/null
+++ b/resources/recipes/karlsruhe.recipe
@@ -0,0 +1,52 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class KANewsRecipe(BasicNewsRecipe):
+    """Recipe for the ka-news.de RSS feeds (Karlsruhe regional news, culture, business and sport)."""
+    title = u'KA-News.de'
+    description = u'Nachrichten aus Karlsruhe, Deutschland und der Welt.'
+    __author__ = 'tfeld'
+    language = 'de'  # the recipe attribute is 'language'; a 'lang' attribute is not used for this
+    no_stylesheets = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+
+    feeds = [
+        (u'News aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/karlsruhe.xml'),
+        (u'Kulturnachrichten aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/kultur.xml'),
+        (u'Durlach: News aus Durlach', 'http://www.ka-news.de/storage/rss/rss/durlach.xml'),
+        (u'Stutensee: News aus Stutensee Blankenloch, B\xfcchig, Friedrichstal, Staffort, Sp\xf6ck', 'http://www.ka-news.de/storage/rss/rss/stutensee.xml'),
+        (u'Bruchsal: News aus Bruchsal', 'http://www.ka-news.de/storage/rss/rss/bruchsal.xml'),
+        (u'Wirtschaftsnews aus Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/wirtschaft.xml'),
+        (u'ka-news.de - Sport', 'http://www.ka-news.de/storage/rss/rss/sport.xml'),
+        (u'KSC-News - News rund um den KSC', 'http://www.ka-news.de/storage/rss/rss/ksc.xml'),
+        (u'ka-news.de - BG Karlsruhe', 'http://www.ka-news.de/storage/rss/rss/basketball.xml')
+    ]
+
+    preprocess_regexps = [
+        (re.compile(r'width:[0-9]*?px', re.DOTALL|re.IGNORECASE), lambda match: ''),  # blank out 'width:<n>px' declarations
+    ]
+
+    remove_tags_before = dict(id='artdetail_ueberschrift')
+    remove_tags_after = dict(id='artdetail_unterzeile')
+    remove_tags = [dict(name=['div'], attrs={'class': 'lbx_table'}),
+                   dict(name=['div'], attrs={'class': 'lk_zumthema'}),
+                   dict(name=['div'], attrs={'class': 'lk_thumb'}),
+                   dict(name=['div'], attrs={'class': 'lk_trenner'}),
+                   dict(name=['div'], attrs={'class': 'lupen_container'}),
+                   dict(name=['script']),
+                   dict(name=['span'], attrs={'style': 'display:none;'}),
+                   dict(name=['span'], attrs={'class': 'comm_info'}),
+                   dict(name=['h3'], attrs={'id': 'artdetail_unterzeile'})]
+
+    # removing style attribute _after_ removing specific tags above
+    remove_attributes = ['width','height','style']
+
+    extra_css = '''
+        h1{ font-size:large; font-weight:bold; }
+        h2{ font-size:medium; font-weight:bold; }
+    '''
+
+    def get_cover_url(self):
+        return 'http://www.ka-news.de/storage/scl/techkanews/logos/434447_m1t1w250q75s1v29681_ka-news-Logo_mit_Schatten_transparent.png'
+
diff --git a/resources/recipes/lanacion.recipe b/resources/recipes/lanacion.recipe
index 19f6c1c897..050cb2e79c 100644
--- a/resources/recipes/lanacion.recipe
+++ b/resources/recipes/lanacion.recipe
@@ -78,4 +78,6 @@ class Lanacion(BasicNewsRecipe):
]
def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
return self.adeify_images(soup)
diff --git a/resources/recipes/le_monde.recipe b/resources/recipes/le_monde.recipe
index 18be6ca711..c14b8eeeff 100644
--- a/resources/recipes/le_monde.recipe
+++ b/resources/recipes/le_monde.recipe
@@ -4,7 +4,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class LeMonde(BasicNewsRecipe):
title = 'Le Monde'
__author__ = 'veezh'
- description = 'Actualités'
+ description = u'Actualit\xe9s'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
diff --git a/resources/recipes/nrc-nl-epub.recipe b/resources/recipes/nrc-nl-epub.recipe
new file mode 100644
index 0000000000..da9b9195ce
--- /dev/null
+++ b/resources/recipes/nrc-nl-epub.recipe
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#Based on Lars Jacob's Taz Digiabo recipe
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, veezh'
+
+'''
+www.nrc.nl
+'''
+import os, urllib2, zipfile
+import time
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class NRCHandelsblad(BasicNewsRecipe):
+    """Recipe that downloads the day's ready-made NRC EPUB and unpacks it instead of building articles from feeds."""
+    title = u'NRC Handelsblad'
+    description = u'De EPUB-versie van NRC'
+    language = 'nl'
+    lang = 'nl-NL'
+
+    __author__ = 'veezh'
+
+    conversion_options = {
+        'no_default_epub_cover' : True
+    }
+
+    def build_index(self):
+        """Fetch today's EPUB, extract it into self.output_dir and return the path to its content.opf.
+        Raises ValueError when the download fails with an HTTP error."""
+        today = time.strftime("%Y%m%d")
+        domain = "http://digitaleeditie.nrc.nl"
+        url = domain + "/digitaleeditie/helekrant/epub/nrc_" + today + ".epub"
+        try:
+            f = urllib2.urlopen(url)
+        except urllib2.HTTPError:
+            self.report_progress(0,_('Kan niet inloggen om editie te downloaden'))
+            raise ValueError('Krant van vandaag nog niet beschikbaar')
+
+        tmp = PersistentTemporaryFile(suffix='.epub')
+        self.report_progress(0,_('downloading epub'))
+        try:
+            tmp.write(f.read())
+        finally:
+            f.close()    # release the HTTP response even if the read/write fails
+            tmp.close()  # the file must be closed before zipfile re-opens it by name
+
+        zfile = zipfile.ZipFile(tmp.name, 'r')
+        self.report_progress(0,_('extracting epub'))
+        try:
+            zfile.extractall(self.output_dir)
+        finally:
+            zfile.close()
+        index = os.path.join(self.output_dir, 'content.opf')
+        self.report_progress(1,_('epub downloaded and extracted'))
+        return index
diff --git a/resources/recipes/red_aragon.recipe b/resources/recipes/red_aragon.recipe
new file mode 100644
index 0000000000..4681e6660b
--- /dev/null
+++ b/resources/recipes/red_aragon.recipe
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '11 December 2010, desUBIKado'
+__author__ = 'desUBIKado'
+__description__ = 'Entertainment guide from Aragon'
+__version__ = 'v0.01'
+__date__ = '11, December 2010'
+'''
+[url]http://www.redaragon.es/[/url]
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class heraldo(BasicNewsRecipe):
+    """Recipe for the RedAragon agenda feeds (one RSS feed per event type)."""
+    title                 = u'RedAragon'
+    description           = u'Guia de ocio desde Aragon'
+    __author__            = 'desUBIKado'
+    publisher             = 'Grupo Z'
+    category              = 'Concerts, Movies, Entertainment news'
+    language              = 'es'
+    encoding              = 'iso-8859-1'
+    cover_url             = 'http://www.redaragon.com/2008_img/logotipo.gif'
+    timefmt               = '[%a, %d %b, %Y]'
+    oldest_article        = 15
+    max_articles_per_feed = 100
+    use_embedded_content  = False
+    no_stylesheets        = True
+    remove_javascript     = True
+
+    feeds = [
+        (u'Conciertos', u'http://redaragon.com/rss/agenda.asp?tid=1'),
+        (u'Exposiciones', u'http://redaragon.com/rss/agenda.asp?tid=5'),
+        (u'Teatro', u'http://redaragon.com/rss/agenda.asp?tid=10'),
+        (u'Conferencias', u'http://redaragon.com/rss/agenda.asp?tid=2'),
+        (u'Ferias', u'http://redaragon.com/rss/agenda.asp?tid=6'),
+        (u'Filmotecas/Cineclubs', u'http://redaragon.com/rss/agenda.asp?tid=7'),
+        (u'Presentaciones', u'http://redaragon.com/rss/agenda.asp?tid=9'),
+        (u'Fiestas', u'http://redaragon.com/rss/agenda.asp?tid=11'),
+        (u'Infantil', u'http://redaragon.com/rss/agenda.asp?tid=13'),
+        (u'Otros', u'http://redaragon.com/rss/agenda.asp?tid=8'),
+    ]
+
+    keep_only_tags     = [dict(name='div', attrs={'id':'FichaEventoAgenda'})]
+    remove_tags        = [dict(name='div', attrs={'class':['Comparte','CajaAgenda','Caja','Cintillo']})]
+    remove_tags_before = dict(name='div' , attrs={'id':'FichaEventoAgenda'})
+    remove_tags_after  = dict(name='div' , attrs={'class':'Cintillo'})
diff --git a/resources/recipes/salon.recipe b/resources/recipes/salon.recipe
index ed7ec98f10..c421ab094d 100644
--- a/resources/recipes/salon.recipe
+++ b/resources/recipes/salon.recipe
@@ -25,22 +25,20 @@ class Salon_com(BasicNewsRecipe):
feeds = [
('News & Politics', 'http://feeds.salon.com/salon/news'),
- ('War Room', 'http://feeds.salon.com/salon/war_room'),
- ('Arts & Entertainment', 'http://feeds.salon.com/salon/ent'),
- ('I Like to Watch', 'http://feeds.salon.com/salon/iltw'),
- ('Beyond Multiplex', 'http://feeds.salon.com/salon/btm'),
- ('Book Reviews', 'http://feeds.salon.com/salon/books'),
- ('All Life', 'http://feeds.salon.com/salon/mwt'),
- ('All Opinion', 'http://feeds.salon.com/salon/opinion'),
- ('Glenn Greenwald', 'http://feeds.salon.com/salon/greenwald'),
- ('Garrison Keillor', 'http://dir.salon.com/topics/garrison_keillor/index.rss'),
- ('Joan Walsh', 'http://www.salon.com/rss/walsh.rss'),
- ('All Sports', 'http://feeds.salon.com/salon/sports'),
+ ('War Room', 'http://feeds.feedburner.com/salon/war_room'),
+ ('Joan Walsh', 'http://feeds.feedburner.com/Salon_Joan_Walsh'),
+ ('Glenn Greenwald', 'http://feeds.feedburner.com/salon/greenwald'),
('Tech & Business', 'http://feeds.salon.com/salon/tech'),
- ('How World Works', 'http://feeds.salon.com/salon/htww')
+ ('Ask the Pilot', 'http://feeds.feedburner.com/salon/ask_the_pilot'),
+ ('How World Works', 'http://feeds.feedburner.com/salon/htww'),
+ ('Life', 'http://feeds.feedburner.com/salon/mwt'),
+ ('Broadsheet', 'http://feeds.feedburner.com/salon/broadsheet'),
+ ('Movie Reviews', 'http://feeds.feedburner.com/salon/movie_reviews'),
+ ('Film Salon', 'http://feeds.feedburner.com/Salon/Film_Salon'),
+ ('TV', 'http://feeds.feedburner.com/salon/tv'),
+ ('Books', 'http://feeds.feedburner.com/salon/books')
]
def print_version(self, url):
return url.replace('/index.html', '/print.html')
-
diff --git a/resources/recipes/smith.recipe b/resources/recipes/smith.recipe
index e52b2ee709..98f7d98517 100644
--- a/resources/recipes/smith.recipe
+++ b/resources/recipes/smith.recipe
@@ -17,8 +17,8 @@ class SmithsonianMagazine(BasicNewsRecipe):
remove_tags = [
dict(name='iframe'),
dict(name='div', attrs={'class':'article_sidebar_border'}),
- dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
- #dict(name='ul', attrs={'class':'article-tools'}),
+ dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
+ ##dict(name='ul', attrs={'class':'article-tools'}),
dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
]
@@ -37,16 +37,16 @@ class SmithsonianMagazine(BasicNewsRecipe):
]
def preprocess_html(self, soup):
- story = soup.find(name='div', attrs={'id':'article-left'})
- #td = heading.findParent(name='td')
- #td.extract()
+ story = soup.find(name='div', attrs={'id':'article-body'})
+ ##td = heading.findParent(name='td')
+ ##td.extract()
soup = BeautifulSoup('t')
body = soup.find(name='body')
body.insert(0, story)
return soup
- def postprocess_html(self, soup, first):
- for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
- if not first:
- for div in soup.findAll(id='article-head'): div.extract()
- return soup
+ #def postprocess_html(self, soup, first):
+ #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
+ #if not first:
+ #for div in soup.findAll(id='article-head'): div.extract()
+ #return soup
diff --git a/resources/recipes/the_week_magazine_free.recipe b/resources/recipes/the_week_magazine_free.recipe
index 1bac4133e7..6e033eaf82 100644
--- a/resources/recipes/the_week_magazine_free.recipe
+++ b/resources/recipes/the_week_magazine_free.recipe
@@ -1,17 +1,19 @@
-
__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
+__copyright__ = '2010, JOlo'
'''
www.theweek.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
+import re
-class TheWeekFree(BasicNewsRecipe):
- title = 'The Week Magazine - Free content'
- __author__ = 'Darko Miletic'
+class TheWeek(BasicNewsRecipe):
+ title = 'The Week Magazine'
+ __author__ = 'Jim Olo'
description = "The best of the US and international media. Daily coverage of commentary and analysis of the day's events, as well as arts, entertainment, people and gossip, and political cartoons."
publisher = 'The Week Publications, Inc.'
+ masthead_url = 'http://test.theweek.com/images/logo_theweek.gif'
+ cover_url = masthead_url
category = 'news, politics, USA'
oldest_article = 7
max_articles_per_feed = 100
@@ -19,31 +21,27 @@ class TheWeekFree(BasicNewsRecipe):
encoding = 'utf-8'
use_embedded_content = False
language = 'en'
+ preprocess_regexps = [(re.compile(r'