'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
+from time import strftime
class DerStandardRecipe(BasicNewsRecipe):
title = u'derStandard'
- __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira'
+ __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
description = u'Nachrichten aus Österreich'
publisher ='derStandard.at'
category = 'news, politics, nachrichten, Austria'
@@ -88,3 +89,41 @@ class DerStandardRecipe(BasicNewsRecipe):
for t in soup.findAll(['ul', 'li']):
t.name = 'div'
return soup
+
+ def get_cover_url(self):
+ highResolution = True
+
+ date = strftime("%Y/%Y%m%d")
+ # a past edition can be fetched instead by hard-coding its date, e.g.:
+ #date = '2012/20120503'
+
+ urlP1 = 'http://epaper.derstandarddigital.at/'
+ urlP2 = 'data_ep/STAN/' + date
+ urlP3 = '/V.B1/'
+ urlP4 = 'paper.htm'
+ urlHTML = urlP1 + urlP2 + urlP3 + urlP4
+
+ br = self.clone_browser(self.browser)
+ htmlF = br.open_novisit(urlHTML)
+ htmlC = htmlF.read()
+
+
+ # URL EXAMPLE: data_ep/STAN/2012/20120504/V.B1/pages/A3B6798F-2751-4D8D-A103-C5EF22F7ACBE.htm
+ # consists of urlP2 + urlP3 + 'pages/' + code
+ # 'pages/' has length 6, code has length 36
+
+ index = htmlC.find(urlP2) + len(urlP2 + urlP3) + 6
+ code = htmlC[index:index + 36]
+
+
+ # URL EXAMPLE HIGH RESOLUTION: http://epaper.derstandarddigital.at/data_ep/STAN/2012/20120504/pagejpg/A3B6798F-2751-4D8D-A103-C5EF22F7ACBE_b.png
+ # URL EXAMPLE LOW RESOLUTION: http://epaper.derstandarddigital.at/data_ep/STAN/2012/20120504/pagejpg/2AB52F71-11C1-4859-9114-CDCD79BEFDCB.png
+
+ urlPic = urlP1 + urlP2 + '/pagejpg/' + code
+
+ if highResolution:
+ urlPic = urlPic + '_b'
+
+ urlPic = urlPic + '.png'
+
+ return urlPic
diff --git a/recipes/drytooling_pl.recipe b/recipes/drytooling_pl.recipe
new file mode 100644
index 0000000000..bb05e1a25f
--- /dev/null
+++ b/recipes/drytooling_pl.recipe
@@ -0,0 +1,15 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BasicUserRecipe1337668045(BasicNewsRecipe):
+ title = u'Drytooling.com.pl'
+ masthead_url = 'http://drytooling.com.pl/images/drytooling-kindle.png'
+ cover_url = 'http://drytooling.com.pl/images/drytooling-kindle.png'
+ description = u'Drytooling.com.pl jest serwisem wspinaczki zimowej, alpinizmu i himalaizmu. Jeśli uwielbiasz zimę, nie możesz doczekać się aż wyciągniesz szpej z szafki i uderzysz w Tatry, Alpy, czy może Himalaje, to znajdziesz tutaj naprawdę dużo interesujących Cię treści! Zapraszamy!'
+ __author__ = u'Damian Granowski'
+ oldest_article = 100
+ max_articles_per_feed = 20
+ auto_cleanup = True
+
+ feeds = [(u'Newsy', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=4&format=raw'), (u'Artyku\u0142y', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=3&format=raw'), (u'Imprezy i zawody', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=5&format=raw'), (u'Baza G\xf3rska', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=6&format=raw'), (u'Wyprawy', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=7&format=raw'), (u'Newsy / alpinizm', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=12&format=raw'), (u'Newsy / klasyka zimowa', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=11&format=raw'), (u'Newsy / himalaizm', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=10&format=raw'), (u'Outdoor', u'http://drytooling.com.pl/index.php?option=com_ninjarsssyndicator&feed_id=8&format=raw')]
diff --git a/recipes/economico.recipe b/recipes/economico.recipe
new file mode 100644
index 0000000000..86a1e15975
--- /dev/null
+++ b/recipes/economico.recipe
@@ -0,0 +1,30 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Economico(BasicNewsRecipe):
+ title = u'Economico'
+ language = 'pt'
+ __author__ = 'Krittika Goyal'
+ oldest_article = 1 #days
+ max_articles_per_feed = 25
+ encoding = 'utf-8'
+ use_embedded_content = False
+
+ no_stylesheets = True
+ auto_cleanup = True
+
+
+ feeds = [
+('Ultima Hora',
+ 'http://economico.sapo.pt/rss/ultimas'),
+ ('Em Foco',
+ 'http://economico.sapo.pt/rss/emfoco'),
+ ('Mercados',
+ 'http://economico.sapo.pt/rss/mercados'),
+ ('Empresas',
+ 'http://economico.sapo.pt/rss/empresas'),
+ ('Economia',
+ 'http://economico.sapo.pt/rss/economia'),
+ ('Politica',
+ 'http://economico.sapo.pt/rss/politica'),
+]
+
diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index cc6bf4e42a..25e46892f8 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -20,7 +20,23 @@ class Economist(BasicNewsRecipe):
INDEX = 'http://www.economist.com/printedition'
description = ('Global news and current affairs from a European'
' perspective. Best downloaded on Friday mornings (GMT)')
- extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
+ extra_css = '''
+ .headline {font-size: x-large;}
+ h2 { font-size: small; }
+ h1 { font-size: medium; }
+ .pullquote {
+ float: right;
+ font-size: larger;
+ font-weight: bold;
+ font-style: italic;
+ page-break-inside:avoid;
+ border-bottom: 3px solid black;
+ border-top: 3px solid black;
+ width: 228px;
+ margin: 0px 0px 10px 15px;
+ padding: 7px 0px 9px;
+ }
+ '''
oldest_article = 7.0
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index 30bb2c4faa..a64310c252 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -20,7 +20,24 @@ class Economist(BasicNewsRecipe):
INDEX = 'http://www.economist.com/printedition'
description = ('Global news and current affairs from a European'
' perspective. Best downloaded on Friday mornings (GMT)')
- extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
+ extra_css = '''
+ .headline {font-size: x-large;}
+ h2 { font-size: small; }
+ h1 { font-size: medium; }
+ .pullquote {
+ float: right;
+ font-size: larger;
+ font-weight: bold;
+ font-style: italic;
+ page-break-inside:avoid;
+ border-bottom: 3px solid black;
+ border-top: 3px solid black;
+ width: 228px;
+ margin: 0px 0px 10px 15px;
+ padding: 7px 0px 9px;
+ }
+ '''
+
oldest_article = 7.0
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
diff --git a/recipes/el_mundo_today.recipe b/recipes/el_mundo_today.recipe
new file mode 100644
index 0000000000..7f558d10e7
--- /dev/null
+++ b/recipes/el_mundo_today.recipe
@@ -0,0 +1,43 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ElMundoTodayRecipe(BasicNewsRecipe):
+ title = 'El Mundo Today'
+ __author__ = 'atordo'
+ description = u'La actualidad del mañana'
+ category = 'Noticias, humor'
+ cover_url = 'http://www.elmundotoday.com/wp-content/themes/EarthlyTouch/images/logo.png'
+ oldest_article = 30
+ max_articles_per_feed = 60
+ auto_cleanup = False
+ no_stylesheets = True
+ remove_javascript = True
+ language = 'es'
+ use_embedded_content = False
+
+ preprocess_regexps = [
+ (re.compile(r'.*', re.DOTALL),
+ lambda match: ''),
+ #(re.compile(r'^\t{5}$'), lambda match: ''),
+ #(re.compile(r'\t{5}$'), lambda match: ''),
+ (re.compile(r'', re.DOTALL),
+ lambda match: ''),
+ ]
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'post-wrapper'})
+ ]
+
+ remove_attributes = [ 'href', 'title', 'alt' ]
+
+ extra_css = '''
+ .antetitulo{font-variant:small-caps; font-weight:bold} .articleinfo{font-size:small}
+ img{margin-bottom:0.4em; display:block; margin-left:auto; margin-right:auto}
+ '''
+
+ feeds = [('El Mundo Today', 'http://www.elmundotoday.com/feed/')]
+
+ def get_browser(self):
+ br = BasicNewsRecipe.get_browser(self)
+ br.set_handle_gzip(True)
+ return br
diff --git a/recipes/elektroda_pl.recipe b/recipes/elektroda_pl.recipe
index 55858020ad..34871ea04a 100644
--- a/recipes/elektroda_pl.recipe
+++ b/recipes/elektroda_pl.recipe
@@ -10,6 +10,7 @@ class Elektroda(BasicNewsRecipe):
category = 'electronics'
language = 'pl'
max_articles_per_feed = 100
+ no_stylesheets= True
remove_tags_before=dict(name='span', attrs={'class':'postbody'})
remove_tags_after=dict(name='td', attrs={'class':'spaceRow'})
remove_tags=[dict(name='a', attrs={'href':'#top'})]
diff --git a/recipes/elpais_impreso.recipe b/recipes/elpais_impreso.recipe
index b22a41dcec..ffa1033477 100644
--- a/recipes/elpais_impreso.recipe
+++ b/recipes/elpais_impreso.recipe
@@ -1,5 +1,6 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
+__copyright__ = '2010-2012, Darko Miletic '
'''
www.elpais.com
'''
@@ -7,23 +8,24 @@ www.elpais.com
from calibre.web.feeds.news import BasicNewsRecipe
class ElPais_RSS(BasicNewsRecipe):
- title = 'El Pais'
+ title = u'El País'
__author__ = 'Darko Miletic'
- description = 'el periodico global en Castellano'
+ description = u'Noticias de última hora sobre la actualidad en España y el mundo: política, economía, deportes, cultura, sociedad, tecnología, gente, opinión, viajes, moda, televisión, los blogs y las firmas de EL PAÍS. Además especiales, vídeos, fotos, audios, gráficos, entrevistas, promociones y todos los servicios de EL PAÍS.'
publisher = 'EDICIONES EL PAIS, S.L.'
category = 'news, politics, finances, world, spain'
oldest_article = 2
max_articles_per_feed = 200
no_stylesheets = True
- encoding = 'cp1252'
+ encoding = 'utf8'
use_embedded_content = False
language = 'es'
remove_empty_feeds = True
publication_type = 'newspaper'
- masthead_url = 'http://www.elpais.com/im/tit_logo.gif'
+ masthead_url = 'http://ep01.epimg.net/iconos/v1.x/v1.0/logos/cabecera_portada.png'
extra_css = """
- body{font-family: Georgia,"Times New Roman",Times,serif }
- h3{font-family: Arial,Helvetica,sans-serif}
+ h1{font-family: Georgia,"Times New Roman",Times,serif }
+ #subtitulo_noticia, .firma, .figcaption{font-size: small}
+ body{font-family: Arial,Helvetica,Garuda,sans-serif}
img{margin-bottom: 0.4em; display:block}
"""
@@ -34,49 +36,61 @@ class ElPais_RSS(BasicNewsRecipe):
, 'language' : language
}
- keep_only_tags = [dict(attrs={'class':['cabecera_noticia estirar','cabecera_noticia','','contenido_noticia']})]
- remove_tags = [
- dict(name=['meta','link','base','iframe','embed','object'])
- ,dict(attrs={'class':['info_complementa','estructura_2col_der','votos estirar','votos']})
- ,dict(attrs={'id':'utilidades'})
+ keep_only_tags = [
+ dict(attrs={'id':['titulo_noticia','subtitulo_noticia']})
+ ,dict(attrs={'class':['firma','columna_texto','entrevista_p_r']})
+ ]
+ remove_tags = [
+ dict(name=['meta','link','base','iframe','embed','object'])
+ ,dict(attrs={'class':'disposicion_vertical'})
]
- remove_tags_after = dict(attrs={'id':'utilidades'})
- remove_attributes = ['lang','border','width','height']
feeds = [
- (u'Lo ultimo' , u'http://www.elpais.com/rss/feed.html?feedId=17046')
- ,(u'America Latina' , u'http://www.elpais.com/rss/feed.html?feedId=17041')
- ,(u'Mexico' , u'http://www.elpais.com/rss/feed.html?feedId=17042')
- ,(u'Europa' , u'http://www.elpais.com/rss/feed.html?feedId=17043')
- ,(u'Estados Unidos' , u'http://www.elpais.com/rss/feed.html?feedId=17044')
- ,(u'Oriente proximo' , u'http://www.elpais.com/rss/feed.html?feedId=17045')
- ,(u'Espana' , u'http://www.elpais.com/rss/feed.html?feedId=1002' )
- ,(u'Andalucia' , u'http://www.elpais.com/rss/feed.html?feedId=17057')
- ,(u'Catalunia' , u'http://www.elpais.com/rss/feed.html?feedId=17059')
- ,(u'Comunidad Valenciana' , u'http://www.elpais.com/rss/feed.html?feedId=17061')
- ,(u'Madrid' , u'http://www.elpais.com/rss/feed.html?feedId=1016' )
- ,(u'Pais Vasco' , u'http://www.elpais.com/rss/feed.html?feedId=17062')
- ,(u'Galicia' , u'http://www.elpais.com/rss/feed.html?feedId=17063')
- ,(u'Opinion' , u'http://www.elpais.com/rss/feed.html?feedId=1003' )
- ,(u'Sociedad' , u'http://www.elpais.com/rss/feed.html?feedId=1004' )
- ,(u'Deportes' , u'http://www.elpais.com/rss/feed.html?feedId=1007' )
- ,(u'Cultura' , u'http://www.elpais.com/rss/feed.html?feedId=1008' )
- ,(u'Cine' , u'http://www.elpais.com/rss/feed.html?feedId=17052')
- ,(u'Literatura' , u'http://www.elpais.com/rss/feed.html?feedId=17053')
- ,(u'Musica' , u'http://www.elpais.com/rss/feed.html?feedId=17051')
- ,(u'Arte' , u'http://www.elpais.com/rss/feed.html?feedId=17060')
- ,(u'Tecnologia' , u'http://www.elpais.com/rss/feed.html?feedId=1005' )
- ,(u'Economia' , u'http://www.elpais.com/rss/feed.html?feedId=1006' )
- ,(u'Ciencia' , u'http://www.elpais.com/rss/feed.html?feedId=17068')
- ,(u'Salud' , u'http://www.elpais.com/rss/feed.html?feedId=17074')
- ,(u'Ocio' , u'http://www.elpais.com/rss/feed.html?feedId=17075')
- ,(u'Justicia y Leyes' , u'http://www.elpais.com/rss/feed.html?feedId=17069')
- ,(u'Guerras y conflictos' , u'http://www.elpais.com/rss/feed.html?feedId=17070')
- ,(u'Politica' , u'http://www.elpais.com/rss/feed.html?feedId=17073')
+ (u'Lo ultimo' , u'http://ep00.epimg.net/rss/tags/ultimas_noticias.xml')
+ ,(u'America Latina' , u'http://elpais.com/tag/rss/latinoamerica/a/' )
+ ,(u'Mexico' , u'http://elpais.com/tag/rss/mexico/a/' )
+ ,(u'Europa' , u'http://elpais.com/tag/rss/europa/a/' )
+ ,(u'Estados Unidos' , u'http://elpais.com/tag/rss/estados_unidos/a/' )
+ ,(u'Oriente proximo' , u'http://elpais.com/tag/rss/oriente_proximo/a/' )
+ ,(u'Andalucia' , u'http://ep00.epimg.net/rss/ccaa/andalucia.xml' )
+ ,(u'Catalunia' , u'http://ep00.epimg.net/rss/ccaa/catalunya.xml' )
+ ,(u'Comunidad Valenciana' , u'http://ep00.epimg.net/rss/ccaa/valencia.xml' )
+ ,(u'Madrid' , u'http://ep00.epimg.net/rss/ccaa/madrid.xml' )
+ ,(u'Pais Vasco' , u'http://ep00.epimg.net/rss/ccaa/paisvasco.xml' )
+ ,(u'Galicia' , u'http://ep00.epimg.net/rss/ccaa/galicia.xml' )
+ ,(u'Sociedad' , u'http://ep00.epimg.net/rss/sociedad/portada.xml' )
+ ,(u'Deportes' , u'http://ep00.epimg.net/rss/deportes/portada.xml' )
+ ,(u'Cultura' , u'http://ep00.epimg.net/rss/cultura/portada.xml' )
+ ,(u'Cine' , u'http://elpais.com/tag/rss/cine/a/' )
+ ,(u'Economía' , u'http://elpais.com/tag/rss/economia/a/' )
+ ,(u'Literatura' , u'http://elpais.com/tag/rss/libros/a/' )
+ ,(u'Musica' , u'http://elpais.com/tag/rss/musica/a/' )
+ ,(u'Arte' , u'http://elpais.com/tag/rss/arte/a/' )
+ ,(u'Medio Ambiente' , u'http://elpais.com/tag/rss/medio_ambiente/a/' )
+ ,(u'Tecnologia' , u'http://ep01.epimg.net/rss/tecnologia/portada.xml' )
+ ,(u'Ciencia' , u'http://ep00.epimg.net/rss/tags/c_ciencia.xml' )
+ ,(u'Salud' , u'http://elpais.com/tag/rss/salud/a/' )
+ ,(u'Ocio' , u'http://elpais.com/tag/rss/ocio/a/' )
+ ,(u'Justicia y Leyes' , u'http://elpais.com/tag/rss/justicia/a/' )
+ ,(u'Guerras y conflictos' , u'http://elpais.com/tag/rss/conflictos/a/' )
+ ,(u'Politica' , u'http://ep00.epimg.net/rss/politica/portada.xml' )
+ ,(u'Opinion' , u'http://ep01.epimg.net/rss/politica/opinion.xml' )
]
- def print_version(self, url):
- return url + '?print=1'
+ def get_article_url(self, article):
+ url = BasicNewsRecipe.get_article_url(self, article)
+ if url and (not('/album/' in url) and not('/futbol/partido/' in url)):
+ return url
+ self.log('Skipping non-article', url)
+ return None
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://elpais.com/')
+ for image in soup.findAll('img'):
+ if image['src'].endswith('elpaisTodayMiddle.jpg'):
+ sstr = image['src']
+ return sstr.replace('elpaisTodayMiddle.jpg', 'elpaisToday.jpg')
+ return None
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
diff --git a/recipes/endgadget_ja.recipe b/recipes/endgadget_ja.recipe
index 3c20380e9b..7eca0a6966 100644
--- a/recipes/endgadget_ja.recipe
+++ b/recipes/endgadget_ja.recipe
@@ -17,7 +17,25 @@ class EndgadgetJapan(BasicNewsRecipe):
no_stylesheets = True
language = 'ja'
encoding = 'utf-8'
- feeds = [(u'engadget', u'http://japanese.engadget.com/rss.xml')]
+ index = 'http://japanese.engadget.com/'
+ remove_javascript = True
+
+ remove_tags_before = dict(name="h1", attrs={'class':"post_title"})
+ remove_tags_after = dict(name='div', attrs={'class':'post_body'})
+
+ def parse_index(self):
+ feeds = []
+ newsarticles = []
+ soup = self.index_to_soup(self.index)
+ for topstories in soup.findAll('div',attrs={'class':'post_content'}):
+ itt = topstories.find('h4')
+ itema = itt.find('a',href=True)
+ newsarticles.append({
+ 'title' :itema.string
+ ,'date' :''
+ ,'url' :itema['href']
+ ,'description':''
+ })
+ feeds.append(('Latest Posts', newsarticles))
+ return feeds
- remove_tags_before = dict(name="div", attrs={'id':"content_wrap"})
- remove_tags_after = dict(name='h3', attrs={'id':'addcomments'})
diff --git a/recipes/folha.recipe b/recipes/folha.recipe
new file mode 100644
index 0000000000..bf5dc509a7
--- /dev/null
+++ b/recipes/folha.recipe
@@ -0,0 +1,82 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic '
+'''
+www.folha.uol.com.br
+'''
+import urllib
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Folha_de_s_paulo(BasicNewsRecipe):
+ title = u'Folha de São Paulo - portal'
+ __author__ = 'Darko Miletic'
+ description = 'Um Jornala a servicao do Brasil'
+ publisher = 'Folhapress'
+ category = 'news, politics, Brasil'
+ oldest_article = 2
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'cp1252'
+ use_embedded_content = False
+ language = 'pt_BR'
+ remove_empty_feeds = True
+ publication_type = 'newspaper'
+ masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags = [dict(name=['meta','link','base','iframe','embed','object'])]
+ keep_only_tags = [dict(attrs={'id':'articleNew'})]
+
+
+ feeds = [
+ (u'Poder' , u'http://feeds.folha.uol.com.br/poder/rss091.xml' )
+ ,(u'Mundo' , u'http://feeds.folha.uol.com.br/mundo/rss091.xml' )
+ ,(u'Mercado' , u'http://feeds.folha.uol.com.br/mercado/rss091.xml' )
+ ,(u'Cotidiano' , u'http://feeds.folha.uol.com.br/cotidiano/rss091.xml' )
+ ,(u'Esporte' , u'http://feeds.folha.uol.com.br/esporte/rss091.xml' )
+ ,(u'Ilustrada' , u'http://feeds.folha.uol.com.br/ilustrada/rss091.xml' )
+ ,(u'F5' , u'http://feeds.folha.uol.com.br/f5/rss091.xml' )
+ ,(u'Ciência' , u'http://feeds.folha.uol.com.br/ciencia/rss091.xml' )
+ ,(u'Tec' , u'http://feeds.folha.uol.com.br/tec/rss091.xml' )
+ ,(u'Ambiente' , u'http://feeds.folha.uol.com.br/ambiente/rss091.xml' )
+ ,(u'Bichos' , u'http://feeds.folha.uol.com.br/bichos/rss091.xml' )
+ ,(u'Celebridades' , u'http://feeds.folha.uol.com.br/celebridades/rss091.xml' )
+ ,(u'Comida' , u'http://feeds.folha.uol.com.br/comida/rss091.xml' )
+ ,(u'Equilibrio' , u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml' )
+ ,(u'Folhateen' , u'http://feeds.folha.uol.com.br/folhateen/rss091.xml' )
+ ,(u'Folhinha' , u'http://feeds.folha.uol.com.br/folhinha/rss091.xml' )
+ ,(u'Ilustrissima' , u'http://feeds.folha.uol.com.br/ilustrissima/rss091.xml' )
+ ,(u'Saber' , u'http://feeds.folha.uol.com.br/saber/rss091.xml' )
+ ,(u'Turismo' , u'http://feeds.folha.uol.com.br/turismo/rss091.xml' )
+ ,(u'Panel do Leitor', u'http://feeds.folha.uol.com.br/folha/paineldoleitor/rss091.xml')
+ ,(u'Publifolha' , u'http://feeds.folha.uol.com.br/folha/publifolha/rss091.xml' )
+ ,(u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml' )
+ ]
+
+ def get_article_url(self, article):
+ url = BasicNewsRecipe.get_article_url(self, article)
+ curl = url.partition('/*')[2]
+ return curl
+
+ def print_version(self, url):
+ return 'http://tools.folha.com.br/print?site=emcimadahora&url=' + urllib.quote_plus(url)
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.folha.uol.com.br/')
+ cont = soup.find('div', attrs={'id':'newspaper'})
+ if cont:
+ ai = cont.find('a', href='http://www1.folha.uol.com.br/fsp/')
+ if ai:
+ return ai.img['src']
+ return None
diff --git a/recipes/folhadesaopaulo.recipe b/recipes/folhadesaopaulo.recipe
index 5503f2ca1e..50e55a2990 100644
--- a/recipes/folhadesaopaulo.recipe
+++ b/recipes/folhadesaopaulo.recipe
@@ -8,7 +8,7 @@ from urllib2 import Request, urlopen, URLError
class FolhaOnline(BasicNewsRecipe):
THUMBALIZR_API = '' # ---->Get your at http://www.thumbalizr.com/ and put here
LANGUAGE = 'pt_br'
- language = 'pt'
+ language = 'pt_BR'
LANGHTM = 'pt-br'
ENCODING = 'cp1252'
ENCHTM = 'iso-8859-1'
diff --git a/recipes/folhadesaopaulo_sub.recipe b/recipes/folhadesaopaulo_sub.recipe
index 32dd347405..cd444b4682 100644
--- a/recipes/folhadesaopaulo_sub.recipe
+++ b/recipes/folhadesaopaulo_sub.recipe
@@ -14,7 +14,7 @@ class FSP(BasicNewsRecipe):
HOMEPAGE = 'http://www1.folha.uol.com.br/fsp/'
masthead_url = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'
- language = 'pt'
+ language = 'pt_BR'
no_stylesheets = True
max_articles_per_feed = 40
remove_javascript = True
diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe
index 7b0ccb4f55..dc90d79ed1 100644
--- a/recipes/gameplay_pl.recipe
+++ b/recipes/gameplay_pl.recipe
@@ -12,8 +12,8 @@ class Gameplay_pl(BasicNewsRecipe):
max_articles_per_feed = 100
remove_javascript= True
no_stylesheets= True
- keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
- remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
+ keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news', 'news_container']})]
+ remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi', 'news_tagi']}), dict(attrs={'usemap':'#map'}), dict(name='a', attrs={'class':['pin-it-button', 'twitter-share-button']})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
diff --git a/recipes/good_house_keeping.recipe b/recipes/good_house_keeping.recipe
index 1dc26e88e7..4501c1122a 100644
--- a/recipes/good_house_keeping.recipe
+++ b/recipes/good_house_keeping.recipe
@@ -8,12 +8,17 @@ class AdvancedUserRecipe1305547242(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
+ #auto_cleanup = True
remove_javascript = True
def print_version(self,url):
- segments = url.split('/')
- printURL = '/'.join(segments[0:3]) + '/print-this/' + '/'.join(segments[4:])
- return printURL
+ if '/tips-for-making-desserts?' in url:
+ return None
+ segments = url.split('/')
+ segments[-1] = segments[-1].split('?')[0]
+ segments[-1] +='?page=all'
+ printURL = '/'.join(segments[0:3]) + '/print-this/' + segments[-1]
+ return printURL
def preprocess_html(self, soup):
for alink in soup.findAll('a'):
@@ -22,10 +27,19 @@ class AdvancedUserRecipe1305547242(BasicNewsRecipe):
alink.replaceWith(tstr)
return soup
- feeds = [ (u'Recipes & Entertaining', u'http://www.goodhousekeeping.com/food/food-rss/?src=rss'),
- (u'Home & House', u'http://www.goodhousekeeping.com/home/home-rss/?src=rss'),
- (u'Diet & Health', u'http://www.goodhousekeeping.com/health/health-rss/?src=rss'),
- (u'Beauty & Style', u'http://www.goodhousekeeping.com/beauty/beauty-rss/?src=rss'),
- (u'Family & Pets', u'http://www.goodhousekeeping.com/family/family-rss/?src=rss'),
- (u'Saving Money', u'http://www.goodhousekeeping.com/money/money-rss/?src=rss'),
- ]
+
+ #feeds = [
+#(u'Food and Recipes', u'http://www.goodhousekeeping.com/rss/recipes/'),
+#]
+
+
+ feeds = [
+(u'Food and Recipes', u'http://www.goodhousekeeping.com/rss/recipes/'),
+(u'Home and Organizing', u'http://www.goodhousekeeping.com/rss/home/'),
+(u'Diet and Health', u'http://www.goodhousekeeping.com/rss/health/'),
+(u'Beauty and Anti-Aging', u'http://www.goodhousekeeping.com/rss/beauty/'),
+(u'Family and Relationships', u'http://www.goodhousekeeping.com/rss/family/'),
+(u'Holidays', u'http://www.goodhousekeeping.com/rss/holidays/'),
+(u'In the Test Kitchen', 'http://www.goodhousekeeping.com/rss/test-kitchen-blog/'),
+]
+
diff --git a/recipes/gosc_niedzielny.recipe b/recipes/gosc_niedzielny.recipe
index 6a99411244..59c8fc2f26 100644
--- a/recipes/gosc_niedzielny.recipe
+++ b/recipes/gosc_niedzielny.recipe
@@ -6,21 +6,20 @@ __copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
+from datetime import date
import re
class GN(BasicNewsRecipe):
EDITION = 0
__author__ = 'Piotr Kontek'
+ title = u'Gość niedzielny'
description = 'Weekly magazine'
encoding = 'utf-8'
no_stylesheets = True
language = 'pl'
remove_javascript = True
temp_files = []
- simultaneous_downloads = 1
- masthead_url = 'http://gosc.pl/files/11/03/12/949089_top.gif'
- title = u'Gość niedzielny'
articles_are_obfuscated = True
@@ -56,22 +55,28 @@ class GN(BasicNewsRecipe):
self.temp_files[-1].close()
return self.temp_files[-1].name
- def find_last_issue(self):
- soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny')
- #szukam zdjęcia i linka do porzedniego pełnego numeru
+ def find_last_issue(self, year):
+ soup = self.index_to_soup('http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/rok/' + str(year))
+
+ # look for the cover image and the link to the previous complete issue
first = True
for d in soup.findAll('div', attrs={'class':'l release_preview_l'}):
img = d.find('img')
if img != None:
a = img.parent
self.EDITION = a['href']
+ self.title = img['alt']
self.cover_url = 'http://www.gosc.pl' + img['src']
- if not first:
+ if year != date.today().year or not first:
break
first = False
def parse_index(self):
- self.find_last_issue()
+ year = date.today().year
+ self.find_last_issue(year)
+ ## if this is the first issue of the year, the previous year has to be fetched
+ if self.EDITION == 0:
+ self.find_last_issue(year-1)
soup = self.index_to_soup('http://www.gosc.pl' + self.EDITION)
feeds = []
#wstepniak
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 1f8147ba3d..36982788f1 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -12,13 +12,16 @@ class Gram_pl(BasicNewsRecipe):
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
- remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
- keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']})]
+ remove_tags= [dict(name='p', attrs={'class':['extraText', 'must-log-in']}), dict(attrs={'class':['el', 'headline', 'post-info', 'entry-footer clearfix']}), dict(name='div', attrs={'class':['twojaOcena', 'comment-body', 'comment-author vcard', 'comment-meta commentmetadata', 'tw_button', 'entry-comment-counter', 'snap_nopreview sharing robots-nocontent']}), dict(id=['igit_rpwt_css', 'comments', 'reply-title', 'igit_title'])]
+ keep_only_tags= [dict(name='div', attrs={'class':['main', 'arkh-postmetadataheader', 'arkh-postcontent', 'post', 'content', 'news_header', 'news_subheader', 'news_text']}), dict(attrs={'class':['contentheading', 'contentpaneopen']}), dict(name='article')]
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
- (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')]
+ (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),
+ (u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'),
+ #(u'Kolektyw- Moto Games', u'http://www.motogames.gram.pl/news.rss')
+ ]
- def parse_feeds (self):
- feeds = BasicNewsRecipe.parse_feeds(self)
+ def parse_feeds (self):
+ feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
if 'REKLAMA SKLEP' in article.title.upper() or u'ARTYKUŁ:' in article.title.upper():
@@ -56,4 +59,4 @@ class Gram_pl(BasicNewsRecipe):
for a in soup('a'):
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
a['href']=self.index + a['href']
- return soup
\ No newline at end of file
+ return soup
diff --git a/recipes/greenlinux_pl.recipe b/recipes/greenlinux_pl.recipe
deleted file mode 100644
index 3c5a3c8f20..0000000000
--- a/recipes/greenlinux_pl.recipe
+++ /dev/null
@@ -1,13 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-class GreenLinux(BasicNewsRecipe):
- title = u'GreenLinux.pl'
- __author__ = 'fenuks'
- category = 'IT'
- language = 'pl'
- cover_url = 'http://lh5.ggpht.com/_xd_6Y9kXhEc/S8tjyqlfhfI/AAAAAAAAAYU/zFNTp07ZQko/top.png'
- oldest_article = 15
- max_articles_per_feed = 100
- auto_cleanup = True
-
- feeds = [(u'Newsy', u'http://feeds.feedburner.com/greenlinux')]
diff --git a/recipes/grid_to.recipe b/recipes/grid_to.recipe
new file mode 100644
index 0000000000..a066219b24
--- /dev/null
+++ b/recipes/grid_to.recipe
@@ -0,0 +1,77 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheGrid(BasicNewsRecipe):
+ #: The title to use for the ebook
+ title = u'The Grid'
+
+ #: A couple of lines that describe the content this recipe downloads.
+ #: This will be used primarily in a GUI that presents a list of recipes.
+ description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
+ 'accessible voice for Toronto.')
+
+ #: The author of this recipe
+ __author__ = u'Yusuf W'
+
+ #: The language that the news is in. Must be an ISO-639 code either
+ #: two or three characters long
+ language = 'en_CA'
+
+ #: Publication type
+ #: Set to newspaper, magazine or blog
+ publication_type = 'newspaper'
+
+ #: Convenient flag to disable loading of stylesheets for websites
+ #: that have overly complex stylesheets unsuitable for conversion
+ #: to ebooks formats
+ #: If True stylesheets are not downloaded and processed
+ no_stylesheets = True
+
+ #: List of tags to be removed. Specified tags are removed from downloaded HTML.
+ remove_tags_before = dict(name='div', id='content')
+ remove_tags_after = dict(name='div', id='content')
+ remove_tags = [
+ dict(name='div', attrs={'class':'right-content pull-right'}),
+ dict(name='div', attrs={'class':'right-content'}),
+ dict(name='div', attrs={'class':'ftr-line'}),
+ dict(name='div', attrs={'class':'pull-right'}),
+ dict(name='div', id='comments'),
+ dict(name='div', id='tags')
+ ]
+
+ #: Keep only the specified tags and their children.
+ #keep_only_tags = [dict(name='div', id='content')]
+
+ cover_margins = (0, 0, '#ffffff')
+
+ INDEX = 'http://www.thegridto.com'
+
+ def get_cover_url(self):
+ soup = self.index_to_soup(self.INDEX)
+ cover_url = soup.find(attrs={'class':'article-block latest-issue'}).find('img')['src']
+
+ return cover_url
+
+ def parse_index(self):
+
+ # Get the latest issue
+ soup = self.index_to_soup(self.INDEX)
+ a = soup.find('div', attrs={'class': 'full-content stuff-ftr'}).findAll('a')[2]
+
+ # Parse the index of the latest issue
+ self.INDEX = self.INDEX + a['href']
+ soup = self.index_to_soup(self.INDEX)
+
+ feeds = []
+ for section in ['city', 'life', 'culture']:
+ section_class = 'left-content article-listing ' + section + ' pull-left'
+ div = soup.find(attrs={'class': section_class})
+
+ articles = []
+ for a in div.findAll(attrs={'class':'post-title'}):
+ title = self.tag_to_string(a)
+ url = a['href']
+
+ articles.append({'title': title, 'url': url, 'description':'', 'date':''})
+
+ feeds.append((section, articles))
+ return feeds
diff --git a/recipes/haaretz_en.recipe b/recipes/haaretz_en.recipe
index 4404624aff..ade32ae5ea 100644
--- a/recipes/haaretz_en.recipe
+++ b/recipes/haaretz_en.recipe
@@ -1,16 +1,15 @@
__license__ = 'GPL v3'
-__copyright__ = '2010, Darko Miletic '
+__copyright__ = '2010-2012, Darko Miletic '
'''
www.haaretz.com
'''
import re
-from calibre import strftime
-from time import gmtime
+import urllib
from calibre.web.feeds.news import BasicNewsRecipe
-class HaaretzPrint_en(BasicNewsRecipe):
- title = 'Haaretz - print edition'
+class Haaretz_en(BasicNewsRecipe):
+ title = 'Haaretz'
__author__ = 'Darko Miletic'
description = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
publisher = 'Haaretz'
@@ -21,10 +20,16 @@ class HaaretzPrint_en(BasicNewsRecipe):
encoding = 'utf8'
use_embedded_content = False
language = 'en_IL'
+ needs_subscription = True
+ remove_empty_feeds = True
publication_type = 'newspaper'
PREFIX = 'http://www.haaretz.com'
- masthead_url = PREFIX + '/images/logos/logoGrey.gif'
- extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
+ masthead_url = PREFIX + '/images/logos/HaaretzLogo.gif'
+ extra_css = """
+ body{font-family: Verdana,Arial,Helvetica,sans-serif }
+ h1, .articleBody {font-family: Georgia, serif}
+ .authorBar {font-size: small}
+ """
preprocess_regexps = [(re.compile(r'.*?', re.DOTALL|re.IGNORECASE),lambda match: '