Fix #1791 (Major recipe update)

This commit is contained in:
Kovid Goyal 2009-02-07 13:39:53 -08:00
parent 1c9c8870d2
commit 1145b768dc
31 changed files with 808 additions and 394 deletions

View File

@ -1,32 +1,39 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
ambito.com ambito.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Ambito(BasicNewsRecipe): class Ambito(BasicNewsRecipe):
title = 'Ambito.com' title = 'Ambito.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'Ambito.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False encoding = 'iso-8859-1'
encoding = 'iso--8859-1'
language = _('Spanish')
cover_url = 'http://www.ambito.com/img/logo_.jpg' cover_url = 'http://www.ambito.com/img/logo_.jpg'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link'])]
feeds = [ feeds = [
(u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' ) (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' )
,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' ) ,(u'Economia' , u'http://www.ambito.com/rss/noticias.asp?S=Econom%EDa' )
@ -43,3 +50,12 @@ class Ambito(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?') return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -7,25 +7,33 @@ b92.net
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe): class B92(BasicNewsRecipe):
title = u'B92' title = 'B92'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = _('Serbian')
description = 'Dnevne vesti iz Srbije i sveta' description = 'Dnevne vesti iz Srbije i sveta'
oldest_article = 7 oldest_article = 2
publisher = 'B92.net'
category = 'news, politics, Serbia'
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://static.b92.net/images/fp/logo.gif' cover_url = 'http://static.b92.net/images/fp/logo.gif'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
keep_only_tags = [ dict(name='div', attrs={'class':'sama_vest'}) ] keep_only_tags = [ dict(name='div', attrs={'class':'sama_vest'}) ]
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'B92'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [ feeds = [
(u'Vesti', u'http://www.b92.net/info/rss/vesti.xml') (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml')
@ -44,3 +52,16 @@ class B92(BasicNewsRecipe):
if biz: if biz:
nurl = 'http://www.b92.net/mobilni/biz/index.php?nav_id=' + article_id nurl = 'http://www.b92.net/mobilni/biz/index.php?nav_id=' + article_id
return nurl return nurl
def preprocess_html(self, soup):
soup.html['xml:lang'] = 'sr-Latn'
soup.html['lang'] = 'sr-Latn'
mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(name='img',align=True):
del item['align']
item.insert(0,'<br /><br />')
return soup
language = _('Serbian')

View File

@ -5,32 +5,49 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
blic.rs blic.rs
''' '''
import string,re
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Blic(BasicNewsRecipe): class Blic(BasicNewsRecipe):
title = u'Blic' title = u'Blic'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'Blic.rs online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' description = u'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
oldest_article = 7 publisher = 'RINGIER d.o.o.'
language = _('Serbian') category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.blic.rs/resources/images/header_back_tile.png' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Blic'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'single_news'}) ] keep_only_tags = [dict(name='div', attrs={'class':'single_news'})]
feeds = [ (u'Vesti', u'http://www.blic.rs/rssall.php')] feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')]
remove_tags = [dict(name=['object','link'])]
def print_version(self, url): def print_version(self, url):
start_url, question, rest_url = url.partition('?') start_url, question, rest_url = url.partition('?')
return u'http://www.blic.rs/_print.php?' + rest_url return u'http://www.blic.rs/_print.php?' + rest_url
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Serbian')

View File

@ -1,32 +1,36 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
clarin.com clarin.com
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Clarin(BasicNewsRecipe): class Clarin(BasicNewsRecipe):
title = 'Clarin' title = 'Clarin'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina y mundo' description = 'Noticias de Argentina y mundo'
publisher = 'Grupo Clarin'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
language = _('Spanish')
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg') cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg')
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Argentina' , '--publisher', publisher
, '--publisher', 'Grupo Clarin'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
dict(name='a' , attrs={'class':'Imp' }) dict(name='a' , attrs={'class':'Imp' })
,dict(name='div' , attrs={'class':'Perma' }) ,dict(name='div' , attrs={'class':'Perma' })
@ -49,3 +53,12 @@ class Clarin(BasicNewsRecipe):
rest = artl.partition('-0')[-1] rest = artl.partition('-0')[-1]
lmain = rest.partition('.')[0] lmain = rest.partition('.')[0]
return 'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain return 'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -5,38 +5,47 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
danas.rs danas.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Danas(BasicNewsRecipe): class Danas(BasicNewsRecipe):
title = 'Danas' title = u'Danas'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Dnevne novine sa vestima iz sveta, politike, ekonomije, kulture, sporta, Beograda, Novog Sada i cele Srbije.' description = 'Vesti'
publisher = 'Danas d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = False
remove_javascript = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.danas.rs/images/basic/danas.gif' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Danas'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'id':'left'}) ] keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'width_1_4' }) dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'class':'metaClanka' }) ,dict(name='div', attrs={'id':'comments'})
,dict(name='div', attrs={'id':'comments' }) ,dict(name=['object','link'])
,dict(name='div', attrs={'class':'baner' })
,dict(name='div', attrs={'class':'slikaClanka'})
] ]
feeds = [(u'Vesti', u'http://www.danas.rs/rss/rss.asp')] feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')]
def print_version(self, url): def preprocess_html(self, soup):
return url + '&action=print' mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Serbian')

View File

@ -5,32 +5,37 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
emol.com emol.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ElMercurio(BasicNewsRecipe): class ElMercurio(BasicNewsRecipe):
title = 'El Mercurio online' title = 'El Mercurio online'
language = _('Spanish')
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'El Mercurio'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif' cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'despliegue-txt_750px'}) dict(name='div', attrs={'class':'despliegue-txt_750px'})
,dict(name='div', attrs={'id':'div_cuerpo_participa'}) ,dict(name='div', attrs={'id':'div_cuerpo_participa'})
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'}) dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'})
,dict(name='div', attrs={'id':['div_centro_dn_opc','div_cabezera','div_secciones','div_contenidos','div_pie','nav']}) ,dict(name='div', attrs={'id':['div_centro_dn_opc','div_cabezera','div_secciones','div_contenidos','div_pie','nav']})
@ -46,3 +51,11 @@ class ElMercurio(BasicNewsRecipe):
,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7') ,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7')
] ]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
elargentino.com elargentino.com
''' '''
@ -12,20 +12,24 @@ class ElArgentino(BasicNewsRecipe):
title = 'ElArgentino.com' title = 'ElArgentino.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
language = _('Spanish') publisher = 'ElArgentino.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png' cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , 'ElArgentino.com' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'noprint' }) dict(name='div', attrs={'id':'noprint' })
,dict(name='div', attrs={'class':'encabezadoImprimir'}) ,dict(name='div', attrs={'class':'encabezadoImprimir'})
@ -50,7 +54,10 @@ class ElArgentino(BasicNewsRecipe):
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
soup.prettify() for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -12,35 +12,49 @@ class ElMundo(BasicNewsRecipe):
title = 'El Mundo' title = 'El Mundo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Spain' description = 'News from Spain'
language = _('Spanish') publisher = 'El Mundo'
category = 'news, politics, Spain'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso8859_15' encoding = 'iso8859_15'
cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif' cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Spain' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
keep_only_tags = [dict(name='div', attrs={'class':'noticia'})] html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [
dict(name='div', attrs={'id':['bloqueprincipal','noticia']})
,dict(name='div', attrs={'class':['contenido_noticia_01']})
]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google','video','herramientasarriba','contenido_noticia_02']}) dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' }) ,dict(name='div', attrs={'id':'modulo_multimedia' })
,dict(name=['object','script','link', 'a']) ,dict(name='ul', attrs={'class':'herramientas' })
,dict(name='ul', attrs={'class':'herramientas'}) ,dict(name=['object','link'])
] ]
feeds = [ feeds = [
(u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' ) (u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' )
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' ) ,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' )
,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' ) ,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' )
,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' ) ,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' )
,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' ) ,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' )
,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26') ,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26')
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
] ]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -8,25 +8,28 @@ estadao.com.br
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Estadao(BasicNewsRecipe): class Estadao(BasicNewsRecipe):
title = 'O Estado de S. Paulo' title = 'O Estado de S. Paulo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil in Portugese'
language = _('Portugese') publisher = 'O Estado de S. Paulo'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
cover_url = 'http://www.estadao.com.br/img/logo_estadao.png' cover_url = 'http://www.estadao.com.br/img/logo_estadao.png'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'c1'})] keep_only_tags = [dict(name='div', attrs={'id':'c1'})]
remove_tags = [ remove_tags = [
@ -52,4 +55,8 @@ class Estadao(BasicNewsRecipe):
ifr = soup.find('iframe') ifr = soup.find('iframe')
if ifr: if ifr:
ifr.extract() ifr.extract()
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Portugese')

View File

@ -7,37 +7,46 @@ granma.cubaweb.cu
''' '''
import urllib import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Granma(BasicNewsRecipe): class Granma(BasicNewsRecipe):
title = 'Diario Granma' title = 'Diario Granma'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = _('Spanish')
description = 'Organo oficial del Comite Central del Partido Comunista de Cuba' description = 'Organo oficial del Comite Central del Partido Comunista de Cuba'
publisher = 'Granma'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg' cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='table', attrs={'height':'466'})] keep_only_tags = [dict(name='table', attrs={'height':'466'})]
feeds = [(u'Noticias', u'http://www.granma.cubaweb.cu/noticias.xml' )] feeds = [(u'Noticias', u'http://www.granma.cubaweb.cu/noticias.xml' )]
def preprocess_html(self, soup): def preprocess_html(self, soup):
del soup.body.table['style'] mtag = '<meta http-equiv="Content-Language" content="es-CU"/>'
rtag = soup.find('td', attrs={'height':'458'}) soup.head.insert(0,mtag)
if rtag: for item in soup.findAll('table'):
del rtag['style'] if item.has_key('width'):
del item['width']
if item.has_key('height'):
del item['height']
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
harpers.org - paid subscription/ printed issue articles harpers.org - paid subscription/ printed issue articles
This recipe only get's article's published in text format This recipe only get's article's published in text format
@ -9,13 +9,15 @@ images and pdf's are ignored
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Harpers_full(BasicNewsRecipe): class Harpers_full(BasicNewsRecipe):
title = u"Harper's Magazine - articles from printed edition" title = u"Harper's Magazine - articles from printed edition"
__author__ = u'Darko Miletic' __author__ = u'Darko Miletic'
description = u"Harper's Magazine: Founded June 1850." description = u"Harper's Magazine: Founded June 1850."
language = _('English') publisher = "Harpers's"
category = 'news, politics, USA'
oldest_article = 30 oldest_article = 30
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
@ -26,6 +28,15 @@ class Harpers_full(BasicNewsRecipe):
INDEX = strftime('http://www.harpers.org/archive/%Y/%m') INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
LOGIN = 'http://www.harpers.org' LOGIN = 'http://www.harpers.org'
cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
remove_javascript = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
remove_tags = [ remove_tags = [
@ -60,3 +71,10 @@ class Harpers_full(BasicNewsRecipe):
,'description':'' ,'description':''
}) })
return [(soup.head.title.string, articles)] return [(soup.head.title.string, articles)]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('English')

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
infobae.com infobae.com
''' '''
@ -12,21 +12,23 @@ class Infobae(BasicNewsRecipe):
title = 'Infobae.com' title = 'Infobae.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'Infobae.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
cover_url = 'http://www.infobae.com/imgs/header/header.gif' cover_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , 'Infobae.com' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [ feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@ -39,3 +41,12 @@ class Infobae(BasicNewsRecipe):
main, sep, article_part = url.partition('contenidos/') main, sep, article_part = url.partition('contenidos/')
article_id, rsep, rrest = article_part.partition('-') article_id, rsep, rrest = article_part.partition('-')
return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -12,20 +12,24 @@ class JBOnline(BasicNewsRecipe):
title = 'Jornal Brasileiro Online' title = 'Jornal Brasileiro Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil'
publisher = 'Jornal Brasileiro'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
language = _('Portugese')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif' cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'corpoNoticia'})] keep_only_tags = [dict(name='div', attrs={'id':'corpoNoticia'})]
remove_tags = [dict(name=['script','object','form'])] remove_tags = [dict(name=['script','object','form'])]
@ -36,7 +40,8 @@ class JBOnline(BasicNewsRecipe):
ifr = soup.find('iframe') ifr = soup.find('iframe')
if ifr: if ifr:
ifr.extract() ifr.extract()
item = soup.find('div', attrs={'id':'corpoNoticia'}) for item in soup.findAll(style=True):
if item:
del item['style'] del item['style']
return soup return soup
language = _('Portugese')

View File

@ -6,28 +6,35 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
jutarnji.hr jutarnji.hr
''' '''
import string, re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Jutarnji(BasicNewsRecipe): class Jutarnji(BasicNewsRecipe):
title = 'Jutarnji' title = u'Jutarnji'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'Online izdanje Jutarnjeg lista' description = u'Hrvatski portal'
publisher = 'Jutarnji.hr'
category = 'news, politics, Croatia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
simultaneous_downloads = 1 simultaneous_downloads = 1
delay = 1 delay = 1
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_javascript = True
encoding = 'cp1250' encoding = 'cp1250'
cover_url = 'http://www.jutarnji.hr/EPHResources/Images/2008/06/05/jhrlogo.png' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Croatia' , '--publisher', publisher
, '--publisher', 'Europapress holding d.o.o.'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [ remove_tags = [
@ -49,11 +56,16 @@ class Jutarnji(BasicNewsRecipe):
# NOTE(review): this looks like a two-column diff rendering -- each line holds
# the old text followed by the new text.
# print_version keeps the part of the article URL that precedes '.jl', takes
# whatever follows the last ',' in that part, and appends it to the
# ispis_clanka.jl?artid= address.
# The two columns differ only in the u'' prefix on the returned literal.
def print_version(self, url): def print_version(self, url):
main, split, rest = url.partition('.jl') main, split, rest = url.partition('.jl')
rmain, rsplit, rrest = main.rpartition(',') rmain, rsplit, rrest = main.rpartition(',')
return u'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + rrest return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + rrest
# NOTE(review): this looks like a two-column diff rendering -- lines with two
# statements hold the old text followed by the new text; lines with one
# statement exist only in the new version.
# Both versions insert a utf-8 Content-Type meta string at position 0 of
# soup.head and return the soup.
# Old version only: calls soup.prettify() and discards the result.
# New version only: inserts a Content-Language "hr" meta string, again at
# position 0 (so it lands ahead of the Content-Type one), then deletes the
# 'style' and 'width' attributes from every element that carries them.
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
soup.prettify() mtag = '<meta http-equiv="Content-Language" content="hr"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(width=True):
del item['width']
return soup return soup

View File

@ -13,21 +13,25 @@ class Juventudrebelde(BasicNewsRecipe):
title = 'Juventud Rebelde' title = 'Juventud Rebelde'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Diario de la Juventud Cubana' description = 'Diario de la Juventud Cubana'
publisher = 'Juventud rebelde'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg') cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg')
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})] keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
feeds = [ feeds = [
@ -40,4 +44,11 @@ class Juventudrebelde(BasicNewsRecipe):
,(u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura' ) ,(u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura' )
] ]
def preprocess_html(self, soup):
    '''
    Declare the page language as es-CU and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="es-CU"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Spanish')

View File

@ -5,7 +5,6 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
juventudrebelde.co.cu juventudrebelde.co.cu
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -13,22 +12,33 @@ class Juventudrebelde_english(BasicNewsRecipe):
title = 'Juventud Rebelde in english' title = 'Juventud Rebelde in english'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'The newspaper of Cuban Youth' description = 'The newspaper of Cuban Youth'
language = _('English') publisher = 'Juventud Rebelde'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'read'})] keep_only_tags = [dict(name='div', attrs={'class':'read'})]
feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/' )] feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/' )]
def preprocess_html(self, soup):
    '''
    Declare the page language as English and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        # This is the English-language recipe, so the declared content
        # language is "en" rather than the "es-CU" of the Spanish edition.
        head.insert(0, '<meta http-equiv="Content-Language" content="en"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('English')

View File

@ -11,25 +11,28 @@ from calibre.web.feeds.news import BasicNewsRecipe
class LaCuarta(BasicNewsRecipe): class LaCuarta(BasicNewsRecipe):
title = 'La Cuarta' title = 'La Cuarta'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'La Cuarta Cibernetica: El Diario popular'
publisher = 'CODISA, Consorcio Digital S.A.'
category = 'news, politics, entertainment, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'articulo desplegado'}) ] keep_only_tags = [dict(name='div', attrs={'class':'articulo desplegado'}) ]
remove_tags = [ remove_tags = [
dict(name='script') dict(name='ul')
,dict(name='ul')
,dict(name='div', attrs={'id':['toolbox','articleImageDisplayer','enviarAmigo']}) ,dict(name='div', attrs={'id':['toolbox','articleImageDisplayer','enviarAmigo']})
,dict(name='div', attrs={'class':['par ad-1','par ad-2']}) ,dict(name='div', attrs={'class':['par ad-1','par ad-2']})
,dict(name='input') ,dict(name='input')
@ -37,7 +40,14 @@ class LaCuarta(BasicNewsRecipe):
,dict(name='strong', text='PUBLICIDAD') ,dict(name='strong', text='PUBLICIDAD')
] ]
def preprocess_html(self, soup):
    '''
    Declare the page language as es-CL and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="es-CL"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')] feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')]
language = _('Spanish')

View File

@ -12,21 +12,24 @@ class LaSegunda(BasicNewsRecipe):
title = 'La Segunda' title = 'La Segunda'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
language = _('Spanish') publisher = 'La Segunda'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif' cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='table')] keep_only_tags = [dict(name='table')]
feeds = [ feeds = [
@ -46,3 +49,13 @@ class LaSegunda(BasicNewsRecipe):
rest, sep, article_id = url.partition('index.asp?idnoticia=') rest, sep, article_id = url.partition('index.asp?idnoticia=')
return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id
def preprocess_html(self, soup):
    '''
    Declare the page language as es-CL and strip presentational attributes.

    A Content-Language meta string is inserted as the first child of
    soup.head. The 'width' attribute is then deleted from every <table>
    that has one, and the 'style' attribute from every element that has
    one.

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="es-CL"/>')
    for table in soup.findAll(name='table', width=True):
        del table['width']
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Spanish')

View File

@ -12,20 +12,24 @@ class LaTercera(BasicNewsRecipe):
title = 'La Tercera' title = 'La Tercera'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'La Tercera'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
keep_only_tags = [dict(name='div', attrs={'class':'span-16 articulo border'}) ] html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':['span-16 articulo border','span-16 border','span-16']}) ]
remove_tags = [ remove_tags = [
dict(name='script') dict(name='script')
@ -50,4 +54,11 @@ class LaTercera(BasicNewsRecipe):
,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657') ,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657')
] ]
def preprocess_html(self, soup):
    '''
    Declare the page language as es-CL and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="es-CL"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Spanish')

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
lanacion.com.ar lanacion.com.ar
''' '''
@ -11,20 +11,23 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Lanacion(BasicNewsRecipe): class Lanacion(BasicNewsRecipe):
title = 'La Nacion' title = 'La Nacion'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion actualizada las 24 horas, con noticias de Argentina y del mundo - Informate ya!' description = 'Noticias de Argentina y el resto del mundo'
publisher = 'La Nacion'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_javascript = True
no_stylesheets = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Argentina' , '--publisher', publisher
, '--publisher', 'La Nacion SA'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})] keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})]
remove_tags = [ remove_tags = [
dict(name='div' , attrs={'class':'notaComentario floatFix noprint' }) dict(name='div' , attrs={'class':'notaComentario floatFix noprint' })
@ -47,11 +50,11 @@ class Lanacion(BasicNewsRecipe):
,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' ) ,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' )
] ]
# NOTE(review): this looks like a two-column diff rendering in which two
# different definitions share the same lines.
# Left column (old text): get_cover_url loads the lanacion.com.ar front page
# via self.index_to_soup, looks for an <img class="logo"> and, when found,
# returns the site address joined with the image's src; otherwise None.
# Right column (new text): preprocess_html inserts a Content-Language "es-AR"
# meta string at position 0 of soup.head, deletes the 'style' attribute from
# every element that has one, and returns the soup. The trailing
# language = _('Spanish') follows the return in the right column; it is not
# part of the method body.
def get_cover_url(self): def preprocess_html(self, soup):
index = 'http://www.lanacion.com.ar' mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
cover_url = None soup.head.insert(0,mtag)
soup = self.index_to_soup(index) for item in soup.findAll(style=True):
cover_item = soup.find('img',attrs={'class':'logo'}) del item['style']
if cover_item: return soup
cover_url = index + cover_item['src']
return cover_url language = _('Spanish')

View File

@ -13,20 +13,24 @@ class LaNacionChile(BasicNewsRecipe):
title = 'La Nacion Chile' title = 'La Nacion Chile'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'La Nacion'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif' cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'bloque'})] keep_only_tags = [dict(name='div', attrs={'class':'bloque'})]
feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')] feeds = [(u'Noticias', u'http://www.lanacion.cl/rss.xml')]
@ -41,5 +45,10 @@ class LaNacionChile(BasicNewsRecipe):
item = soup.find('a', attrs={'href':'javascript:window.close()'}) item = soup.find('a', attrs={'href':'javascript:window.close()'})
if item: if item:
item.extract() item.extract()
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
laprensa.com.ar laprensa.com.ar
''' '''
@ -13,20 +13,24 @@ class LaPrensa(BasicNewsRecipe):
title = 'La Prensa' title = 'La Prensa'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'La Prensa'
category = 'news, politics, Argentina'
oldest_article = 7 oldest_article = 7
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif' cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [ feeds = [
(u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' ) (u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' )
,(u'Economia' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=5' ) ,(u'Economia' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=5' )
@ -47,5 +51,10 @@ class LaPrensa(BasicNewsRecipe):
# NOTE(review): this looks like a two-column diff rendering -- lines with two
# statements hold the old text followed by the new text; lines with one
# statement exist only in the new version.
# Both versions delete the 'onload' attribute from soup.body and return soup.
# New version only: inserts a Content-Language "es-AR" meta string at
# position 0 of soup.head and deletes the 'style' attribute from every
# element that has one.
def preprocess_html(self, soup): def preprocess_html(self, soup):
del soup.body['onload'] del soup.body['onload']
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -7,15 +7,17 @@ nin.co.yu
''' '''
import re, urllib import re, urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Nin(BasicNewsRecipe): class Nin(BasicNewsRecipe):
title = 'NIN online' title = 'NIN online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Nedeljne informativne novine' description = 'Nedeljne informativne novine'
publisher = 'NIN'
category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
oldest_article = 15 oldest_article = 15
language = _('Serbian')
simultaneous_downloads = 1 simultaneous_downloads = 1
delay = 1 delay = 1
encoding = 'utf8' encoding = 'utf8'
@ -23,12 +25,18 @@ class Nin(BasicNewsRecipe):
PREFIX = 'http://www.nin.co.yu' PREFIX = 'http://www.nin.co.yu'
INDEX = PREFIX + '/?change_lang=ls' INDEX = PREFIX + '/?change_lang=ls'
LOGIN = PREFIX + '/?logout=true' LOGIN = PREFIX + '/?logout=true'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, politics, Serbia' , '--category', category
, '--publisher' , 'NIN' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
def get_browser(self): def get_browser(self):
@ -54,3 +62,12 @@ class Nin(BasicNewsRecipe):
if link_item: if link_item:
cover_url = self.PREFIX + link_item['src'] cover_url = self.PREFIX + link_item['src']
return cover_url return cover_url
def preprocess_html(self, soup):
    '''
    Declare the page language as sr-Latn-RS and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Serbian')

View File

@ -5,32 +5,45 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
novosti.rs novosti.rs
''' '''
import string,re
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Novosti(BasicNewsRecipe): class Novosti(BasicNewsRecipe):
title = 'Vecernje Novosti' title = u'Vecernje Novosti'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'novosti, vesti, politika, dosije, drustvo, ekonomija, hronika, reportaze, svet, kultura, sport, beograd, regioni, mozaik, feljton, intrvju, pjer, fudbal, kosarka, podvig, arhiva, komentari, kolumne, srbija, republika srpska,Vecernje novosti' description = u'Vesti'
publisher = 'Kompanija Novosti'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8'
remove_javascript = True
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Novosti AD'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'jednaVest'}) ] keep_only_tags = [dict(name='div', attrs={'class':'jednaVest'})]
remove_tags_after = dict(name='div', attrs={'class':'info_bottom'}) remove_tags = [dict(name='div', attrs={'class':['info','info_bottom','clip_div']})]
remove_tags = [
dict(name='div', attrs={'class':'info'})
,dict(name='div', attrs={'class':'info_bottom'})
]
feeds = [ (u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')] feeds = [(u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')]
def preprocess_html(self, soup):
    '''
    Declare the page language as sr-Latn and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="sr-Latn"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Serbian')

View File

@ -6,35 +6,55 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
nspm.rs nspm.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Nspm(BasicNewsRecipe): class Nspm(BasicNewsRecipe):
title = u'Nova srpska politicka misao' title = u'Nova srpska politicka misao'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Casopis za politicku teoriju i drustvena istrazivanja' description = 'Casopis za politicku teoriju i drustvena istrazivanja'
publisher = 'NSPM'
category = 'news, politics, Serbia'
oldest_article = 7 oldest_article = 7
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
INDEX = 'http://www.nspm.rs/?alphabet=l' INDEX = 'http://www.nspm.rs/?alphabet=l'
cover_url = 'http://nspm.rs/templates/jsn_epic_pro/images/logol.jpg' encoding = 'utf8'
remove_javascript = True
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, politics, Serbia' , '--publisher', publisher
, '--publisher', 'IIC NSPM' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [dict(name='a')]
# NOTE(review): this looks like a two-column diff rendering -- each line holds
# the old text followed by the identical new text (unchanged context).
# get_browser obtains a browser from BasicNewsRecipe.get_browser(), opens
# self.INDEX with it, and returns the browser.
# NOTE(review): presumably the INDEX visit is made for a side effect on the
# browser session (the URL carries ?alphabet=l) -- confirm.
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
br.open(self.INDEX) br.open(self.INDEX)
return br return br
feeds = [ (u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')] feeds = [(u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')]
# NOTE(review): this looks like a two-column diff rendering -- each line holds
# the old text followed by the identical new text (unchanged context).
# print_version returns the article URL with every '.html' replaced by
# '/stampa.html'.
def print_version(self, url): def print_version(self, url):
return url.replace('.html','/stampa.html') return url.replace('.html','/stampa.html')
def preprocess_html(self, soup):
    '''
    Mark the page as sr-Latn-RS and strip inline styling.

    Sets the 'xml:lang' and 'lang' attributes on soup.html, rewrites the
    'content' of an existing Content-Language meta tag when one is found,
    then deletes the 'style' attribute from every element returned by
    soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    lang = 'sr-Latn-RS'
    root = soup.html
    # A page without an <html> element yields None here; skip the
    # attribute assignment instead of raising and losing the article.
    if root is not None:
        root['xml:lang'] = lang
        root['lang'] = lang
    meta = soup.find('meta', attrs={'http-equiv': 'Content-Language'})
    if meta is not None:
        meta['content'] = lang
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Serbian')

View File

@ -12,20 +12,24 @@ class OGlobo(BasicNewsRecipe):
title = 'O Globo' title = 'O Globo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil'
publisher = 'O Globo'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
language = _('Portugese')
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://oglobo.globo.com/_img/o-globo.png' cover_url = 'http://oglobo.globo.com/_img/o-globo.png'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'ltintb'})] keep_only_tags = [dict(name='div', attrs={'id':'ltintb'})]
remove_tags = [ remove_tags = [
@ -56,3 +60,10 @@ class OGlobo(BasicNewsRecipe):
,(u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml') ,(u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml')
,(u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml') ,(u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml')
] ]
def preprocess_html(self, soup):
    '''
    Strip inline styling from the page.

    Deletes the 'style' attribute from every element returned by
    soup.findAll(style=True) and returns the same soup object.
    '''
    styled_tags = soup.findAll(style=True)
    for tag in styled_tags:
        del tag['style']
    return soup
language = _('Portugese')

View File

@ -1,32 +1,37 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
pagina12.com.ar pagina12.com.ar
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe): class Pagina12(BasicNewsRecipe):
title = u'Pagina/12' title = u'Pagina/12'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina y el resto del mundo' description = 'Noticias de Argentina y el resto del mundo'
language = _('Spanish') publisher = 'La Pagina S.A.'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg') cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg')
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , 'La Pagina S.A.' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'volver'}) dict(name='div', attrs={'id':'volver'})
@ -38,3 +43,12 @@ class Pagina12(BasicNewsRecipe):
# NOTE(review): this looks like a two-column diff rendering -- each line holds
# the old text followed by the identical new text (unchanged context).
# print_version rewrites the article URL by replacing the site root
# 'http://www.pagina12.com.ar/' with 'http://www.pagina12.com.ar/imprimir/'.
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
def preprocess_html(self, soup):
    '''
    Declare the page language as es-AR and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="es-AR"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Spanish')

View File

@ -6,31 +6,53 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
pescanik.net pescanik.net
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Pescanik(BasicNewsRecipe): class Pescanik(BasicNewsRecipe):
title = 'Pescanik' title = 'Pescanik'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Pescanik' description = 'Pescanik'
publisher = 'Pescanik'
category = 'news, politics, Serbia'
oldest_article = 7 oldest_article = 7
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
html2lrf_options = ['--base-font-size', '10'] remove_javascript = True
html2epub_options = 'base_font_size = "10pt"' encoding = 'utf8'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png" cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png"
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags_after = dict(name='div', attrs={'class':'article_seperator'}) remove_tags = [
dict(name='td' , attrs={'class':'buttonheading'})
remove_tags = [dict(name='td' , attrs={'class':'buttonheading'})] ,dict(name='span', attrs={'class':'article_seperator'})
,dict(name=['object','link'])
]
feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')] feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')]
# NOTE(review): this looks like a two-column diff rendering -- each line holds
# the old text followed by the new text.
# print_version swaps index.php for index2.php in the article URL and appends
# '&pop=1&page=0'.
# Old text matches the full 'http://pescanik.net/index.php' prefix; new text
# matches only the '/index.php' path fragment.
def print_version(self, url): def print_version(self, url):
nurl = url.replace('http://pescanik.net/index.php','http://pescanik.net/index2.php') nurl = url.replace('/index.php','/index2.php')
return nurl + '&pop=1&page=0' return nurl + '&pop=1&page=0'
def preprocess_html(self, soup):
    '''
    Declare the page language as sr-Latn-RS and strip inline styling.

    A Content-Language meta string is inserted as the first child of
    soup.head, then the 'style' attribute is deleted from every element
    returned by soup.findAll(style=True).

    soup -- parsed article page; modified in place.
    Returns the same soup object.
    '''
    head = soup.head
    # A page without a <head> yields None here; skip the meta insertion
    # instead of raising AttributeError and losing the whole article.
    if head is not None:
        head.insert(0, '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>')
    for styled in soup.findAll(style=True):
        del styled['style']
    return soup
language = _('Serbian')

View File

@ -5,37 +5,61 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
politika.rs politika.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Politika(BasicNewsRecipe): class Politika(BasicNewsRecipe):
title = 'Politika Online' title = u'Politika Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Najstariji dnevni list na Balkanu' description = 'Najstariji dnevni list na Balkanu'
publisher = 'Politika novine i Magazini d.o.o'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
extra_css = '.content_center_border {text-align: left;}'
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.politika.rs:8080/images/politika.gif' remove_javascript = True
encoding = 'utf8'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'POLITIKA NOVINE I MAGAZINI d.o.o.'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'contentcenter'}) ] keep_only_tags = [dict(name='div', attrs={'class':'content_center_border'})]
remove_tags_after = dict(name='div', attrs={'class':'datum_item_details'})
remove_tags = [
dict(name='div', attrs={'class':['send_print','txt-komentar']})
,dict(name=['object','link','a'])
,dict(name='h1', attrs={'class':'box_header-tags'})
]
feeds = [ feeds = [
(u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' ) (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' )
,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' ) ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' )
,(u'Redakcijski komentari', u'http://www.politika.rs/rubrike/redakcijski-komentari/index.1.lt.xml')
,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' ) ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' )
,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' ) ,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' )
,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' ) ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' )
,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' ) ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' )
,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' ) ,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' )
] ]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
ftag = soup.find('div',attrs={'class':'content_center_border'})
if ftag:
ftag['align'] = 'left'
return soup

View File

@ -1,13 +1,13 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
vijesti.cg.yu vijesti.cg.yu
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -15,24 +15,35 @@ class Vijesti(BasicNewsRecipe):
title = 'Vijesti' title = 'Vijesti'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Montenegro' description = 'News from Montenegro'
oldest_article = 2 publisher = 'Daily Press Vijesti'
language = _('Serbian') category = 'news, politics, Montenegro'
oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False remove_javascript = True
encoding = 'cp1250' encoding = 'cp1250'
cover_url = 'http://www.vijesti.cg.yu/img/logo.gif' cover_url = 'http://www.vijesti.cg.yu/img/logo.gif'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Montenegro'
, '--publisher' , 'Daily Press Vijesti'
]
keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})] keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})]
remove_tags = [
dict(name='div', attrs={'align':'right'})
,dict(name=['object','link'])
]
feeds = [(u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php' )] feeds = [(u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php' )]
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -40,4 +51,10 @@ class Vijesti(BasicNewsRecipe):
soup.html['lang'] = 'sr-Latn-ME' soup.html['lang'] = 'sr-Latn-ME'
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>' mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
for item in soup.findAll('img'):
if item.has_key('align'):
del item['align']
item.insert(0,'<br /><br />')
return soup return soup
language = _('Serbian')

View File

@ -6,27 +6,34 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
vreme.com vreme.com
''' '''
import string,re import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Vreme(BasicNewsRecipe): class Vreme(BasicNewsRecipe):
title = 'Vreme' title = 'Vreme'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Politicki Nedeljnik Srbije' description = 'Politicki Nedeljnik Srbije'
publisher = 'Vreme d.o.o.'
category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
language = _('Serbian') remove_javascript = True
needs_subscription = True needs_subscription = True
INDEX = 'http://www.vreme.com' INDEX = 'http://www.vreme.com'
LOGIN = 'http://www.vreme.com/account/index.php' LOGIN = 'http://www.vreme.com/account/index.php'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, politics, Serbia' , '--publisher', publisher
, '--publisher', 'Vreme d.o.o.'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
def get_browser(self): def get_browser(self):
@ -68,9 +75,28 @@ class Vreme(BasicNewsRecipe):
}) })
return [(soup.head.title.string, articles)] return [(soup.head.title.string, articles)]
remove_tags = [
dict(name=['object','link'])
,dict(name='table',attrs={'xclass':'image'})
]
def print_version(self, url): def print_version(self, url):
return url + '&print=yes' return url + '&print=yes'
def preprocess_html(self, soup):
del soup.body['text' ]
del soup.body['bgcolor']
del soup.body['onload' ]
mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
tbl = soup.body.table
tbbb = soup.find('td')
if tbbb:
tbbb.extract()
tbl.extract()
soup.body.insert(0,tbbb)
return soup
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
@ -78,3 +104,5 @@ class Vreme(BasicNewsRecipe):
if cover_item: if cover_item:
cover_url = self.INDEX + cover_item['src'] cover_url = self.INDEX + cover_item['src']
return cover_url return cover_url
language = _('Serbian')