Fix #1791 (Major recipe update)

This commit is contained in:
Kovid Goyal 2009-02-07 13:39:53 -08:00
parent 1c9c8870d2
commit 1145b768dc
31 changed files with 808 additions and 394 deletions

View File

@ -1,31 +1,38 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
ambito.com ambito.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Ambito(BasicNewsRecipe): class Ambito(BasicNewsRecipe):
title = 'Ambito.com' title = 'Ambito.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'Ambito.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False encoding = 'iso-8859-1'
encoding = 'iso--8859-1'
language = _('Spanish')
cover_url = 'http://www.ambito.com/img/logo_.jpg' cover_url = 'http://www.ambito.com/img/logo_.jpg'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'align':'justify'})]
remove_tags = [dict(name=['object','link'])]
feeds = [ feeds = [
(u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' ) (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' )
@ -43,3 +50,12 @@ class Ambito(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?') return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?')
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -7,25 +7,33 @@ b92.net
''' '''
import re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class B92(BasicNewsRecipe): class B92(BasicNewsRecipe):
title = u'B92' title = 'B92'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = _('Serbian')
description = 'Dnevne vesti iz Srbije i sveta' description = 'Dnevne vesti iz Srbije i sveta'
oldest_article = 7 oldest_article = 2
publisher = 'B92.net'
category = 'news, politics, Serbia'
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://static.b92.net/images/fp/logo.gif' cover_url = 'http://static.b92.net/images/fp/logo.gif'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
keep_only_tags = [ dict(name='div', attrs={'class':'sama_vest'}) ] keep_only_tags = [ dict(name='div', attrs={'class':'sama_vest'}) ]
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'B92'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [ feeds = [
(u'Vesti', u'http://www.b92.net/info/rss/vesti.xml') (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml')
@ -44,3 +52,16 @@ class B92(BasicNewsRecipe):
if biz: if biz:
nurl = 'http://www.b92.net/mobilni/biz/index.php?nav_id=' + article_id nurl = 'http://www.b92.net/mobilni/biz/index.php?nav_id=' + article_id
return nurl return nurl
def preprocess_html(self, soup):
soup.html['xml:lang'] = 'sr-Latn'
soup.html['lang'] = 'sr-Latn'
mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
for item in soup.findAll(name='img',align=True):
del item['align']
item.insert(0,'<br /><br />')
return soup
language = _('Serbian')

View File

@ -5,32 +5,49 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
blic.rs blic.rs
''' '''
import string,re
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Blic(BasicNewsRecipe): class Blic(BasicNewsRecipe):
title = u'Blic' title = u'Blic'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'Blic.rs online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' description = u'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
oldest_article = 7 publisher = 'RINGIER d.o.o.'
language = _('Serbian') category = 'news, politics, Serbia'
oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.blic.rs/resources/images/header_back_tile.png' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Blic'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'single_news'}) ] keep_only_tags = [dict(name='div', attrs={'class':'single_news'})]
feeds = [ (u'Vesti', u'http://www.blic.rs/rssall.php')] feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')]
remove_tags = [dict(name=['object','link'])]
def print_version(self, url): def print_version(self, url):
start_url, question, rest_url = url.partition('?') start_url, question, rest_url = url.partition('?')
return u'http://www.blic.rs/_print.php?' + rest_url return u'http://www.blic.rs/_print.php?' + rest_url
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Serbian')

View File

@ -1,31 +1,35 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
clarin.com clarin.com
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Clarin(BasicNewsRecipe): class Clarin(BasicNewsRecipe):
title = 'Clarin' title = 'Clarin'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina y mundo' description = 'Noticias de Argentina y mundo'
publisher = 'Grupo Clarin'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
language = _('Spanish')
use_embedded_content = False use_embedded_content = False
no_stylesheets = True no_stylesheets = True
cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg') cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg')
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Argentina' , '--publisher', publisher
, '--publisher', 'Grupo Clarin'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
dict(name='a' , attrs={'class':'Imp' }) dict(name='a' , attrs={'class':'Imp' })
@ -49,3 +53,12 @@ class Clarin(BasicNewsRecipe):
rest = artl.partition('-0')[-1] rest = artl.partition('-0')[-1]
lmain = rest.partition('.')[0] lmain = rest.partition('.')[0]
return 'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain return 'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -5,38 +5,47 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
danas.rs danas.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Danas(BasicNewsRecipe): class Danas(BasicNewsRecipe):
title = 'Danas' title = u'Danas'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Dnevne novine sa vestima iz sveta, politike, ekonomije, kulture, sporta, Beograda, Novog Sada i cele Srbije.' description = 'Vesti'
publisher = 'Danas d.o.o.'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = False
remove_javascript = True
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.danas.rs/images/basic/danas.gif' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Danas'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'id':'left'}) ] keep_only_tags = [dict(name='div', attrs={'id':'left'})]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'width_1_4' }) dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']})
,dict(name='div', attrs={'class':'metaClanka' }) ,dict(name='div', attrs={'id':'comments'})
,dict(name='div', attrs={'id':'comments' }) ,dict(name=['object','link'])
,dict(name='div', attrs={'class':'baner' })
,dict(name='div', attrs={'class':'slikaClanka'})
] ]
feeds = [(u'Vesti', u'http://www.danas.rs/rss/rss.asp')] feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')]
def print_version(self, url): def preprocess_html(self, soup):
return url + '&action=print' mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Serbian')

View File

@ -5,31 +5,36 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
emol.com emol.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class ElMercurio(BasicNewsRecipe): class ElMercurio(BasicNewsRecipe):
title = 'El Mercurio online' title = 'El Mercurio online'
language = _('Spanish')
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'El Mercurio'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif' cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [ keep_only_tags = [
dict(name='div', attrs={'class':'despliegue-txt_750px'}) dict(name='div', attrs={'class':'despliegue-txt_750px'})
,dict(name='div', attrs={'id':'div_cuerpo_participa'}) ,dict(name='div', attrs={'id':'div_cuerpo_participa'})
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'}) dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'})
@ -45,4 +50,12 @@ class ElMercurio(BasicNewsRecipe):
,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5') ,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5')
,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7') ,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7')
] ]
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -1,30 +1,34 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
elargentino.com elargentino.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ElArgentino(BasicNewsRecipe): class ElArgentino(BasicNewsRecipe):
title = 'ElArgentino.com' title = 'ElArgentino.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
language = _('Spanish') publisher = 'ElArgentino.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png' cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , 'ElArgentino.com' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
dict(name='div', attrs={'id':'noprint' }) dict(name='div', attrs={'id':'noprint' })
@ -50,7 +54,10 @@ class ElArgentino(BasicNewsRecipe):
return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id
def preprocess_html(self, soup): def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
soup.prettify() for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -6,41 +6,55 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
elmundo.es elmundo.es
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ElMundo(BasicNewsRecipe): class ElMundo(BasicNewsRecipe):
title = 'El Mundo' title = 'El Mundo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Spain' description = 'News from Spain'
language = _('Spanish') publisher = 'El Mundo'
category = 'news, politics, Spain'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso8859_15' encoding = 'iso8859_15'
cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif' cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Spain' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'noticia'})]
keep_only_tags = [
dict(name='div', attrs={'id':['bloqueprincipal','noticia']})
,dict(name='div', attrs={'class':['contenido_noticia_01']})
]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':['herramientas','publicidad_google','video','herramientasarriba','contenido_noticia_02']}) dict(name='div', attrs={'class':['herramientas','publicidad_google']})
,dict(name='div', attrs={'id':'modulo_multimedia' }) ,dict(name='div', attrs={'id':'modulo_multimedia' })
,dict(name=['object','script','link', 'a']) ,dict(name='ul', attrs={'class':'herramientas' })
,dict(name='ul', attrs={'class':'herramientas'}) ,dict(name=['object','link'])
] ]
feeds = [ feeds = [
(u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' ) (u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' )
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' ) ,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' )
,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' ) ,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' )
,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' ) ,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' )
,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' ) ,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' )
,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26') ,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26')
,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76')
] ]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -6,26 +6,29 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
estadao.com.br estadao.com.br
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Estadao(BasicNewsRecipe): class Estadao(BasicNewsRecipe):
title = 'O Estado de S. Paulo' title = 'O Estado de S. Paulo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil in Portugese'
language = _('Portugese') publisher = 'O Estado de S. Paulo'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8' encoding = 'utf8'
cover_url = 'http://www.estadao.com.br/img/logo_estadao.png' cover_url = 'http://www.estadao.com.br/img/logo_estadao.png'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'c1'})] keep_only_tags = [dict(name='div', attrs={'id':'c1'})]
@ -52,4 +55,8 @@ class Estadao(BasicNewsRecipe):
ifr = soup.find('iframe') ifr = soup.find('iframe')
if ifr: if ifr:
ifr.extract() ifr.extract()
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Portugese')

View File

@ -7,27 +7,30 @@ granma.cubaweb.cu
''' '''
import urllib import urllib
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Granma(BasicNewsRecipe): class Granma(BasicNewsRecipe):
title = 'Diario Granma' title = 'Diario Granma'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
language = _('Spanish')
description = 'Organo oficial del Comite Central del Partido Comunista de Cuba' description = 'Organo oficial del Comite Central del Partido Comunista de Cuba'
publisher = 'Granma'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg' cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='table', attrs={'height':'466'})] keep_only_tags = [dict(name='table', attrs={'height':'466'})]
@ -35,9 +38,15 @@ class Granma(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
del soup.body.table['style'] mtag = '<meta http-equiv="Content-Language" content="es-CU"/>'
rtag = soup.find('td', attrs={'height':'458'}) soup.head.insert(0,mtag)
if rtag: for item in soup.findAll('table'):
del rtag['style'] if item.has_key('width'):
del item['width']
if item.has_key('height'):
del item['height']
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -1,62 +1,80 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
harpers.org - paid subscription/ printed issue articles harpers.org - paid subscription/ printed issue articles
This recipe only get's article's published in text format This recipe only get's article's published in text format
images and pdf's are ignored images and pdf's are ignored
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Harpers_full(BasicNewsRecipe): class Harpers_full(BasicNewsRecipe):
title = u"Harper's Magazine - articles from printed edition" title = u"Harper's Magazine - articles from printed edition"
__author__ = u'Darko Miletic' __author__ = u'Darko Miletic'
description = u"Harper's Magazine: Founded June 1850." description = u"Harper's Magazine: Founded June 1850."
language = _('English') publisher = "Harpers's"
oldest_article = 30 category = 'news, politics, USA'
max_articles_per_feed = 100 oldest_article = 30
no_stylesheets = True max_articles_per_feed = 100
use_embedded_content = False no_stylesheets = True
simultaneous_downloads = 1 use_embedded_content = False
delay = 1 simultaneous_downloads = 1
needs_subscription = True delay = 1
INDEX = strftime('http://www.harpers.org/archive/%Y/%m') needs_subscription = True
LOGIN = 'http://www.harpers.org' INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') LOGIN = 'http://www.harpers.org'
cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] remove_javascript = True
remove_tags = [
dict(name='table', attrs={'class':'rcnt'}) html2lrf_options = [
,dict(name='table', attrs={'class':'rcnt topline'}) '--comment', description
] , '--category', category
, '--publisher', publisher
def get_browser(self): ]
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None: html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
br.open(self.LOGIN)
br.select_form(nr=1) keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
br['handle' ] = self.username remove_tags = [
br['password'] = self.password dict(name='table', attrs={'class':'rcnt'})
br.submit() ,dict(name='table', attrs={'class':'rcnt topline'})
return br ]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open(self.LOGIN)
br.select_form(nr=1)
br['handle' ] = self.username
br['password'] = self.password
br.submit()
return br
def parse_index(self):
articles = []
print 'Processing ' + self.INDEX
soup = self.index_to_soup(self.INDEX)
for item in soup.findAll('div', attrs={'class':'title'}):
text_link = item.parent.find('img',attrs={'alt':'Text'})
if text_link:
url = self.LOGIN + item.a['href']
title = item.a.contents[0]
date = strftime(' %B %Y')
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':''
})
return [(soup.head.title.string, articles)]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return soup
def parse_index(self): language = _('English')
articles = []
print 'Processing ' + self.INDEX
soup = self.index_to_soup(self.INDEX)
for item in soup.findAll('div', attrs={'class':'title'}):
text_link = item.parent.find('img',attrs={'alt':'Text'})
if text_link:
url = self.LOGIN + item.a['href']
title = item.a.contents[0]
date = strftime(' %B %Y')
articles.append({
'title' :title
,'date' :date
,'url' :url
,'description':''
})
return [(soup.head.title.string, articles)]

View File

@ -1,34 +1,36 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
infobae.com infobae.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Infobae(BasicNewsRecipe): class Infobae(BasicNewsRecipe):
title = 'Infobae.com' title = 'Infobae.com'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'Infobae.com'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
cover_url = 'http://www.infobae.com/imgs/header/header.gif' cover_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Argentina'
, '--publisher' , 'Infobae.com'
]
feeds = [ html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
,(u'Salud' , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml' ) ,(u'Salud' , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml' )
,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml') ,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml')
@ -37,5 +39,14 @@ class Infobae(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
main, sep, article_part = url.partition('contenidos/') main, sep, article_part = url.partition('contenidos/')
article_id, rsep, rrest = article_part.partition('-') article_id, rsep, rrest = article_part.partition('-')
return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta http-equiv="Content-Language" content="es-AR"/>\n'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup
language = _('Spanish')

View File

@ -6,25 +6,29 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
jbonline.terra.com.br jbonline.terra.com.br
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class JBOnline(BasicNewsRecipe): class JBOnline(BasicNewsRecipe):
title = 'Jornal Brasileiro Online' title = 'Jornal Brasileiro Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil'
publisher = 'Jornal Brasileiro'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
language = _('Portugese')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif' cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'corpoNoticia'})] keep_only_tags = [dict(name='div', attrs={'id':'corpoNoticia'})]
@ -36,7 +40,8 @@ class JBOnline(BasicNewsRecipe):
ifr = soup.find('iframe') ifr = soup.find('iframe')
if ifr: if ifr:
ifr.extract() ifr.extract()
item = soup.find('div', attrs={'id':'corpoNoticia'}) for item in soup.findAll(style=True):
if item: del item['style']
del item['style']
return soup return soup
language = _('Portugese')

View File

@ -6,28 +6,35 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
jutarnji.hr jutarnji.hr
''' '''
import string, re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class Jutarnji(BasicNewsRecipe): class Jutarnji(BasicNewsRecipe):
title = 'Jutarnji' title = u'Jutarnji'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'Online izdanje Jutarnjeg lista' description = u'Hrvatski portal'
publisher = 'Jutarnji.hr'
category = 'news, politics, Croatia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
simultaneous_downloads = 1 simultaneous_downloads = 1
delay = 1 delay = 1
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_javascript = True
encoding = 'cp1250' encoding = 'cp1250'
cover_url = 'http://www.jutarnji.hr/EPHResources/Images/2008/06/05/jhrlogo.png' extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Croatia' , '--publisher', publisher
, '--publisher', 'Europapress holding d.o.o.' ]
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [ remove_tags = [
@ -49,11 +56,16 @@ class Jutarnji(BasicNewsRecipe):
def print_version(self, url):
    """Map an article URL to its print page.

    The article id is whatever follows the last ``,`` in the part of the
    URL that precedes ``.jl``.

    :param url: article URL taken from the feed.
    :return: the ``ispis_clanka.jl`` URL for that id, or ``url`` unchanged
             when the URL carries no comma-separated id.
    """
    main = url.partition('.jl')[0]
    _, comma, article_id = main.rpartition(',')
    if not comma or not article_id:
        # Without a comma rpartition() hands back the whole URL as the
        # "id"; do not build a print URL from that.
        return url
    return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + article_id
def preprocess_html(self, soup):
    """Declare charset and language, then drop inline styles and widths.

    Both meta declarations are inserted at position 0 of the head, so the
    Content-Language entry ends up ahead of the Content-Type entry.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        soup.head.insert(0, mtag)
        mtag = '<meta http-equiv="Content-Language" content="hr"/>'
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    for item in soup.findAll(width=True):
        del item['width']
    return soup

View File

@ -7,26 +7,30 @@ juventudrebelde.cu
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Juventudrebelde(BasicNewsRecipe): class Juventudrebelde(BasicNewsRecipe):
title = 'Juventud Rebelde' title = 'Juventud Rebelde'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Diario de la Juventud Cubana' description = 'Diario de la Juventud Cubana'
publisher = 'Juventud rebelde'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg') cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg')
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'noticia'})] keep_only_tags = [dict(name='div', attrs={'id':'noticia'})]
@ -40,4 +44,11 @@ class Juventudrebelde(BasicNewsRecipe):
,(u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura' ) ,(u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura' )
] ]
def preprocess_html(self, soup):
    """Declare the page language (es-CU) and drop inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="es-CU"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Spanish')

View File

@ -5,30 +5,40 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
juventudrebelde.co.cu juventudrebelde.co.cu
''' '''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Juventudrebelde_english(BasicNewsRecipe): class Juventudrebelde_english(BasicNewsRecipe):
title = 'Juventud Rebelde in english' title = 'Juventud Rebelde in english'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'The newspaper of Cuban Youth' description = 'The newspaper of Cuban Youth'
language = _('English') publisher = 'Juventud Rebelde'
category = 'news, politics, Cuba'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'iso-8859-1' encoding = 'iso-8859-1'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Cuba' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'read'})] keep_only_tags = [dict(name='div', attrs={'class':'read'})]
feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/' )] feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/' )]
def preprocess_html(self, soup):
    """Declare the page language and drop inline styles.

    This recipe is the English edition (the class sets
    ``language = _('English')``), so the declared content language is
    ``en`` rather than the Spanish ``es-CU``.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="en"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('English')

View File

@ -6,30 +6,33 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
lacuarta.cl lacuarta.cl
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaCuarta(BasicNewsRecipe): class LaCuarta(BasicNewsRecipe):
title = 'La Cuarta' title = 'La Cuarta'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'La Cuarta Cibernetica: El Diario popular'
publisher = 'CODISA, Consorcio Digital S.A.'
category = 'news, politics, entertainment, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'articulo desplegado'}) ] keep_only_tags = [dict(name='div', attrs={'class':'articulo desplegado'}) ]
remove_tags = [ remove_tags = [
dict(name='script') dict(name='ul')
,dict(name='ul')
,dict(name='div', attrs={'id':['toolbox','articleImageDisplayer','enviarAmigo']}) ,dict(name='div', attrs={'id':['toolbox','articleImageDisplayer','enviarAmigo']})
,dict(name='div', attrs={'class':['par ad-1','par ad-2']}) ,dict(name='div', attrs={'class':['par ad-1','par ad-2']})
,dict(name='input') ,dict(name='input')
@ -37,7 +40,14 @@ class LaCuarta(BasicNewsRecipe):
,dict(name='strong', text='PUBLICIDAD') ,dict(name='strong', text='PUBLICIDAD')
] ]
def preprocess_html(self, soup):
    """Declare the page language (es-CL) and drop inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')] feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')]
language = _('Spanish')

View File

@ -6,26 +6,29 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
lasegunda.com lasegunda.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaSegunda(BasicNewsRecipe): class LaSegunda(BasicNewsRecipe):
title = 'La Segunda' title = 'La Segunda'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
language = _('Spanish') publisher = 'La Segunda'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif' cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
, '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='table')] keep_only_tags = [dict(name='table')]
@ -45,4 +48,14 @@ class LaSegunda(BasicNewsRecipe):
def print_version(self, url):
    """Map an article URL to its print page.

    The article id is everything after ``index.asp?idnoticia=``.

    :param url: article URL taken from the feed.
    :return: the ``_detalle_impresion.asp`` URL for that id, or ``url``
             unchanged when the marker (or the id) is missing.
    """
    article_id = url.partition('index.asp?idnoticia=')[2]
    if not article_id:
        # Unexpected URL shape: keep the regular page instead of asking
        # the print script for an empty idnoticia.
        return url
    return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id
def preprocess_html(self, soup):
    """Declare the page language (es-CL), drop table widths and inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(name='table', width=True):
        del item['width']
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Spanish')

View File

@ -6,26 +6,30 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
latercera.com latercera.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaTercera(BasicNewsRecipe): class LaTercera(BasicNewsRecipe):
title = 'La Tercera' title = 'La Tercera'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'La Tercera'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'span-16 articulo border'}) ] keep_only_tags = [dict(name='div', attrs={'class':['span-16 articulo border','span-16 border','span-16']}) ]
remove_tags = [ remove_tags = [
dict(name='script') dict(name='script')
@ -50,4 +54,11 @@ class LaTercera(BasicNewsRecipe):
,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657') ,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657')
] ]
def preprocess_html(self, soup):
    """Declare the page language (es-CL) and drop inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Spanish')

View File

@ -1,29 +1,32 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
lanacion.com.ar lanacion.com.ar
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Lanacion(BasicNewsRecipe): class Lanacion(BasicNewsRecipe):
title = 'La Nacion' title = 'La Nacion'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion actualizada las 24 horas, con noticias de Argentina y del mundo - Informate ya!' description = 'Noticias de Argentina y el resto del mundo'
publisher = 'La Nacion'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False use_embedded_content = False
remove_javascript = True
no_stylesheets = True
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Argentina' , '--publisher', publisher
, '--publisher', 'La Nacion SA' ]
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})] keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})]
remove_tags = [ remove_tags = [
@ -47,11 +50,11 @@ class Lanacion(BasicNewsRecipe):
,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' ) ,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' )
] ]
def get_cover_url(self): def preprocess_html(self, soup):
index = 'http://www.lanacion.com.ar' mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
cover_url = None soup.head.insert(0,mtag)
soup = self.index_to_soup(index) for item in soup.findAll(style=True):
cover_item = soup.find('img',attrs={'class':'logo'}) del item['style']
if cover_item: return soup
cover_url = index + cover_item['src']
return cover_url language = _('Spanish')

View File

@ -7,25 +7,29 @@ lanacion.cl
''' '''
import urllib import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaNacionChile(BasicNewsRecipe): class LaNacionChile(BasicNewsRecipe):
title = 'La Nacion Chile' title = 'La Nacion Chile'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'El sitio de noticias online de Chile' description = 'El sitio de noticias online de Chile'
publisher = 'La Nacion'
category = 'news, politics, Chile'
oldest_article = 2 oldest_article = 2
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif' cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Chile' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'class':'bloque'})] keep_only_tags = [dict(name='div', attrs={'class':'bloque'})]
@ -41,5 +45,10 @@ class LaNacionChile(BasicNewsRecipe):
item = soup.find('a', attrs={'href':'javascript:window.close()'}) item = soup.find('a', attrs={'href':'javascript:window.close()'})
if item: if item:
item.extract() item.extract()
mtag = '<meta http-equiv="Content-Language" content="es-CL"/>'
soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
return soup return soup
language = _('Spanish')

View File

@ -1,31 +1,35 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
laprensa.com.ar laprensa.com.ar
''' '''
import urllib import urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LaPrensa(BasicNewsRecipe): class LaPrensa(BasicNewsRecipe):
title = 'La Prensa' title = 'La Prensa'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Informacion Libre las 24 horas' description = 'Informacion Libre las 24 horas'
publisher = 'La Prensa'
category = 'news, politics, Argentina'
oldest_article = 7 oldest_article = 7
language = _('Spanish')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif' cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
feeds = [ feeds = [
(u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' ) (u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' )
@ -47,5 +51,10 @@ class LaPrensa(BasicNewsRecipe):
def preprocess_html(self, soup):
    """Remove the body ``onload`` handler, declare es-AR, drop inline styles.

    :param soup: parsed article page; must expose ``body``, ``head`` and
                 ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    # Pages lacking <body> or <head> would otherwise fail on None.
    if soup.body is not None:
        del soup.body['onload']
    mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Spanish')

View File

@ -7,15 +7,17 @@ nin.co.yu
''' '''
import re, urllib import re, urllib
from calibre.web.feeds.news import BasicNewsRecipe
class Nin(BasicNewsRecipe): from calibre.web.feeds.news import BasicNewsRecipe
class Nin(BasicNewsRecipe):
title = 'NIN online' title = 'NIN online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Nedeljne informativne novine' description = 'Nedeljne informativne novine'
publisher = 'NIN'
category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
oldest_article = 15 oldest_article = 15
language = _('Serbian')
simultaneous_downloads = 1 simultaneous_downloads = 1
delay = 1 delay = 1
encoding = 'utf8' encoding = 'utf8'
@ -23,11 +25,17 @@ class Nin(BasicNewsRecipe):
PREFIX = 'http://www.nin.co.yu' PREFIX = 'http://www.nin.co.yu'
INDEX = PREFIX + '/?change_lang=ls' INDEX = PREFIX + '/?change_lang=ls'
LOGIN = PREFIX + '/?logout=true' LOGIN = PREFIX + '/?logout=true'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, politics, Serbia' , '--category', category
, '--publisher' , 'NIN' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -54,3 +62,12 @@ class Nin(BasicNewsRecipe):
if link_item: if link_item:
cover_url = self.PREFIX + link_item['src'] cover_url = self.PREFIX + link_item['src']
return cover_url return cover_url
def preprocess_html(self, soup):
    """Declare the page language (sr-Latn-RS) and drop inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Serbian')

View File

@ -5,32 +5,45 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
novosti.rs novosti.rs
''' '''
import string,re
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Novosti(BasicNewsRecipe): class Novosti(BasicNewsRecipe):
title = 'Vecernje Novosti' title = u'Vecernje Novosti'
__author__ = 'Darko Miletic' __author__ = u'Darko Miletic'
description = 'novosti, vesti, politika, dosije, drustvo, ekonomija, hronika, reportaze, svet, kultura, sport, beograd, regioni, mozaik, feljton, intrvju, pjer, fudbal, kosarka, podvig, arhiva, komentari, kolumne, srbija, republika srpska,Vecernje novosti' description = u'Vesti'
publisher = 'Kompanija Novosti'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'utf8'
remove_javascript = True
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'Novosti AD'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'jednaVest'}) ] keep_only_tags = [dict(name='div', attrs={'class':'jednaVest'})]
remove_tags_after = dict(name='div', attrs={'class':'info_bottom'}) remove_tags = [dict(name='div', attrs={'class':['info','info_bottom','clip_div']})]
remove_tags = [
dict(name='div', attrs={'class':'info'})
,dict(name='div', attrs={'class':'info_bottom'})
]
feeds = [ (u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')] feeds = [(u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')]
def preprocess_html(self, soup):
    """Declare the page language (sr-Latn) and drop inline styles.

    :param soup: parsed article page; must expose ``head`` and ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
    # A page without <head> would otherwise fail on None.insert().
    if soup.head is not None:
        soup.head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Serbian')

View File

@ -6,35 +6,55 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
nspm.rs nspm.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Nspm(BasicNewsRecipe): class Nspm(BasicNewsRecipe):
title = u'Nova srpska politicka misao' title = u'Nova srpska politicka misao'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Casopis za politicku teoriju i drustvena istrazivanja' description = 'Casopis za politicku teoriju i drustvena istrazivanja'
publisher = 'NSPM'
category = 'news, politics, Serbia'
oldest_article = 7 oldest_article = 7
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
INDEX = 'http://www.nspm.rs/?alphabet=l' INDEX = 'http://www.nspm.rs/?alphabet=l'
cover_url = 'http://nspm.rs/templates/jsn_epic_pro/images/logol.jpg' encoding = 'utf8'
remove_javascript = True
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, politics, Serbia' , '--publisher', publisher
, '--publisher', 'IIC NSPM' , '--ignore-tables'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags = [dict(name='a')]
def get_browser(self):
    """Return a browser that has already requested ``self.INDEX``.

    :return: the object produced by ``BasicNewsRecipe.get_browser()``.
    """
    br = BasicNewsRecipe.get_browser()
    # NOTE(review): INDEX carries '?alphabet=l'; presumably this first
    # request selects the Latin-script variant for the session - confirm.
    br.open(self.INDEX)
    return br
feeds = [ (u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')] feeds = [(u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')]
def print_version(self, url):
    """Map an article URL to its print page.

    :param url: article URL taken from the feed.
    :return: ``url`` with every ``.html`` replaced by ``/stampa.html``.
    """
    return url.replace('.html', '/stampa.html')
def preprocess_html(self, soup):
    """Mark the page as sr-Latn-RS and drop inline styles.

    Sets ``xml:lang`` and ``lang`` on the root element, rewrites an
    existing Content-Language meta tag if one is found, and removes the
    ``style`` attribute from every item returned by
    ``soup.findAll(style=True)``.

    :param soup: parsed article page; must expose ``html``, ``find`` and
                 ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    lang = 'sr-Latn-RS'
    # A fragment without an <html> element would otherwise fail on None.
    if soup.html is not None:
        soup.html['xml:lang'] = lang
        soup.html['lang'] = lang
    ftag = soup.find('meta', attrs={'http-equiv': 'Content-Language'})
    # Test identity, not truthiness: the result of find() is either a tag
    # or None, and an empty-looking tag must still be updated.
    if ftag is not None:
        ftag['content'] = lang
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Serbian')

View File

@ -6,25 +6,29 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
oglobo.globo.com oglobo.globo.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class OGlobo(BasicNewsRecipe): class OGlobo(BasicNewsRecipe):
title = 'O Globo' title = 'O Globo'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Brasil' description = 'News from Brasil'
publisher = 'O Globo'
category = 'news, politics, Brasil'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
language = _('Portugese')
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = 'http://oglobo.globo.com/_img/o-globo.png' cover_url = 'http://oglobo.globo.com/_img/o-globo.png'
remove_javascript = True
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Brasil' , '--category', category
, '--publisher' , title , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
keep_only_tags = [dict(name='div', attrs={'id':'ltintb'})] keep_only_tags = [dict(name='div', attrs={'id':'ltintb'})]
@ -56,3 +60,10 @@ class OGlobo(BasicNewsRecipe):
,(u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml') ,(u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml')
,(u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml') ,(u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml')
] ]
def preprocess_html(self, soup):
    """Drop the ``style`` attribute from every item that carries one.

    :param soup: parsed article page; must expose ``findAll``.
    :return: the same ``soup`` object, modified in place.
    """
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Portugese')

View File

@ -1,31 +1,36 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
pagina12.com.ar pagina12.com.ar
''' '''
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe): class Pagina12(BasicNewsRecipe):
title = u'Pagina/12' title = u'Pagina/12'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Noticias de Argentina y el resto del mundo' description = 'Noticias de Argentina y el resto del mundo'
language = _('Spanish') publisher = 'La Pagina S.A.'
category = 'news, politics, Argentina'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252' encoding = 'cp1252'
cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg') cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg')
remove_javascript = True
use_embedded_content = False
html2lrf_options = [ html2lrf_options = [
'--comment' , description '--comment', description
, '--category' , 'news, Argentina' , '--category', category
, '--publisher' , 'La Pagina S.A.' , '--publisher', publisher
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
remove_tags = [ remove_tags = [
@ -38,3 +43,12 @@ class Pagina12(BasicNewsRecipe):
def print_version(self, url):
    '''
    Return the print-version URL for an article URL.

    Every occurrence of the site root ``http://www.pagina12.com.ar/`` in
    ``url`` is replaced by the same root followed by ``imprimir/``. A URL
    that does not contain the site root is returned unchanged.
    '''
    return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/')
def preprocess_html(self, soup):
    '''
    Tag the page as Argentine Spanish and drop all inline styles.

    A ``Content-Language: es-AR`` meta element (given as a markup string)
    is inserted at the start of the document head, then the ``style``
    attribute is deleted from every element that carries one.

    soup -- parsed article page providing ``head`` and ``findAll``.

    Returns the same ``soup`` object, modified in place.
    '''
    mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
    head = soup.head
    # A page without a <head> exposes None here; skip the insert rather
    # than fail with AttributeError and lose the whole article.
    if head is not None:
        head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Spanish')

View File

@ -6,31 +6,53 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
pescanik.net pescanik.net
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Pescanik(BasicNewsRecipe): class Pescanik(BasicNewsRecipe):
title = 'Pescanik' title = 'Pescanik'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Pescanik' description = 'Pescanik'
publisher = 'Pescanik'
category = 'news, politics, Serbia'
oldest_article = 7 oldest_article = 7
language = _('Serbian')
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
html2lrf_options = ['--base-font-size', '10'] remove_javascript = True
html2epub_options = 'base_font_size = "10pt"' encoding = 'utf8'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png" cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png"
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags_after = dict(name='div', attrs={'class':'article_seperator'}) remove_tags = [
dict(name='td' , attrs={'class':'buttonheading'})
remove_tags = [dict(name='td' , attrs={'class':'buttonheading'})] ,dict(name='span', attrs={'class':'article_seperator'})
,dict(name=['object','link'])
]
feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')] feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')]
def print_version(self, url):
    '''
    Return the print-version URL for an article URL.

    Every ``/index.php`` in ``url`` becomes ``/index2.php`` (whatever the
    host), and the query suffix ``&pop=1&page=0`` is appended to the
    result. The suffix is appended even when nothing was replaced.
    '''
    nurl = url.replace('/index.php','/index2.php')
    return nurl + '&pop=1&page=0'
def preprocess_html(self, soup):
    '''
    Tag the page as Serbian (Latin script) and drop all inline styles.

    A ``Content-Language: sr-Latn-RS`` meta element (given as a markup
    string) is inserted at the start of the document head, then the
    ``style`` attribute is deleted from every element that carries one.

    soup -- parsed article page providing ``head`` and ``findAll``.

    Returns the same ``soup`` object, modified in place.
    '''
    mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>'
    head = soup.head
    # A page without a <head> exposes None here; skip the insert rather
    # than fail with AttributeError and lose the whole article.
    if head is not None:
        head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    return soup
language = _('Serbian')

View File

@ -5,37 +5,61 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
''' '''
politika.rs politika.rs
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Politika(BasicNewsRecipe): class Politika(BasicNewsRecipe):
title = 'Politika Online' title = u'Politika Online'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'Najstariji dnevni list na Balkanu' description = 'Najstariji dnevni list na Balkanu'
publisher = 'Politika novine i Magazini d.o.o'
category = 'news, politics, Serbia'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
extra_css = '.content_center_border {text-align: left;}'
use_embedded_content = False use_embedded_content = False
cover_url = 'http://www.politika.rs:8080/images/politika.gif' remove_javascript = True
encoding = 'utf8'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, Serbia' , '--publisher', publisher
, '--publisher', 'POLITIKA NOVINE I MAGAZINI d.o.o.'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [ dict(name='div', attrs={'class':'contentcenter'}) ] keep_only_tags = [dict(name='div', attrs={'class':'content_center_border'})]
remove_tags_after = dict(name='div', attrs={'class':'datum_item_details'})
remove_tags = [
dict(name='div', attrs={'class':['send_print','txt-komentar']})
,dict(name=['object','link','a'])
,dict(name='h1', attrs={'class':'box_header-tags'})
]
feeds = [ feeds = [
(u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' ) (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' )
,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' ) ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' )
,(u'Redakcijski komentari', u'http://www.politika.rs/rubrike/redakcijski-komentari/index.1.lt.xml')
,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' ) ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' )
,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' ) ,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' )
,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' ) ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' )
,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' ) ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' )
,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' ) ,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' )
] ]
def preprocess_html(self, soup):
    '''
    Tag the page as Serbian (Latin script), drop all inline styles and
    left-align the main content container.

    Steps, in order:
      1. insert a ``Content-Language: sr-Latn-RS`` meta element (given as
         a markup string) at the start of the document head;
      2. delete the ``style`` attribute from every element that has one;
      3. set ``align="left"`` on the first ``div`` whose class is
         ``content_center_border``, when such a div exists.

    soup -- parsed article page providing ``head``, ``findAll`` and ``find``.

    Returns the same ``soup`` object, modified in place.
    '''
    mtag = '<meta http-equiv="Content-Language" content="sr-Latn-RS"/>'
    head = soup.head
    # A page without a <head> exposes None here; skip the insert rather
    # than fail with AttributeError and lose the whole article.
    if head is not None:
        head.insert(0, mtag)
    for item in soup.findAll(style=True):
        del item['style']
    ftag = soup.find('div',attrs={'class':'content_center_border'})
    if ftag is not None:
        ftag['align'] = 'left'
    return soup

View File

@ -1,38 +1,49 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
''' '''
vijesti.cg.yu vijesti.cg.yu
''' '''
import string,re import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class Vijesti(BasicNewsRecipe): class Vijesti(BasicNewsRecipe):
title = 'Vijesti' title = 'Vijesti'
__author__ = 'Darko Miletic' __author__ = 'Darko Miletic'
description = 'News from Montenegro' description = 'News from Montenegro'
oldest_article = 2 publisher = 'Daily Press Vijesti'
language = _('Serbian') category = 'news, politics, Montenegro'
oldest_article = 1
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
use_embedded_content = False remove_javascript = True
encoding = 'cp1250' encoding = 'cp1250'
cover_url = 'http://www.vijesti.cg.yu/img/logo.gif' cover_url = 'http://www.vijesti.cg.yu/img/logo.gif'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
html2lrf_options = [
'--comment' , description
, '--category' , 'news, Montenegro'
, '--publisher' , 'Daily Press Vijesti'
]
keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})] keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})]
remove_tags = [
dict(name='div', attrs={'align':'right'})
,dict(name=['object','link'])
]
feeds = [(u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php' )] feeds = [(u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php' )]
def preprocess_html(self, soup): def preprocess_html(self, soup):
@ -40,4 +51,10 @@ class Vijesti(BasicNewsRecipe):
soup.html['lang'] = 'sr-Latn-ME' soup.html['lang'] = 'sr-Latn-ME'
mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>' mtag = '<meta http-equiv="Content-Language" content="sr-Latn-ME"/>'
soup.head.insert(0,mtag) soup.head.insert(0,mtag)
for item in soup.findAll('img'):
if item.has_key('align'):
del item['align']
item.insert(0,'<br /><br />')
return soup return soup
language = _('Serbian')

View File

@ -6,27 +6,34 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
vreme.com vreme.com
''' '''
import string,re import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
class Vreme(BasicNewsRecipe): from calibre.web.feeds.news import BasicNewsRecipe
title = 'Vreme' class Vreme(BasicNewsRecipe):
__author__ = 'Darko Miletic' title = 'Vreme'
description = 'Politicki Nedeljnik Srbije' __author__ = 'Darko Miletic'
description = 'Politicki Nedeljnik Srbije'
publisher = 'Vreme d.o.o.'
category = 'news, politics, Serbia'
no_stylesheets = True no_stylesheets = True
language = _('Serbian') remove_javascript = True
needs_subscription = True needs_subscription = True
INDEX = 'http://www.vreme.com' INDEX = 'http://www.vreme.com'
LOGIN = 'http://www.vreme.com/account/index.php' LOGIN = 'http://www.vreme.com/account/index.php'
remove_javascript = True
use_embedded_content = False
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}'
html2lrf_options = [ html2lrf_options = [
'--comment', description '--comment', description
, '--base-font-size', '10' , '--category', category
, '--category', 'news, politics, Serbia' , '--publisher', publisher
, '--publisher', 'Vreme d.o.o.'
] ]
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
def get_browser(self): def get_browser(self):
@ -67,10 +74,29 @@ class Vreme(BasicNewsRecipe):
,'description':description ,'description':description
}) })
return [(soup.head.title.string, articles)] return [(soup.head.title.string, articles)]
remove_tags = [
dict(name=['object','link'])
,dict(name='table',attrs={'xclass':'image'})
]
def print_version(self, url):
    '''
    Return the print-version URL for an article URL.

    The literal suffix ``&print=yes`` is appended to ``url`` unchanged.
    '''
    # NOTE(review): the '&' separator assumes url already has a query
    # string -- confirm against the links the index parser produces.
    return url + '&print=yes'
def preprocess_html(self, soup):
    '''
    Clean up the page body and hoist the first table cell to its top.

    Steps:
      * insert a ``Content-Language: sr-Latn`` meta element (given as a
        markup string) at the start of the document head;
      * delete the ``text``, ``bgcolor`` and ``onload`` attributes from
        the body element;
      * when the document contains a ``td``, detach that first cell,
        detach the body's first table, and re-insert the cell as the
        first child of the body.

    soup -- parsed article page providing ``head``, ``body`` and ``find``.

    Returns the same ``soup`` object, modified in place. A missing head,
    body or body table is skipped instead of raising on ``None``.
    '''
    mtag = '<meta http-equiv="Content-Language" content="sr-Latn"/>'
    head = soup.head
    if head is not None:
        head.insert(0, mtag)

    body = soup.body
    if body is None:
        # Nothing to clean or restructure without a body element.
        return soup

    for attr in ('text', 'bgcolor', 'onload'):
        del body[attr]

    table = body.table
    first_cell = soup.find('td')
    # NOTE(review): indentation was lost in the reviewed text; the
    # extract/extract/insert sequence is assumed to be the whole body of
    # the original `if tbbb:` -- confirm against the repository.
    if first_cell is not None:
        first_cell.extract()
        # The cell is searched document-wide while the table is looked up
        # under body only, so the table may be absent even if a cell exists.
        if table is not None:
            table.extract()
        body.insert(0, first_cell)
    return soup
def get_cover_url(self): def get_cover_url(self):
cover_url = None cover_url = None
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
@ -78,3 +104,5 @@ class Vreme(BasicNewsRecipe):
if cover_item: if cover_item:
cover_url = self.INDEX + cover_item['src'] cover_url = self.INDEX + cover_item['src']
return cover_url return cover_url
language = _('Serbian')