diff --git a/src/calibre/web/feeds/recipes/recipe_ambito.py b/src/calibre/web/feeds/recipes/recipe_ambito.py index 17d33a1cde..e6e3c224e9 100644 --- a/src/calibre/web/feeds/recipes/recipe_ambito.py +++ b/src/calibre/web/feeds/recipes/recipe_ambito.py @@ -1,31 +1,38 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' ambito.com ''' - -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Ambito(BasicNewsRecipe): title = 'Ambito.com' __author__ = 'Darko Miletic' - description = 'Informacion Libre las 24 horas' + description = 'Informacion Libre las 24 horas' + publisher = 'Ambito.com' + category = 'news, politics, Argentina' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True - use_embedded_content = False - encoding = 'iso--8859-1' - language = _('Spanish') + encoding = 'iso-8859-1' cover_url = 'http://www.ambito.com/img/logo_.jpg' - + remove_javascript = True + use_embedded_content = False + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + keep_only_tags = [dict(name='div', attrs={'align':'justify'})] + + remove_tags = [dict(name=['object','link'])] feeds = [ (u'Principales Noticias', u'http://www.ambito.com/rss/noticiasp.asp' ) @@ -43,3 +50,12 @@ class Ambito(BasicNewsRecipe): def print_version(self, url): return url.replace('http://www.ambito.com/noticia.asp?','http://www.ambito.com/noticias/imprimir.asp?') + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_b92.py b/src/calibre/web/feeds/recipes/recipe_b92.py index c20bfab8dc..611647620b 100644 --- a/src/calibre/web/feeds/recipes/recipe_b92.py +++ b/src/calibre/web/feeds/recipes/recipe_b92.py @@ -7,25 +7,33 @@ b92.net ''' import re -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe + class B92(BasicNewsRecipe): - title = u'B92' + title = 'B92' __author__ = 'Darko Miletic' - language = _('Serbian') description = 'Dnevne vesti iz Srbije i sveta' - oldest_article = 7 + oldest_article = 2 + publisher = 'B92.net' + category = 'news, politics, Serbia' max_articles_per_feed = 100 + remove_javascript = True no_stylesheets = True use_embedded_content = False cover_url = 'http://static.b92.net/images/fp/logo.gif' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + keep_only_tags = [ dict(name='div', attrs={'class':'sama_vest'}) ] + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Serbia' - , '--publisher', 'B92' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] feeds = [ (u'Vesti', u'http://www.b92.net/info/rss/vesti.xml') @@ -44,3 +52,16 @@ class B92(BasicNewsRecipe): if biz: nurl = 'http://www.b92.net/mobilni/biz/index.php?nav_id=' + article_id return nurl + + def preprocess_html(self, soup): + soup.html['xml:lang'] = 'sr-Latn' + soup.html['lang'] = 'sr-Latn' + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(name='img',align=True): + del item['align'] + item.insert(0,'

') + return soup + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_blic.py b/src/calibre/web/feeds/recipes/recipe_blic.py index 33f98f562e..ae75394fec 100644 --- a/src/calibre/web/feeds/recipes/recipe_blic.py +++ b/src/calibre/web/feeds/recipes/recipe_blic.py @@ -5,32 +5,49 @@ __copyright__ = '2008, Darko Miletic ' ''' blic.rs ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re + +from calibre.web.feeds.news import BasicNewsRecipe + class Blic(BasicNewsRecipe): title = u'Blic' - __author__ = 'Darko Miletic' - description = 'Blic.rs online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' - oldest_article = 7 - language = _('Serbian') + __author__ = u'Darko Miletic' + description = u'Blic.co.yu online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja' + publisher = 'RINGIER d.o.o.' + category = 'news, politics, Serbia' + oldest_article = 2 max_articles_per_feed = 100 + remove_javascript = True no_stylesheets = True use_embedded_content = False - cover_url = 'http://www.blic.rs/resources/images/header_back_tile.png' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Serbia' - , '--publisher', 'Blic' + '--comment', description + , '--category', category + , '--publisher', publisher ] - + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [ dict(name='div', attrs={'class':'single_news'}) ] + keep_only_tags = [dict(name='div', attrs={'class':'single_news'})] - feeds = [ (u'Vesti', u'http://www.blic.rs/rssall.php')] + feeds = [(u'Vesti', u'http://www.blic.rs/rssall.php')] + remove_tags = [dict(name=['object','link'])] + def print_version(self, url): start_url, question, rest_url = url.partition('?') return u'http://www.blic.rs/_print.php?' + rest_url + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_clarin.py b/src/calibre/web/feeds/recipes/recipe_clarin.py index cd72163c88..146719cc8b 100644 --- a/src/calibre/web/feeds/recipes/recipe_clarin.py +++ b/src/calibre/web/feeds/recipes/recipe_clarin.py @@ -1,31 +1,35 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' clarin.com ''' from calibre import strftime -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe + class Clarin(BasicNewsRecipe): title = 'Clarin' __author__ = 'Darko Miletic' description = 'Noticias de Argentina y mundo' + publisher = 'Grupo Clarin' + category = 'news, politics, Argentina' oldest_article = 2 max_articles_per_feed = 100 - language = _('Spanish') use_embedded_content = False no_stylesheets = True cover_url = strftime('http://www.clarin.com/diario/%Y/%m/%d/portada.jpg') - + remove_javascript = True + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Argentina' - , '--publisher', 'Grupo Clarin' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' remove_tags = [ dict(name='a' , attrs={'class':'Imp' }) @@ -49,3 +53,12 @@ class Clarin(BasicNewsRecipe): rest = artl.partition('-0')[-1] lmain = rest.partition('.')[0] return 'http://www.servicios.clarin.com/notas/jsp/clarin/v9/notas/imprimir.jsp?pagid=' + lmain + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_danas.py b/src/calibre/web/feeds/recipes/recipe_danas.py index 59e99fc746..f9c05e7b20 100644 --- a/src/calibre/web/feeds/recipes/recipe_danas.py +++ b/src/calibre/web/feeds/recipes/recipe_danas.py @@ -5,38 +5,47 @@ __copyright__ = '2008, Darko Miletic ' ''' danas.rs ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Danas(BasicNewsRecipe): - title = 'Danas' + title = u'Danas' __author__ = 'Darko Miletic' - description = 'Dnevne novine sa vestima iz sveta, politike, ekonomije, kulture, sporta, Beograda, Novog Sada i cele Srbije.' + description = 'Vesti' + publisher = 'Danas d.o.o.' + category = 'news, politics, Serbia' oldest_article = 2 - language = _('Serbian') max_articles_per_feed = 100 - no_stylesheets = True + no_stylesheets = False + remove_javascript = True use_embedded_content = False - cover_url = 'http://www.danas.rs/images/basic/danas.gif' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Serbia' - , '--publisher', 'Danas' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [ dict(name='div', attrs={'id':'left'}) ] + keep_only_tags = [dict(name='div', attrs={'id':'left'})] remove_tags = [ - dict(name='div', attrs={'class':'width_1_4' }) - ,dict(name='div', attrs={'class':'metaClanka' }) - ,dict(name='div', attrs={'id':'comments' }) - ,dict(name='div', attrs={'class':'baner' }) - ,dict(name='div', attrs={'class':'slikaClanka'}) + dict(name='div', attrs={'class':['width_1_4','metaClanka','baner']}) + ,dict(name='div', attrs={'id':'comments'}) + ,dict(name=['object','link']) ] - feeds = [(u'Vesti', u'http://www.danas.rs/rss/rss.asp')] + feeds = [ (u'Vesti', u'http://www.danas.rs/rss/rss.asp')] - def print_version(self, url): - return url + '&action=print' + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_el_mercurio_chile.py b/src/calibre/web/feeds/recipes/recipe_el_mercurio_chile.py index 0b7d994b34..fb11d54072 100644 --- a/src/calibre/web/feeds/recipes/recipe_el_mercurio_chile.py +++ b/src/calibre/web/feeds/recipes/recipe_el_mercurio_chile.py @@ -5,31 +5,36 @@ __copyright__ = '2009, Darko Miletic ' ''' emol.com ''' -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe + class ElMercurio(BasicNewsRecipe): title = 'El Mercurio online' - language = _('Spanish') __author__ = 'Darko Miletic' - description = 'El sitio de noticias online de Chile' + description = 'El sitio de noticias online de Chile' + publisher = 'El Mercurio' + category = 'news, politics, Chile' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://www.emol.com/especiales/logo_emol/logo_emol.gif' - + remove_javascript = True + use_embedded_content = False + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Chile' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [ dict(name='div', attrs={'class':'despliegue-txt_750px'}) ,dict(name='div', attrs={'id':'div_cuerpo_participa'}) ] - remove_tags = [ dict(name='div', attrs={'class':'contenedor_despliegue-col-left300'}) @@ -45,4 +50,12 @@ class ElMercurio(BasicNewsRecipe): ,(u'Tecnologia', u'http://www.emol.com/rss20/rss.asp?canal=5') ,(u'La Musica', u'http://www.emol.com/rss20/rss.asp?canal=7') ] + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_elargentino.py b/src/calibre/web/feeds/recipes/recipe_elargentino.py index ef79c96baa..1801c81b81 100644 --- a/src/calibre/web/feeds/recipes/recipe_elargentino.py +++ b/src/calibre/web/feeds/recipes/recipe_elargentino.py @@ -1,30 +1,34 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' elargentino.com ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class ElArgentino(BasicNewsRecipe): title = 'ElArgentino.com' __author__ = 'Darko Miletic' - description = 'Informacion Libre las 24 horas' - language = _('Spanish') + description = 'Informacion Libre las 24 horas' + publisher = 'ElArgentino.com' + category = 'news, politics, Argentina' oldest_article = 2 max_articles_per_feed = 100 + remove_javascript = True no_stylesheets = True use_embedded_content = False encoding = 'utf8' cover_url = 'http://www.elargentino.com/TemplateWeb/MediosFooter/tapa_elargentino.png' html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , 'ElArgentino.com' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' remove_tags = [ dict(name='div', attrs={'id':'noprint' }) @@ -50,7 +54,10 @@ class ElArgentino(BasicNewsRecipe): return u'http://www.elargentino.com/Impresion.aspx?Id=' + article_id def preprocess_html(self, soup): - mtag = '' + mtag = '\n\n' soup.head.insert(0,mtag) - soup.prettify() + for item in soup.findAll(style=True): + del item['style'] return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_elmundo.py b/src/calibre/web/feeds/recipes/recipe_elmundo.py index 9fd6eefb35..3ecedb5822 100644 --- a/src/calibre/web/feeds/recipes/recipe_elmundo.py +++ b/src/calibre/web/feeds/recipes/recipe_elmundo.py @@ -6,41 +6,55 @@ __copyright__ = '2009, Darko Miletic ' elmundo.es ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class ElMundo(BasicNewsRecipe): title = 'El Mundo' __author__ = 'Darko Miletic' description = 'News from Spain' - language = _('Spanish') + publisher = 'El Mundo' + category = 'news, politics, Spain' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'iso8859_15' cover_url = 'http://estaticos02.cache.el-mundo.net/papel/imagenes/v2.0/logoverde.gif' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Spain' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - keep_only_tags = [dict(name='div', attrs={'class':'noticia'})] - + + keep_only_tags = [ + dict(name='div', attrs={'id':['bloqueprincipal','noticia']}) + ,dict(name='div', attrs={'class':['contenido_noticia_01']}) + ] remove_tags = [ - dict(name='div', attrs={'class':['herramientas','publicidad_google','video','herramientasarriba','contenido_noticia_02']}) + dict(name='div', attrs={'class':['herramientas','publicidad_google']}) ,dict(name='div', attrs={'id':'modulo_multimedia' }) - ,dict(name=['object','script','link', 'a']) - ,dict(name='ul', attrs={'class':'herramientas'}) + ,dict(name='ul', attrs={'class':'herramientas' }) + ,dict(name=['object','link']) ] feeds = [ (u'Portada' , u'http://rss.elmundo.es/rss/descarga.htm?data2=4' ) - ,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76') ,(u'Espana' , u'http://rss.elmundo.es/rss/descarga.htm?data2=8' ) ,(u'Internacional' , u'http://rss.elmundo.es/rss/descarga.htm?data2=9' ) ,(u'Cultura' , u'http://rss.elmundo.es/rss/descarga.htm?data2=6' ) ,(u'Ciencia/Ecologia', u'http://rss.elmundo.es/rss/descarga.htm?data2=5' ) ,(u'Comunicacion' , u'http://rss.elmundo.es/rss/descarga.htm?data2=26') + ,(u'Television' , u'http://rss.elmundo.es/rss/descarga.htm?data2=76') ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_estadao.py b/src/calibre/web/feeds/recipes/recipe_estadao.py index f44e62e0ca..4b23fdb235 100644 --- a/src/calibre/web/feeds/recipes/recipe_estadao.py +++ b/src/calibre/web/feeds/recipes/recipe_estadao.py @@ -6,26 +6,29 @@ __copyright__ = '2009, Darko Miletic ' estadao.com.br ''' -from calibre.web.feeds.news import BasicNewsRecipe - - +from calibre.web.feeds.news import BasicNewsRecipe + class Estadao(BasicNewsRecipe): title = 'O Estado de S. Paulo' __author__ = 'Darko Miletic' - description = 'News from Brasil' - language = _('Portugese') + description = 'News from Brasil in Portugese' + publisher = 'O Estado de S. Paulo' + category = 'news, politics, Brasil' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'utf8' cover_url = 'http://www.estadao.com.br/img/logo_estadao.png' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Brasil' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'id':'c1'})] @@ -52,4 +55,8 @@ class Estadao(BasicNewsRecipe): ifr = soup.find('iframe') if ifr: ifr.extract() + for item in soup.findAll(style=True): + del item['style'] return soup + + language = _('Portugese') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_granma.py b/src/calibre/web/feeds/recipes/recipe_granma.py index 43cbd32ae1..66ebba1d64 100644 --- a/src/calibre/web/feeds/recipes/recipe_granma.py +++ b/src/calibre/web/feeds/recipes/recipe_granma.py @@ -7,27 +7,30 @@ granma.cubaweb.cu ''' import urllib - -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Granma(BasicNewsRecipe): title = 'Diario Granma' __author__ = 'Darko Miletic' - language = _('Spanish') description = 'Organo oficial del Comite Central del Partido Comunista de Cuba' + publisher = 'Granma' + category = 'news, politics, Cuba' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://www.granma.cubaweb.cu/imagenes/granweb229d.jpg' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Cuba' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher , '--ignore-tables' ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='table', attrs={'height':'466'})] @@ -35,9 +38,15 @@ class Granma(BasicNewsRecipe): def preprocess_html(self, soup): - del soup.body.table['style'] - rtag = soup.find('td', attrs={'height':'458'}) - if rtag: - del rtag['style'] + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll('table'): + if item.has_key('width'): + del item['width'] + if item.has_key('height'): + del item['height'] + for item in soup.findAll(style=True): + del item['style'] return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_harpers_full.py b/src/calibre/web/feeds/recipes/recipe_harpers_full.py index adf0bf82a3..72e633bde0 100644 --- a/src/calibre/web/feeds/recipes/recipe_harpers_full.py +++ b/src/calibre/web/feeds/recipes/recipe_harpers_full.py @@ -1,62 +1,80 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' -''' -harpers.org - paid subscription/ printed issue articles -This recipe only get's article's published in text format -images and pdf's are ignored -''' - -from calibre import strftime +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2008-2009, Darko Miletic ' +''' +harpers.org - paid subscription/ printed issue articles +This recipe only get's article's published in text format +images and pdf's are ignored +''' + +from calibre import strftime + from calibre.web.feeds.news import BasicNewsRecipe -class Harpers_full(BasicNewsRecipe): - title = u"Harper's Magazine - articles from printed edition" - __author__ = u'Darko Miletic' - description = u"Harper's Magazine: Founded June 1850." - language = _('English') - oldest_article = 30 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - simultaneous_downloads = 1 - delay = 1 - needs_subscription = True - INDEX = strftime('http://www.harpers.org/archive/%Y/%m') - LOGIN = 'http://www.harpers.org' - cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') - - keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] - remove_tags = [ - dict(name='table', attrs={'class':'rcnt'}) - ,dict(name='table', attrs={'class':'rcnt topline'}) - ] - - def get_browser(self): - br = BasicNewsRecipe.get_browser() - if self.username is not None and self.password is not None: - br.open(self.LOGIN) - br.select_form(nr=1) - br['handle' ] = self.username - br['password'] = self.password - br.submit() - return br +class Harpers_full(BasicNewsRecipe): + title = u"Harper's Magazine - articles from printed edition" + __author__ = u'Darko Miletic' + description = u"Harper's Magazine: Founded June 1850." + publisher = "Harpers's" + category = 'news, politics, USA' + oldest_article = 30 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + simultaneous_downloads = 1 + delay = 1 + needs_subscription = True + INDEX = strftime('http://www.harpers.org/archive/%Y/%m') + LOGIN = 'http://www.harpers.org' + cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif') + remove_javascript = True + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ] + remove_tags = [ + dict(name='table', attrs={'class':'rcnt'}) + ,dict(name='table', attrs={'class':'rcnt topline'}) + ] + + def get_browser(self): + br = BasicNewsRecipe.get_browser() + if self.username is not None and self.password is not None: + br.open(self.LOGIN) + br.select_form(nr=1) + br['handle' ] = self.username + br['password'] = self.password + br.submit() + return br + + def parse_index(self): + articles = [] + print 'Processing ' + self.INDEX + soup = self.index_to_soup(self.INDEX) + for item in soup.findAll('div', attrs={'class':'title'}): + text_link = item.parent.find('img',attrs={'alt':'Text'}) + if text_link: + url = self.LOGIN + item.a['href'] + title = item.a.contents[0] + date = strftime(' %B %Y') + articles.append({ + 'title' :title + ,'date' :date + ,'url' :url + ,'description':'' + }) + return [(soup.head.title.string, articles)] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup - def parse_index(self): - articles = [] - print 'Processing ' + self.INDEX - soup = self.index_to_soup(self.INDEX) - for item in soup.findAll('div', attrs={'class':'title'}): - text_link = item.parent.find('img',attrs={'alt':'Text'}) - if text_link: - url = self.LOGIN + item.a['href'] - title = item.a.contents[0] - date = strftime(' %B %Y') - articles.append({ - 'title' :title - ,'date' :date - ,'url' :url - ,'description':'' - }) - return [(soup.head.title.string, articles)] \ No newline at end of file + language = _('English') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_infobae.py b/src/calibre/web/feeds/recipes/recipe_infobae.py index 5acbcfa763..40e720f94c 100644 --- a/src/calibre/web/feeds/recipes/recipe_infobae.py +++ b/src/calibre/web/feeds/recipes/recipe_infobae.py @@ -1,34 +1,36 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' infobae.com ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Infobae(BasicNewsRecipe): title = 'Infobae.com' __author__ = 'Darko Miletic' - description = 'Informacion Libre las 24 horas' + description = 'Informacion Libre las 24 horas' + publisher = 'Infobae.com' + category = 'news, politics, Argentina' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'iso-8859-1' cover_url = 'http://www.infobae.com/imgs/header/header.gif' - - html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , 'Infobae.com' - ] - - + remove_javascript = True - feeds = [ + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + feeds = [ (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) ,(u'Salud' , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml' ) ,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml') @@ -37,5 +39,14 @@ class Infobae(BasicNewsRecipe): def print_version(self, url): main, sep, article_part = url.partition('contenidos/') - article_id, rsep, rrest = article_part.partition('-') + article_id, rsep, rrest = article_part.partition('-') return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id + + def preprocess_html(self, soup): + mtag = '\n\n' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_jb_online.py b/src/calibre/web/feeds/recipes/recipe_jb_online.py index 2b40f1960d..4ac4b70836 100644 --- a/src/calibre/web/feeds/recipes/recipe_jb_online.py +++ b/src/calibre/web/feeds/recipes/recipe_jb_online.py @@ -6,25 +6,29 @@ __copyright__ = '2009, Darko Miletic ' jbonline.terra.com.br ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class JBOnline(BasicNewsRecipe): title = 'Jornal Brasileiro Online' __author__ = 'Darko Miletic' - description = 'News from Brasil' + description = 'News from Brasil' + publisher = 'Jornal Brasileiro' + category = 'news, politics, Brasil' oldest_article = 2 - language = _('Portugese') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://jbonline.terra.com.br/img/logo_01.gif' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Brasil' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'id':'corpoNoticia'})] @@ -36,7 +40,8 @@ class JBOnline(BasicNewsRecipe): ifr = soup.find('iframe') if ifr: ifr.extract() - item = soup.find('div', attrs={'id':'corpoNoticia'}) - if item: - del item['style'] + for item in soup.findAll(style=True): + del item['style'] return soup + + language = _('Portugese') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_jutarnji.py b/src/calibre/web/feeds/recipes/recipe_jutarnji.py index f2504a78c1..194d2163f9 100644 --- a/src/calibre/web/feeds/recipes/recipe_jutarnji.py +++ b/src/calibre/web/feeds/recipes/recipe_jutarnji.py @@ -6,28 +6,35 @@ __copyright__ = '2008, Darko Miletic ' jutarnji.hr ''' -import string, re -from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Jutarnji(BasicNewsRecipe): - title = 'Jutarnji' - __author__ = 'Darko Miletic' - description = 'Online izdanje Jutarnjeg lista' + title = u'Jutarnji' + __author__ = u'Darko Miletic' + description = u'Hrvatski portal' + publisher = 'Jutarnji.hr' + category = 'news, politics, Croatia' oldest_article = 2 max_articles_per_feed = 100 simultaneous_downloads = 1 delay = 1 no_stylesheets = True use_embedded_content = False + remove_javascript = True encoding = 'cp1250' - cover_url = 'http://www.jutarnji.hr/EPHResources/Images/2008/06/05/jhrlogo.png' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Croatia' - , '--publisher', 'Europapress holding d.o.o.' - ] + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] remove_tags = [ @@ -49,11 +56,16 @@ class Jutarnji(BasicNewsRecipe): def print_version(self, url): main, split, rest = url.partition('.jl') rmain, rsplit, rrest = main.rpartition(',') - return u'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + rrest + return 'http://www.jutarnji.hr/ispis_clanka.jl?artid=' + rrest def preprocess_html(self, soup): mtag = '' soup.head.insert(0,mtag) - soup.prettify() + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll(width=True): + del item['width'] return soup \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_juventudrebelde.py b/src/calibre/web/feeds/recipes/recipe_juventudrebelde.py index 5fa9f45e41..eea510a7cd 100644 --- a/src/calibre/web/feeds/recipes/recipe_juventudrebelde.py +++ b/src/calibre/web/feeds/recipes/recipe_juventudrebelde.py @@ -7,26 +7,30 @@ juventudrebelde.cu ''' from calibre import strftime -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Juventudrebelde(BasicNewsRecipe): title = 'Juventud Rebelde' __author__ = 'Darko Miletic' - description = 'Diario de la Juventud Cubana' + description = 'Diario de la Juventud Cubana' + publisher = 'Juventud rebelde' + category = 'news, politics, Cuba' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = strftime('http://www.juventudrebelde.cu/UserFiles/File/impreso/iportada-%Y-%m-%d.jpg') - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Cuba' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher , '--ignore-tables' ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'id':'noticia'})] @@ -40,4 +44,11 @@ class Juventudrebelde(BasicNewsRecipe): ,(u'Lectura', u'http://www.juventudrebelde.cu/rss/generales.php?seccion=lectura' ) ] - + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_juventudrebelde_english.py b/src/calibre/web/feeds/recipes/recipe_juventudrebelde_english.py index e7c1002323..5bae2b6e9c 100644 --- a/src/calibre/web/feeds/recipes/recipe_juventudrebelde_english.py +++ b/src/calibre/web/feeds/recipes/recipe_juventudrebelde_english.py @@ -5,30 +5,40 @@ __copyright__ = '2008, Darko Miletic ' ''' juventudrebelde.co.cu ''' -from calibre import strftime - -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe + class Juventudrebelde_english(BasicNewsRecipe): title = 'Juventud Rebelde in english' __author__ = 'Darko Miletic' - description = 'The newspaper of Cuban Youth' - language = _('English') + description = 'The newspaper of Cuban Youth' + publisher = 'Juventud Rebelde' + category = 'news, politics, Cuba' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'iso-8859-1' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Cuba' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher , '--ignore-tables' ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'class':'read'})] feeds = [(u'All news', u'http://www.juventudrebelde.cip.cu/rss/all/' )] + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + language = _('English') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_la_cuarta.py b/src/calibre/web/feeds/recipes/recipe_la_cuarta.py index e5576cd442..17bc708245 100644 --- a/src/calibre/web/feeds/recipes/recipe_la_cuarta.py +++ b/src/calibre/web/feeds/recipes/recipe_la_cuarta.py @@ -6,30 +6,33 @@ __copyright__ = '2009, Darko Miletic ' lacuarta.cl ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class LaCuarta(BasicNewsRecipe): title = 'La Cuarta' __author__ = 'Darko Miletic' - description = 'El sitio de noticias online de Chile' + description = 'La Cuarta Cibernetica: El Diario popular' + publisher = 'CODISA, Consorcio Digital S.A.' + category = 'news, politics, entertainment, Chile' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Chile' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'class':'articulo desplegado'}) ] remove_tags = [ - dict(name='script') - ,dict(name='ul') + dict(name='ul') ,dict(name='div', attrs={'id':['toolbox','articleImageDisplayer','enviarAmigo']}) ,dict(name='div', attrs={'class':['par ad-1','par ad-2']}) ,dict(name='input') @@ -37,7 +40,14 @@ class LaCuarta(BasicNewsRecipe): ,dict(name='strong', text='PUBLICIDAD') ] + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup feeds = [(u'Noticias', u'http://lacuarta.cl/app/rss?sc=TEFDVUFSVEE=')] + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_la_segunda.py b/src/calibre/web/feeds/recipes/recipe_la_segunda.py index 7f5415806d..d049d9c92b 100644 --- a/src/calibre/web/feeds/recipes/recipe_la_segunda.py +++ b/src/calibre/web/feeds/recipes/recipe_la_segunda.py @@ -6,26 +6,29 @@ __copyright__ = '2009, Darko Miletic ' lasegunda.com ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class LaSegunda(BasicNewsRecipe): title = 'La Segunda' __author__ = 'Darko Miletic' - description = 'El sitio de noticias online de Chile' - language = _('Spanish') + description = 'El sitio de noticias online de Chile' + publisher = 'La Segunda' + category = 'news, politics, Chile' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://www.lasegunda.com/imagenes/logotipo_lasegunda_Oli.gif' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Chile' - , '--publisher' , title - , '--ignore-tables' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='table')] @@ -45,4 +48,14 @@ class LaSegunda(BasicNewsRecipe): def print_version(self, url): rest, sep, article_id = url.partition('index.asp?idnoticia=') return u'http://www.lasegunda.com/edicionOnline/include/secciones/_detalle_impresion.asp?idnoticia=' + article_id + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(name='table', width=True): + del item['width'] + for item in soup.findAll(style=True): + del item['style'] + return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_la_tercera.py b/src/calibre/web/feeds/recipes/recipe_la_tercera.py index 65b0e630df..a0a2f94ee3 100644 --- a/src/calibre/web/feeds/recipes/recipe_la_tercera.py +++ b/src/calibre/web/feeds/recipes/recipe_la_tercera.py @@ -6,26 +6,30 @@ __copyright__ = '2009, Darko Miletic ' latercera.com ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class LaTercera(BasicNewsRecipe): title = 'La Tercera' __author__ = 'Darko Miletic' - description = 'El sitio de noticias online de Chile' + description = 'El sitio de noticias online de Chile' + publisher = 'La Tercera' + category = 'news, politics, Chile' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True - use_embedded_content = False encoding = 'cp1252' - + remove_javascript = True + use_embedded_content = False + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Chile' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - keep_only_tags = [dict(name='div', attrs={'class':'span-16 articulo border'}) ] + keep_only_tags = [dict(name='div', attrs={'class':['span-16 articulo border','span-16 border','span-16']}) ] remove_tags = [ dict(name='script') @@ -50,4 +54,11 @@ class LaTercera(BasicNewsRecipe): ,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657') ] + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_lanacion.py b/src/calibre/web/feeds/recipes/recipe_lanacion.py index 6cf2afdc89..6675fc9b21 100644 --- a/src/calibre/web/feeds/recipes/recipe_lanacion.py +++ b/src/calibre/web/feeds/recipes/recipe_lanacion.py @@ -1,29 +1,32 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' lanacion.com.ar ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class Lanacion(BasicNewsRecipe): title = 'La Nacion' __author__ = 'Darko Miletic' - description = 'Informacion actualizada las 24 horas, con noticias de Argentina y del mundo - Informate ya!' + description = 'Noticias de Argentina y el resto del mundo' + publisher = 'La Nacion' + category = 'news, politics, Argentina' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 - no_stylesheets = True use_embedded_content = False + remove_javascript = True + no_stylesheets = True html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Argentina' - , '--publisher', 'La Nacion SA' - ] + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'class':'nota floatFix'})] remove_tags = [ @@ -47,11 +50,11 @@ class Lanacion(BasicNewsRecipe): ,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' ) ] - def get_cover_url(self): - index = 'http://www.lanacion.com.ar' - cover_url = None - soup = self.index_to_soup(index) - cover_item = soup.find('img',attrs={'class':'logo'}) - if cover_item: - cover_url = index + cover_item['src'] - return cover_url + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_lanacion_chile.py b/src/calibre/web/feeds/recipes/recipe_lanacion_chile.py index 81f31f07d8..8bd521df4b 100644 --- a/src/calibre/web/feeds/recipes/recipe_lanacion_chile.py +++ b/src/calibre/web/feeds/recipes/recipe_lanacion_chile.py @@ -7,25 +7,29 @@ lanacion.cl ''' import urllib -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class LaNacionChile(BasicNewsRecipe): title = 'La Nacion Chile' __author__ = 'Darko Miletic' - description = 'El sitio de noticias online de Chile' + description = 'El sitio de noticias online de Chile' + publisher = 'La Nacion' + category = 'news, politics, Chile' oldest_article = 2 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://www.lanacion.cl/prontus_noticias_v2/imag/site/logo.gif' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Chile' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'class':'bloque'})] @@ -41,5 +45,10 @@ class LaNacionChile(BasicNewsRecipe): item = soup.find('a', attrs={'href':'javascript:window.close()'}) if item: item.extract() + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_laprensa.py b/src/calibre/web/feeds/recipes/recipe_laprensa.py index 7c2567f8e6..f2064c1f6b 100644 --- a/src/calibre/web/feeds/recipes/recipe_laprensa.py +++ b/src/calibre/web/feeds/recipes/recipe_laprensa.py @@ -1,31 +1,35 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' laprensa.com.ar ''' import urllib -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class LaPrensa(BasicNewsRecipe): title = 'La Prensa' __author__ = 'Darko Miletic' - description = 'Informacion Libre las 24 horas' + description = 'Informacion Libre las 24 horas' + publisher = 'La Prensa' + category = 'news, politics, Argentina' oldest_article = 7 - language = _('Spanish') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://www.laprensa.com.ar/imgs/logo.gif' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' feeds = [ (u'Politica' , u'http://www.laprensa.com.ar/Rss.aspx?Rss=4' ) @@ -47,5 +51,10 @@ class LaPrensa(BasicNewsRecipe): def preprocess_html(self, soup): del soup.body['onload'] + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] return soup + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_nin.py b/src/calibre/web/feeds/recipes/recipe_nin.py index bc62af1b4f..d180f2b221 100644 --- a/src/calibre/web/feeds/recipes/recipe_nin.py +++ b/src/calibre/web/feeds/recipes/recipe_nin.py @@ -7,15 +7,17 @@ nin.co.yu ''' import re, urllib -from calibre.web.feeds.news import BasicNewsRecipe -class Nin(BasicNewsRecipe): +from calibre.web.feeds.news import BasicNewsRecipe + +class Nin(BasicNewsRecipe): title = 'NIN online' __author__ = 'Darko Miletic' description = 'Nedeljne informativne novine' + publisher = 'NIN' + category = 'news, politics, Serbia' no_stylesheets = True oldest_article = 15 - language = _('Serbian') simultaneous_downloads = 1 delay = 1 encoding = 'utf8' @@ -23,11 +25,17 @@ class Nin(BasicNewsRecipe): PREFIX = 'http://www.nin.co.yu' INDEX = PREFIX + '/?change_lang=ls' LOGIN = PREFIX + '/?logout=true' + remove_javascript = True + use_embedded_content = False + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, politics, Serbia' - , '--publisher' , 'NIN' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] @@ -54,3 +62,12 @@ class Nin(BasicNewsRecipe): if link_item: cover_url = self.PREFIX + link_item['src'] return cover_url + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_novosti.py b/src/calibre/web/feeds/recipes/recipe_novosti.py index 2fefc8a566..136302c573 100644 --- a/src/calibre/web/feeds/recipes/recipe_novosti.py +++ b/src/calibre/web/feeds/recipes/recipe_novosti.py @@ -5,32 +5,45 @@ __copyright__ = '2008, Darko Miletic ' ''' novosti.rs ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re + +from calibre.web.feeds.news import BasicNewsRecipe + class Novosti(BasicNewsRecipe): - title = 'Vecernje Novosti' - __author__ = 'Darko Miletic' - description = 'novosti, vesti, politika, dosije, drustvo, ekonomija, hronika, reportaze, svet, kultura, sport, beograd, regioni, mozaik, feljton, intrvju, pjer, fudbal, kosarka, podvig, arhiva, komentari, kolumne, srbija, republika srpska,Vecernje novosti' + title = u'Vecernje Novosti' + __author__ = u'Darko Miletic' + description = u'Vesti' + publisher = 'Kompanija Novosti' + category = 'news, politics, Serbia' oldest_article = 2 - language = _('Serbian') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False + encoding = 'utf8' + remove_javascript = True + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Serbia' - , '--publisher', 'Novosti AD' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [ dict(name='div', attrs={'class':'jednaVest'}) ] - remove_tags_after = dict(name='div', attrs={'class':'info_bottom'}) - remove_tags = [ - dict(name='div', attrs={'class':'info'}) - ,dict(name='div', attrs={'class':'info_bottom'}) - ] + keep_only_tags = [dict(name='div', attrs={'class':'jednaVest'})] + remove_tags = [dict(name='div', attrs={'class':['info','info_bottom','clip_div']})] - feeds = [ (u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')] + feeds = [(u'Vesti', u'http://www.novosti.rs/php/vesti/rss.php')] + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_nspm.py b/src/calibre/web/feeds/recipes/recipe_nspm.py index d828636545..4cc6d50ca0 100644 --- a/src/calibre/web/feeds/recipes/recipe_nspm.py +++ b/src/calibre/web/feeds/recipes/recipe_nspm.py @@ -6,35 +6,55 @@ __copyright__ = '2008, Darko Miletic ' nspm.rs ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Nspm(BasicNewsRecipe): title = u'Nova srpska politicka misao' __author__ = 'Darko Miletic' description = 'Casopis za politicku teoriju i drustvena istrazivanja' + publisher = 'NSPM' + category = 'news, politics, Serbia' oldest_article = 7 - language = _('Serbian') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False INDEX = 'http://www.nspm.rs/?alphabet=l' - cover_url = 'http://nspm.rs/templates/jsn_epic_pro/images/logol.jpg' + encoding = 'utf8' + remove_javascript = True + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, politics, Serbia' - , '--publisher', 'IIC NSPM' + '--comment', description + , '--category', category + , '--publisher', publisher + , '--ignore-tables' ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - + remove_tags = [dict(name='a')] + def get_browser(self): br = BasicNewsRecipe.get_browser() br.open(self.INDEX) return br - feeds = [ (u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')] + feeds = [(u'Nova srpska politicka misao', u'http://www.nspm.rs/feed/rss.html')] def print_version(self, url): return url.replace('.html','/stampa.html') + + def preprocess_html(self, soup): + soup.html['xml:lang'] = 'sr-Latn-RS' + soup.html['lang'] = 'sr-Latn-RS' + ftag = soup.find('meta',attrs={'http-equiv':'Content-Language'}) + if ftag: + ftag['content'] = 'sr-Latn-RS' + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_o_globo.py b/src/calibre/web/feeds/recipes/recipe_o_globo.py index 83e7f2da93..f4f78d54b8 100644 --- a/src/calibre/web/feeds/recipes/recipe_o_globo.py +++ b/src/calibre/web/feeds/recipes/recipe_o_globo.py @@ -6,25 +6,29 @@ __copyright__ = '2009, Darko Miletic ' oglobo.globo.com ''' -from calibre.web.feeds.news import BasicNewsRecipe - +from calibre.web.feeds.news import BasicNewsRecipe + class OGlobo(BasicNewsRecipe): title = 'O Globo' __author__ = 'Darko Miletic' - description = 'News from Brasil' + description = 'News from Brasil' + publisher = 'O Globo' + category = 'news, politics, Brasil' oldest_article = 2 max_articles_per_feed = 100 - language = _('Portugese') no_stylesheets = True use_embedded_content = False encoding = 'cp1252' cover_url = 'http://oglobo.globo.com/_img/o-globo.png' - + remove_javascript = True + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Brasil' - , '--publisher' , title + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'id':'ltintb'})] @@ -56,3 +60,10 @@ class OGlobo(BasicNewsRecipe): ,(u'Economia', u'http://oglobo.globo.com/rss/plantaoeconomia.xml') ,(u'Tecnologia', u'http://oglobo.globo.com/rss/plantaotecnologia.xml') ] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Portugese') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_pagina12.py b/src/calibre/web/feeds/recipes/recipe_pagina12.py index 8428a9a35f..b821ed0b68 100644 --- a/src/calibre/web/feeds/recipes/recipe_pagina12.py +++ b/src/calibre/web/feeds/recipes/recipe_pagina12.py @@ -1,31 +1,36 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2008-2009, Darko Miletic ' ''' pagina12.com.ar ''' from calibre import strftime -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe + class Pagina12(BasicNewsRecipe): title = u'Pagina/12' __author__ = 'Darko Miletic' description = 'Noticias de Argentina y el resto del mundo' - language = _('Spanish') + publisher = 'La Pagina S.A.' + category = 'news, politics, Argentina' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True - use_embedded_content = False encoding = 'cp1252' cover_url = strftime('http://www.pagina12.com.ar/fotos/%Y%m%d/diario/TAPAN.jpg') - + remove_javascript = True + use_embedded_content = False + html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Argentina' - , '--publisher' , 'La Pagina S.A.' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' remove_tags = [ @@ -38,3 +43,12 @@ class Pagina12(BasicNewsRecipe): def print_version(self, url): return url.replace('http://www.pagina12.com.ar/','http://www.pagina12.com.ar/imprimir/') + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Spanish') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_pescanik.py b/src/calibre/web/feeds/recipes/recipe_pescanik.py index 48d6a90966..e3385e02aa 100644 --- a/src/calibre/web/feeds/recipes/recipe_pescanik.py +++ b/src/calibre/web/feeds/recipes/recipe_pescanik.py @@ -6,31 +6,53 @@ __copyright__ = '2008, Darko Miletic ' pescanik.net ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Pescanik(BasicNewsRecipe): title = 'Pescanik' __author__ = 'Darko Miletic' description = 'Pescanik' + publisher = 'Pescanik' + category = 'news, politics, Serbia' oldest_article = 7 - language = _('Serbian') max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - html2lrf_options = ['--base-font-size', '10'] - html2epub_options = 'base_font_size = "10pt"' + remove_javascript = True + encoding = 'utf8' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' cover_url = "http://pescanik.net/templates/ja_teline/images/logo.png" preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - remove_tags_after = dict(name='div', attrs={'class':'article_seperator'}) - - remove_tags = [dict(name='td' , attrs={'class':'buttonheading'})] + remove_tags = [ + dict(name='td' , attrs={'class':'buttonheading'}) + ,dict(name='span', attrs={'class':'article_seperator'}) + ,dict(name=['object','link']) + ] feeds = [(u'Pescanik Online', u'http://pescanik.net/index.php?option=com_rd_rss&id=12')] def print_version(self, url): - nurl = url.replace('http://pescanik.net/index.php','http://pescanik.net/index2.php') + nurl = url.replace('/index.php','/index2.php') return nurl + '&pop=1&page=0' + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_politika.py b/src/calibre/web/feeds/recipes/recipe_politika.py index 949a9b781d..1575d8984f 100644 --- a/src/calibre/web/feeds/recipes/recipe_politika.py +++ b/src/calibre/web/feeds/recipes/recipe_politika.py @@ -5,37 +5,61 @@ __copyright__ = '2008, Darko Miletic ' ''' politika.rs ''' -import string,re -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Politika(BasicNewsRecipe): - title = 'Politika Online' + title = u'Politika Online' __author__ = 'Darko Miletic' - description = 'Najstariji dnevni list na Balkanu' + description = 'Najstariji dnevni list na Balkanu' + publisher = 'Politika novine i Magazini d.o.o' + category = 'news, politics, Serbia' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True - extra_css = '.content_center_border {text-align: left;}' use_embedded_content = False - cover_url = 'http://www.politika.rs:8080/images/politika.gif' + remove_javascript = True + encoding = 'utf8' + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, Serbia' - , '--publisher', 'POLITIKA NOVINE I MAGAZINI d.o.o.' + '--comment', description + , '--category', category + , '--publisher', publisher ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - keep_only_tags = [ dict(name='div', attrs={'class':'contentcenter'}) ] - remove_tags_after = dict(name='div', attrs={'class':'datum_item_details'}) + keep_only_tags = [dict(name='div', attrs={'class':'content_center_border'})] + + remove_tags = [ + dict(name='div', attrs={'class':['send_print','txt-komentar']}) + ,dict(name=['object','link','a']) + ,dict(name='h1', attrs={'class':'box_header-tags'}) + ] + feeds = [ (u'Politika' , u'http://www.politika.rs/rubrike/Politika/index.1.lt.xml' ) ,(u'Svet' , u'http://www.politika.rs/rubrike/Svet/index.1.lt.xml' ) + ,(u'Redakcijski komentari', u'http://www.politika.rs/rubrike/redakcijski-komentari/index.1.lt.xml') ,(u'Pogledi' , u'http://www.politika.rs/pogledi/index.lt.xml' ) ,(u'Pogledi sa strane' , u'http://www.politika.rs/rubrike/Pogledi-sa-strane/index.1.lt.xml' ) ,(u'Tema dana' , u'http://www.politika.rs/rubrike/tema-dana/index.1.lt.xml' ) ,(u'Kultura' , u'http://www.politika.rs/rubrike/Kultura/index.1.lt.xml' ) ,(u'Zivot i stil' , u'http://www.politika.rs/rubrike/zivot-i-stil/index.1.lt.xml' ) ] + + def preprocess_html(self, soup): + mtag = '' + soup.head.insert(0,mtag) + for item in soup.findAll(style=True): + del item['style'] + ftag = soup.find('div',attrs={'class':'content_center_border'}) + if ftag: + ftag['align'] = 'left' + return soup diff --git a/src/calibre/web/feeds/recipes/recipe_vijesti.py b/src/calibre/web/feeds/recipes/recipe_vijesti.py index e018e91b02..98a7736a96 100644 --- a/src/calibre/web/feeds/recipes/recipe_vijesti.py +++ b/src/calibre/web/feeds/recipes/recipe_vijesti.py @@ -1,38 +1,49 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = '2008, Darko Miletic ' +__copyright__ = '2009, Darko Miletic ' ''' vijesti.cg.yu ''' -import string,re - -from calibre.web.feeds.news import BasicNewsRecipe +import re +from calibre.web.feeds.news import BasicNewsRecipe + class Vijesti(BasicNewsRecipe): title = 'Vijesti' __author__ = 'Darko Miletic' - description = 'News from Montenegro' - oldest_article = 2 - language = _('Serbian') + description = 'News from Montenegro' + publisher = 'Daily Press Vijesti' + category = 'news, politics, Montenegro' + oldest_article = 1 max_articles_per_feed = 100 no_stylesheets = True - use_embedded_content = False + remove_javascript = True encoding = 'cp1250' cover_url = 'http://www.vijesti.cg.yu/img/logo.gif' - + remove_javascript = True + use_embedded_content = False + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + + html2lrf_options = [ + '--comment', description + , '--category', category + , '--publisher', publisher + ] + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] - html2lrf_options = [ - '--comment' , description - , '--category' , 'news, Montenegro' - , '--publisher' , 'Daily Press Vijesti' - ] - keep_only_tags = [dict(name='div', attrs={'id':'mainnews'})] + remove_tags = [ + dict(name='div', attrs={'align':'right'}) + ,dict(name=['object','link']) + ] + feeds = [(u'Sve vijesti', u'http://www.vijesti.cg.yu/rss.php' )] def preprocess_html(self, soup): @@ -40,4 +51,10 @@ class Vijesti(BasicNewsRecipe): soup.html['lang'] = 'sr-Latn-ME' mtag = '' soup.head.insert(0,mtag) + for item in soup.findAll('img'): + if item.has_key('align'): + del item['align'] + item.insert(0,'

') return soup + + language = _('Serbian') \ No newline at end of file diff --git a/src/calibre/web/feeds/recipes/recipe_vreme.py b/src/calibre/web/feeds/recipes/recipe_vreme.py index 3f0f8b735f..27697acf8e 100644 --- a/src/calibre/web/feeds/recipes/recipe_vreme.py +++ b/src/calibre/web/feeds/recipes/recipe_vreme.py @@ -6,27 +6,34 @@ __copyright__ = '2008, Darko Miletic ' vreme.com ''' -import string,re +import re from calibre import strftime -from calibre.web.feeds.recipes import BasicNewsRecipe -class Vreme(BasicNewsRecipe): - - title = 'Vreme' - __author__ = 'Darko Miletic' - description = 'Politicki Nedeljnik Srbije' +from calibre.web.feeds.news import BasicNewsRecipe + +class Vreme(BasicNewsRecipe): + title = 'Vreme' + __author__ = 'Darko Miletic' + description = 'Politicki Nedeljnik Srbije' + publisher = 'Vreme d.o.o.' + category = 'news, politics, Serbia' no_stylesheets = True - language = _('Serbian') + remove_javascript = True needs_subscription = True INDEX = 'http://www.vreme.com' LOGIN = 'http://www.vreme.com/account/index.php' + remove_javascript = True + use_embedded_content = False + extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "monospace1";src:url(res:///opt/sony/ebook/FONT/tt0419m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: left; font-family: serif1, serif} .article_date{font-family: monospace1, monospace} .article_description{font-family: sans1, sans-serif} .navbar{font-family: monospace1, monospace}' + html2lrf_options = [ - '--comment', description - , '--base-font-size', '10' - , '--category', 'news, politics, Serbia' - , '--publisher', 'Vreme d.o.o.' + '--comment', description + , '--category', category + , '--publisher', publisher ] - + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')] def get_browser(self): @@ -67,10 +74,29 @@ class Vreme(BasicNewsRecipe): ,'description':description }) return [(soup.head.title.string, articles)] + + remove_tags = [ + dict(name=['object','link']) + ,dict(name='table',attrs={'xclass':'image'}) + ] def print_version(self, url): return url + '&print=yes' + def preprocess_html(self, soup): + del soup.body['text' ] + del soup.body['bgcolor'] + del soup.body['onload' ] + mtag = '' + soup.head.insert(0,mtag) + tbl = soup.body.table + tbbb = soup.find('td') + if tbbb: + tbbb.extract() + tbl.extract() + soup.body.insert(0,tbbb) + return soup + def get_cover_url(self): cover_url = None soup = self.index_to_soup(self.INDEX) @@ -78,3 +104,5 @@ class Vreme(BasicNewsRecipe): if cover_item: cover_url = self.INDEX + cover_item['src'] return cover_url + + language = _('Serbian') \ No newline at end of file