diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index cda9bf83d2..b7f9cd3c6c 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -1,12 +1,8 @@
-#!/usr/bin/env python
-
 __license__ = 'GPL v3'
-__copyright__ = '2008-2009, Darko Miletic '
+__copyright__ = '2008-2010, Darko Miletic '
 '''
 infobae.com
 '''
-import re
-import urllib, urlparse
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe):
     max_articles_per_feed = 100
     no_stylesheets = True
     use_embedded_content = False
-    language = 'es'
-    lang = 'es-AR'
-
+    language              = 'es'
     encoding = 'cp1252'
-    cover_url = 'http://www.infobae.com/imgs/header/header.gif'
+    masthead_url = 'http://www.infobae.com/imgs/header/header.gif'
     remove_javascript = True
-    preprocess_regexps = [(re.compile(
-        r''), lambda m:'')]
-
-
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
-
-    extra_css = '''
-        .col-center{font-family:Arial,Helvetica,sans-serif;}
-        h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
-        .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
-        '''
-
-    keep_only_tags = [dict(name='div', attrs={'class':['content']})]
-
-
-    remove_tags = [
-        dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
-        dict(name='a', attrs={'name' : 'comentario',}),
-        dict(name='iframe'),
-        dict(name='img', alt = "Ver galerias de imagenes"),
-
-    ]
-
+    remove_empty_feeds = True
+    extra_css = '''
+        body{font-family:Arial,Helvetica,sans-serif;}
+        .popUpTitulo{color:#0D4261; font-size: xx-large}
+    '''
+
+    conversion_options = {
+          'comments' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        , 'linearize_tables' : True
+    }
+
 
     feeds = [
         (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
@@ -57,39 +42,18 @@ class Infobae(BasicNewsRecipe):
         ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
         ]
 
-#    def print_version(self, url):
-#        main, sep, article_part = url.partition('contenidos/')
-#        article_id, rsep, rrest = article_part.partition('-')
-#        return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
-
-    def get_article_url(self, article):
-        ans = article.get('link').encode('utf-8')
-        parts = list(urlparse.urlparse(ans))
-        parts[2] = urllib.quote(parts[2])
-        ans = urlparse.urlunparse(parts)
-        return ans.decode('utf-8')
-
-
-    def preprocess_html(self, soup):
-
-        for tag in soup.head.findAll('strong'):
-            tag.extract()
-        for tag in soup.findAll('meta'):
-            del tag['content']
-            tag.extract()
-
-        mtag = '\n\n'
-        soup.head.insert(0,mtag)
-        for item in soup.findAll(style=True):
-            del item['style']
-
-        return soup
+    def print_version(self, url):
+        '''
+        Return the printer-friendly URL for an article: the id is the part
+        of the last path segment of url that precedes the first '-'.
+        '''
+        article_part = url.rpartition('/')[2]
+        article_id= article_part.partition('-')[0]
+        return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
 
     def postprocess_html(self, soup, first):
-
         for tag in soup.findAll(name='strong'):
             tag.name = 'b'
-
         return soup
 
 
diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 13ff42b277..29f2cfc5e3 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,6 +6,7 @@ nspm.rs
 
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag, NavigableString
 
 class Nspm(BasicNewsRecipe):
     title = 'Nova srpska politicka misao'
@@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe):
     encoding = 'utf-8'
     language = 'sr'
     delay = 2
+    remove_empty_feeds = True
     publication_type = 'magazine'
     masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
     extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
@@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed','script','meta','base','iframe']) ,dict(attrs={'class':'buttonheading'}) ] - remove_tags_after = dict(attrs={'class':'article_separator'}) - remove_attributes = ['width','height'] + remove_tags_before = dict(attrs={'class':'contentheading'}) + remove_tags_after = dict(attrs={'class':'article_separator'}) + remove_attributes = ['width','height'] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.body.findAll(style=True): del item['style'] + for item in soup.body.findAll('h1'): + nh = NavigableString(item.a.string) + item.a.extract() + item.insert(0,nh) return self.adeify_images(soup)