#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic '
'''
www.lavanguardia.es
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* for the document *soup*.

    If *soup* exposes a ``new_tag`` factory it is used, with *attrs*
    converted to a dict. Otherwise the imported ``Tag`` class is
    instantiated directly (presumably for older BeautifulSoup versions
    that lack the factory -- confirm against the bundled library).

    :param soup: parsed document the tag is created for.
    :param name: tag name, e.g. ``'meta'``.
    :param attrs: iterable of ``(attribute, value)`` pairs.
    :return: the newly created tag; it is not inserted into *soup*.
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    # An empty attribute sequence is passed as None in the fallback path.
    return Tag(soup, name, attrs=attrs or None)


class LaVanguardia(BasicNewsRecipe):
    """Recipe for La Vanguardia Digital (www.lavanguardia.es).

    Articles come from the FeedBurner feeds listed in ``feeds``; the
    ``keep_only_tags`` / ``remove_tags`` / ``remove_tags_after`` selectors
    describe which parts of each page are kept, and ``preprocess_html``
    normalises the remaining markup.
    """

    title = 'La Vanguardia Digital'
    __author__ = 'Darko Miletic'
    description = u'Noticias desde España'
    publisher = 'La Vanguardia'
    category = 'news, politics, Spain'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 5
    language = 'es'
    # Written to the <html dir="..."> attribute by preprocess_html().
    direction = 'ltr'

    # Book metadata (description, category, publisher) expressed in the two
    # option formats below; both are built from the attributes above.
    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = (
        'publisher="' + publisher +
        '"\ncomments="' + description +
        '"\ntags="' + category + '"'
    )

    feeds = [
        (u'Portada', u'http://feeds.feedburner.com/lavanguardia/home'),
        (u'Cultura', u'http://feeds.feedburner.com/lavanguardia/cultura'),
        (u'Deportes', u'http://feeds.feedburner.com/lavanguardia/deportes'),
        (u'Economia', u'http://feeds.feedburner.com/lavanguardia/economia'),
        (u'El lector opina', u'http://feeds.feedburner.com/lavanguardia/lectoropina'),
        (u'Gente y TV', u'http://feeds.feedburner.com/lavanguardia/gente'),
        (u'Internacional', u'http://feeds.feedburner.com/lavanguardia/internacional'),
        (u'Internet y tecnologia', u'http://feeds.feedburner.com/lavanguardia/internet'),
        (u'Motor', u'http://feeds.feedburner.com/lavanguardia/motor'),
        (u'Politica', u'http://feeds.feedburner.com/lavanguardia/politica'),
        (u'Sucesos', u'http://feeds.feedburner.com/lavanguardia/sucesos'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'detalle noticia'}),
    ]

    remove_tags = [
        dict(name=['object', 'link', 'script']),
        dict(name='div', attrs={'class': ['colC', 'peu', 'jstoolbar']}),
    ]

    remove_tags_after = [dict(name='div', attrs={'class': 'text'})]

    def preprocess_html(self, soup):
        """Normalise a downloaded article before conversion.

        Sets the text direction on ``<html>``, inserts a UTF-8
        ``Content-Type`` ``<meta>`` as the first child of ``<head>``, and
        removes every inline ``style`` attribute.

        :param soup: parsed article document; modified in place.
        :return: the same *soup* object.
        """
        # Documents without <html> or <head> are left without the
        # corresponding tweak instead of raising on a None attribute.
        if soup.html is not None:
            soup.html['dir'] = self.direction
        if soup.head is not None:
            mcharset = new_tag(soup, 'meta', [
                ("http-equiv", "Content-Type"),
                ("content", "text/html; charset=utf-8"),
            ])
            soup.head.insert(0, mcharset)
        for item in soup.findAll(style=True):
            del item['style']
        return soup