#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2010, Francisco Javier Nieto '
'''
www.hoy.es
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    """Create a new tag named *name* that belongs to *soup*.

    If *soup* exposes a ``new_tag`` attribute it is called with the
    attributes converted to a dict; otherwise ``Tag`` is constructed
    directly.

    :param soup: parsed document the tag is created for.
    :param name: tag name, e.g. ``'meta'``.
    :param attrs: iterable of ``(attribute, value)`` pairs.
    :return: the newly created tag (not yet inserted into the tree).
    """
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    # No factory method on the soup object: build the Tag by hand.
    # An empty attribute sequence is passed as None.
    return Tag(soup, name, attrs=attrs or None)


class Hoy(BasicNewsRecipe):
    """Recipe that downloads the RSS feeds of www.hoy.es."""

    # Publication metadata.
    title = 'HOY'
    __author__ = 'Fco Javier Nieto'
    description = u'Noticias desde Extremadura'
    publisher = 'HOY'
    category = 'news, politics, Spain, Extremadura'
    language = 'es'

    # Download settings (semantics are defined by BasicNewsRecipe).
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    encoding = 'cp1252'

    # (section title, feed URL) pairs.
    feeds = [
        (u'Portada', u'http://www.hoy.es/portada.xml'),
        (u'Regional', u'http://www.hoy.es/rss/feeds/regional.xml'),
        (u'Prov de Badajoz', u'http://www.hoy.es/rss/feeds/prov_badajoz.xml'),
        (u'Prov de Caceres', u'http://www.hoy.es/rss/feeds/prov_caceres.xml'),
        (u'Badajoz', u'http://www.hoy.es/rss/feeds/badajoz.xml'),
        (u'Caceres', u'http://www.hoy.es/rss/feeds/caceres.xml'),
        (u'Merida', u'http://www.hoy.es/rss/feeds/merida.xml'),
        (u'Opinion', u'http://www.hoy.es/rss/feeds/opinion.xml'),
        (u'Nacional', u'http://www.hoy.es/rss/feeds/nacional.xml'),
        (u'Internacional', u'http://www.hoy.es/rss/feeds/internacional.xml'),
        (u'Economia', u'http://www.hoy.es/rss/feeds/economia.xml'),
        (u'Deportes', u'http://www.hoy.es/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.hoy.es/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.hoy.es/rss/feeds/cultura.xml'),
        (u'Television', u'http://www.hoy.es/rss/feeds/television.xml'),
        (u'contraportada', u'http://www.hoy.es/rss/feeds/contraportada.xml'),
    ]

    # Article page filtering: headline, sub-headline and body text are kept.
    keep_only_tags = [
        dict(name='h1', attrs={'class': ['headline']}),
        dict(name='h2', attrs={'class': ['subhead']}),
        dict(name='div', attrs={'class': ['text']}),
    ]

    remove_tags = [
        dict(name=['object', 'link', 'script']),
        dict(name='div', attrs={'class': ['colC_articulo', 'peu']}),
    ]

    remove_tags_after = [dict(name='div', attrs={'class': 'text'})]

    # The previous value used the invalid shorthand "font: sans-serif 2em"
    # (size must precede family) and never closed the second rule, so the
    # declarations were malformed. Explicit properties avoid both problems
    # and do not reset font-weight the way the shorthand would.
    extra_css = (
        '.headline {font-family: sans-serif; font-size: 2em;}\n'
        '.subhead,h2 {font-family: sans-serif; font-size: 1.5em;}\n'
    )

    def preprocess_html(self, soup):
        """Normalise a downloaded article page.

        Sets the text direction on ``<html>``, inserts a UTF-8
        ``Content-Type`` meta tag as the first child of ``<head>`` and
        removes every inline ``style`` attribute.

        :param soup: parsed article document.
        :return: the same *soup* object, modified in place.
        """
        # This class does not declare ``direction``; fall back to
        # left-to-right instead of raising AttributeError when no base
        # class supplies it.
        direction = getattr(self, 'direction', 'ltr')
        if soup.html is not None:
            soup.html['dir'] = direction

        mcharset = new_tag(soup, 'meta', [
            ("http-equiv", "Content-Type"),
            ("content", "text/html; charset=utf-8"),
        ])
        # Skip the insertion rather than crash on a page without <head>.
        if soup.head is not None:
            soup.head.insert(0, mcharset)

        for item in soup.findAll(style=True):
            del item['style']
        return soup