#!/usr/bin/env python2
__license__ = 'GPL v3'
__author__ = 'Jordi Balcells, based on an earlier version by Lorenzo Vigentini & Kovid Goyal'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
description = 'Main daily newspaper from Spain - v1.04 (19, October 2010)'
__docformat__ = 'restructuredtext en'

'''
elpais.es
'''

from calibre.web.feeds.news import BasicNewsRecipe


class ElPais(BasicNewsRecipe):
    """Recipe that builds an e-book from the El Pais RSS feeds."""

    __author__ = 'Kovid Goyal & Lorenzo Vigentini & Jordi Balcells'
    description = 'Main daily newspaper from Spain'

    title = u'El Pais'
    publisher = u'Ediciones El Pa\xeds SL'
    category = 'News, politics, culture, economy, general interest'

    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    oldest_article = 2
    max_articles_per_feed = 15

    use_embedded_content = False
    # NOTE(review): the attribute documented for BasicNewsRecipe appears to be
    # `recursions`; confirm whether this name is intentional before changing it.
    recursion = 5

    remove_javascript = True
    no_stylesheets = True

    # Only the headline, the article body/images/captions and the
    # subtitle/opening sections are kept from each downloaded page.
    keep_only_tags = [
        dict(name='h1'),
        dict(itemprop=['articleBody', 'image', 'caption']),
        dict(attrs={'class': ['articulo-subtitulos', 'articulo-apertura ']}),
    ]

    feeds = [
        (u'Titulares de portada', u'http://www.elpais.com/rss/feed.html?feedId=1022'),
        (u'Internacional', u'http://www.elpais.com/rss/feed.html?feedId=1001'),
        (u'Espa\xf1a', u'http://www.elpais.com/rss/feed.html?feedId=1002'),
        (u'Deportes', u'http://www.elpais.com/rss/feed.html?feedId=1007'),
        (u'Econom\xeda', u'http://www.elpais.com/rss/feed.html?feedId=1006'),
        (u'Pol\xedtica', u'http://www.elpais.com/rss/feed.html?feedId=17073'),
        (u'Tecnolog\xeda', u'http://www.elpais.com/rss/feed.html?feedId=1005'),
        (u'Cultura', u'http://www.elpais.com/rss/feed.html?feedId=1008'),
        (u'Gente', u'http://www.elpais.com/rss/feed.html?feedId=1009'),
        (u'Sociedad', u'http://www.elpais.com/rss/feed.html?feedId=1004'),
        (u'Opini\xf3n', u'http://www.elpais.com/rss/feed.html?feedId=1003'),
        (u'Ciencia', u'http://www.elpais.com/rss/feed.html?feedId=17068'),
        (u'Justicia y leyes', u'http://www.elpais.com/rss/feed.html?feedId=17069'),
        (u'Medio ambiente', u'http://www.elpais.com/rss/feed.html?feedId=17071'),
        (u'Vi\xf1etas', u'http://www.elpais.com/rss/feed.html?feedId=17058'),
    ]

    def preprocess_html(self, soup):
        """Give every ``<img srcset=...>`` a ``src`` taken from its srcset.

        The first whitespace-separated token of ``srcset`` (the URL of the
        first image candidate) is copied into ``src``. Images whose
        ``srcset`` holds no usable token are left untouched.

        :param soup: parsed document exposing ``findAll``.
        :return: the same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', srcset=True):
            # str.split() with no separator never yields empty strings, so the
            # list can be indexed directly; this also avoids subscripting the
            # iterator that filter() returns on Python 3.
            candidates = img['srcset'].split()
            if not candidates:
                continue
            # A candidate without a descriptor ("a.jpg, b.jpg 2x") leaves the
            # separating comma glued to the URL; drop it.
            url = candidates[0].rstrip(',')
            if url:
                img['src'] = url
        return soup