#!/usr/bin/env python2 __license__ = 'GPL v3' __copyright__ = '04 December 2010, desUBIKado' __author__ = 'desUBIKado' __description__ = 'Daily newspaper from Aragon' __version__ = 'v0.08' __date__ = '10, September 2017' ''' http://www.heraldo.es/ ''' import time import re from calibre.web.feeds.news import BasicNewsRecipe class heraldo(BasicNewsRecipe): author = 'desUBIKado' description = 'Daily newspaper from Aragon' title = u'Heraldo de Aragon' publisher = 'Grupo Heraldo' category = 'News, politics, culture, economy, general interest' language = 'es' timefmt = '[%a, %d %b, %Y]' oldest_article = 2 delay = 1 max_articles_per_feed = 100 use_embedded_content = False masthead_url = 'http://aureel.com/es/wp-content/uploads/sites/4/2016/07/Heraldo_de_Arago%CC%81n.png' remove_empty_feeds = True remove_javascript = True no_stylesheets = True feeds = [(u'Noticias', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss')] keep_only_tags = [ dict( name='div', attrs={ 'class': ['row-f2 brd-row-f4 bck-row-f1-f1 padd-t padd-btt con n-marg-btt'] } ), dict(name='div', attrs={'id': ['dts', 'com']}), dict(name='img', attrs={'class': ['lazy']}) ] remove_tags = [ dict( name='a', attrs={'class': ['com flo-r', 'enl-if', 'enl-df', 'next_com']} ), dict( name='div', attrs={ 'class': [ 'brb-b-s con marg-btt', 'cnt-rel con', 'col5-f1', 'tit txt-wh f-s con', 'con cont-top ', 'col5-f1 flo-l', 'cnt-rel brr', 'caj_part con', 'caj_topic con' ] } ), dict( name='div', attrs={ 'id': [ 'cont-Top-8760', 'caj-pub', '8760-cpt1', 'caj_topic con', 'slider-oferplan', 'cont-Top-' ] } ), dict(name='form', attrs={'class': 'form'}), dict( name='ul', attrs={'class': ['tabs-nav', 'men_nav con hg_2n', 'lst-not-f2 con ']} ), dict(name='span', attrs={'class': ['flo-r']}), dict(name='ul', attrs={'id': ['cont-tags', 'pag-1', 'pag-cnt-I-']}) ] remove_tags_before = dict(name='div', attrs={'id': 'dts'}) remove_tags_after = dict(name='div', attrs={'id': 'com'}) def get_cover_url(self): cover = None st = time.localtime() year = str(st.tm_year) month = "%.2d" % st.tm_mon day = "%.2d" % st.tm_mday # http://img.kiosko.net/2017/09/10/es/heraldo_aragon.750.jpg cover = 'http://img.kiosko.net/' + year + '/' + month + '/' + day + '/es/heraldo_aragon.750.jpg' br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: self.log("\nPortada no disponible") cover = 'http://aureel.com/es/wp-content/uploads/sites/4/2016/07/Heraldo_de_Arago%CC%81n.png' return cover extra_css = ''' h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;} h2 {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:22px;color:#4D4D4D;} .ladillo {font-family:georgia,serif; font-weight:bold;font-size:18px;} .firm, .sp, .fech, ".com flo-r" {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;} img{margin-bottom: 0.4em} ''' preprocess_regexps = [ # Para separar los comentarios con una linea en blanco ( re.compile(r'
', re.DOTALL | re.IGNORECASE), lambda match: '

' ), ( re.compile(r'