__copyright__ = '2008-2013, Darko Miletic '
'''
lanacion.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Lanacion(BasicNewsRecipe):
    """Recipe that downloads the La Nacion (lanacion.com.ar) section feeds.

    The class attributes configure the BasicNewsRecipe machinery; the
    methods below adjust article URLs, locate a cover image and clean up
    the downloaded article markup.
    """

    title = 'La Nacion'
    __author__ = 'Darko Miletic'
    description = "lanacion.com - Informacion actualizada las 24 horas, con noticias de Argentina y del mundo"
    publisher = 'La Nacion S.A.'
    category = 'news, politics, Argentina'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    language = 'es_AR'
    publication_type = 'newspaper'
    remove_empty_feeds = True
    masthead_url = 'http://www.lanacion.com.ar/_ui/desktop/imgs/layout/logos/ln-home.gif'
    # Split with implicit concatenation only to keep the lines short; the
    # resulting string is unchanged.
    extra_css = (
        ' h1{font-family: TheSans,Arial,sans-serif}'
        ' body{font-family: Arial,sans-serif}'
        ' img{display: block}'
        ' .firma,.fecha{font-size: small}'
        ' .epigrafe-columna{font-size: x-small} '
    )

    # Metadata handed to the conversion step, built from the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Elements stripped from every article, by tag name and by id.
    remove_tags = [
        dict(name=['iframe', 'embed', 'object', 'meta', 'link']),
        dict(attrs={'id': ['herramientas', 'relacionadas', 'ampliar']}),
    ]
    # Markers (by element id) bounding the part of the page that is kept.
    remove_tags_before = dict(attrs={'id': 'encabezado'})
    remove_tags_after = dict(attrs={'id': 'relacionadas'})

    # (section title, feed URL) pairs.
    feeds = [
        (u'Politica', u'http://lanacion.com.ar.feedsportal.com/politica'),
        (u'Deportes', u'http://lanacion.com.ar.feedsportal.com/deportes'),
        (u'Economia', u'http://lanacion.com.ar.feedsportal.com/economia'),
        (u'Sociedad', u'http://lanacion.com.ar.feedsportal.com/sociedad'),
        (u'Seguridad', u'http://lanacion.com.ar.feedsportal.com/seguridad'),
        (u'Buenos Aires', u'http://lanacion.com.ar.feedsportal.com/buenosaires'),
        (u'Opinion', u'http://lanacion.com.ar.feedsportal.com/opinion'),
        (u'Espectaculos', u'http://lanacion.com.ar.feedsportal.com/espectaculos'),
        (u'El Mundo', u'http://lanacion.com.ar.feedsportal.com/mundo'),
        (u'Revista', u'http://lanacion.com.ar.feedsportal.com/revistalanacion'),
        (u'Enfoques', u'http://lanacion.com.ar.feedsportal.com/enfoques'),
        (u'Comercio Exterior', u'http://lanacion.com.ar.feedsportal.com/comercioexterior'),
        (u'Tecnologia', u'http://lanacion.com.ar.feedsportal.com/tecnologia'),
        (u'Turismo', u'http://lanacion.com.ar.feedsportal.com/turismo'),
        (u'Al volante', u'http://lanacion.com.ar.feedsportal.com/alvolante'),
        (u'El Campo', u'http://lanacion.com.ar.feedsportal.com/elcampo'),
        (u'Moda y Belleza', u'http://lanacion.com.ar.feedsportal.com/modaybelleza'),
        (u'Inmuebles Comerciales', u'http://lanacion.com.ar.feedsportal.com/inmueblescomerciales'),
        (u'Countries', u'http://lanacion.com.ar.feedsportal.com/countries'),
        (u'adnCultura', u'http://lanacion.com.ar.feedsportal.com/adncultura'),
        (u'The WSJ Americas', u'http://lanacion.com.ar.feedsportal.com/wallstreetjournalamericas'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        The entry's ``guid`` is used as the link. Entries without a guid
        and links containing ``galeria=`` yield None. Blog links that do
        not end in a slash are opened once and the URL reported by the
        response is returned instead.
        """
        link = article.get('guid', None)
        if not link:
            # No guid on this entry: nothing to call string methods on.
            return None
        if link.startswith('http://blogs.lanacion') and not link.endswith('/'):
            # Ask the server where the link ends up and use that address.
            return self.browser.open_novisit(link).geturl()
        if 'galeria=' in link:
            return None
        return link

    def get_cover_url(self):
        """Return the cover image URL from the print-edition page.

        Looks for the first ``img`` inside the element with class
        ``tapa``. Returns None when that element, the image, or the
        image's ``src`` attribute is missing.
        """
        soup = self.index_to_soup('http://www.lanacion.com.ar/edicion-impresa')
        front_page = soup.find(attrs={'class': 'tapa'})
        if front_page is None:
            return None
        image = front_page.find('img')
        if image is None:
            return None
        # .get avoids a KeyError on an <img> that carries no src.
        return image.get('src')

    def preprocess_html(self, soup):
        """Simplify the article markup and return the same soup.

        Removes inline ``style`` attributes, replaces links by their
        text (links wrapping an image become attribute-less ``div``
        elements instead), and gives every image an ``alt`` attribute.
        """
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('a'):
            if item.string is not None:
                # Anchor with a single text child: keep only that text.
                item.replaceWith(item.string)
            elif item.find('img'):
                # Keep the image but drop the link behaviour.
                item.name = 'div'
                # NOTE(review): a list-valued attrs and has_key() below look
                # like the BeautifulSoup 3 API - confirm against the parser
                # version bundled with the targeted calibre release.
                item.attrs = []
            else:
                item.replaceWith(self.tag_to_string(item))
        for item in soup.findAll('img'):
            if not item.has_key('alt'):
                item['alt'] = 'image'
        return soup