#!/usr/bin/env python
# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2011 - 2021, Darko Miletic '
'''
https://www.ambito.com/contenidos/edicion-impresa.html
'''

import time

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Ambito_Financiero(BasicNewsRecipe):
    """Recipe that builds a single feed from the ambito.com printed-edition page.

    The index page (``INDEX``) is scraped for ``<h2 class="title">`` headings;
    each heading's link becomes one article. A login form is submitted in
    ``get_browser`` when credentials are configured, and ``cleanup`` opens
    the logout URL.
    """

    # --- Descriptive metadata -------------------------------------------
    title = 'Ambito Financiero'
    __author__ = 'Darko Miletic'
    description = 'Informacion Libre las 24 horas'
    publisher = 'Editorial Nefir S.A.'
    category = 'news, politics, economy, Argentina'
    language = 'es_AR'
    publication_type = 'newspaper'

    # --- Download / parsing settings consumed by BasicNewsRecipe --------
    no_stylesheets = True
    encoding = 'utf8'
    needs_subscription = True
    use_embedded_content = False
    fetch_retries = 10
    delay = 1
    timeout = 8
    ignore_duplicate_articles = {'url'}

    # --- Site endpoints used by the methods below -----------------------
    INDEX = 'https://www.ambito.com/contenidos/edicion-impresa.html'
    LOGIN = 'https://usuarios.ambito.com/singup'
    LOGOUT = 'https://usuarios.ambito.com/logout'

    # Adjacent literals keep the stylesheet text exactly as it was while
    # staying within a readable line length.
    extra_css = (
        ' body{font-family: Roboto, "Helvetica Neue", Arial, sans-serif;}'
        ' .title{font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif} '
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only the article header wrapper and any element carrying the
    # ``detail-body`` class token are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'detail-header-wrapper'}),
        dict(attrs={'class': lambda x: x and 'detail-body' in x.split()}),
    ]

    # Embedded/media tags and any element carrying the ``detail-gallery``
    # class token are stripped.
    remove_tags = [
        dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'img']),
        dict(attrs={'class': lambda x: x and 'detail-gallery' in x.split()}),
    ]

    def get_browser(self):
        """Return a browser that has visited the index and login pages.

        The index page and then the login page are opened unconditionally.
        When both ``self.username`` and ``self.password`` are set, the form
        named ``td_login`` is filled in with them and submitted.

        Returns:
            The browser object obtained from ``BasicNewsRecipe.get_browser``.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        br.open(self.LOGIN)
        if self.username is not None and self.password is not None:
            br.select_form(name='td_login')
            br['login_user'] = self.username
            br['login_pass'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Scrape the printed-edition index into a single feed.

        Side effect: when the ``view-printed-edition`` block contains an
        ``<img class="lightbox">`` with a ``src`` attribute, that value is
        stored in ``self.cover_url``.

        Returns:
            A one-element list ``[(self.title, articles)]``. Each article is
            a dict with the keys ``title``, ``date``, ``url`` and
            ``description``; URLs are unique and kept in page order.
        """
        soup = self.index_to_soup(self.INDEX)

        printed = soup.find('div', attrs={'class': 'view-printed-edition'})
        if printed:
            dimg = printed.find('img', attrs={'class': 'lightbox'})
            # An <img> without ``src`` must not abort the whole download.
            cover = dimg.get('src') if dimg else None
            if cover:
                self.cover_url = cover

        # Every article gets the same fetch-time stamp, so build it once.
        date = strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())

        articles = []
        seen = set()
        for feed_link in soup.find_all('h2', attrs={'class': 'title'}):
            anchor = feed_link.a
            url = anchor.get('href') if anchor is not None else None
            # Skip headings that carry no usable link, and repeated links.
            if not url or url in seen:
                continue
            seen.add(url)
            articles.append({
                'title': self.tag_to_string(anchor),
                'date': date,
                'url': url,
                'description': u'',
            })
        return [(self.title, articles)]

    def cleanup(self):
        """Open the logout URL with the recipe's browser."""
        self.browser.open(self.LOGOUT)