diff --git a/recipes/ambito_financiero.recipe b/recipes/ambito_financiero.recipe
index fa4e6c0a51..633a8d5b74 100644
--- a/recipes/ambito_financiero.recipe
+++ b/recipes/ambito_financiero.recipe
@@ -3,9 +3,9 @@
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL v3'
-__copyright__ = '2011 - 2018, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2011 - 2021, Darko Miletic <darko.miletic at gmail.com>'
 '''
-http://www.ambito.com/diario/
+https://www.ambito.com/contenidos/edicion-impresa.html
 '''
 
 import time
@@ -17,7 +17,6 @@ import re
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ptempfile import PersistentTemporaryFile
 
 
 class Ambito_Financiero(BasicNewsRecipe):
@@ -34,19 +33,14 @@ class Ambito_Financiero(BasicNewsRecipe):
     language = 'es_AR'
     fetch_retries = 10
     delay = 1
-    session_id = None
     timeout = 8
     ignore_duplicate_articles = {'url'}
-    articles_are_obfuscated = True
-    temp_files = []
-    PREFIX = 'http://www.ambito.com'
-    PREFIXDIARIO = PREFIX + '/diario'
-    INDEX = PREFIX + '/diario/index.asp'
-    LOGIN = PREFIX + '/login/login_cabezal.asp'
+    INDEX = 'https://www.ambito.com/contenidos/edicion-impresa.html'
+    LOGIN = 'https://usuarios.ambito.com/singup'
+    LOGOUT = 'https://usuarios.ambito.com/logout'
     extra_css = """
-        body{font-family: Roboto, sans-serif;}
-        .titulo-noticia{font-family: "Roboto Condensed", sans-serif;}
-        .dia{font-family: "Roboto Condensed", sans-serif; font-size: small;}
+        body{font-family: Roboto, "Helvetica Neue", Arial, sans-serif;}
+        .title{font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif}
     """
 
     conversion_options = {
@@ -57,47 +51,37 @@ class Ambito_Financiero(BasicNewsRecipe):
     }
 
     keep_only_tags = [
-        dict(name='h6', attrs={'class': lambda x: x and 'bajada' in x.split()}),
-        dict(name='span', attrs={'class': lambda x: x and 'dia' in x.split()}),
-        dict(attrs={'class': lambda x: x and 'titulo-noticia' in x.split()}),
-        dict(attrs={'class': lambda x: x and 'foto-perfil-columnista' in x.split()}),
-        dict(attrs={'class': lambda x: x and 'despliegue-noticia' in x.split()}),
+        dict(name='div', attrs={'class': 'detail-header-wrapper'}),
+        dict(attrs={'class': lambda x: x and 'detail-body' in x.split()}),
+    ]
+    remove_tags = [
+        dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link', 'img']),
+        dict(attrs={"class": lambda x: x and 'detail-gallery' in x.split()})
     ]
-    remove_tags = [dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link'])]
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
+        br.open(self.LOGIN)
         if self.username is not None and self.password is not None:
-            postdata = urlencode({
-                'txtUser': self.username,
-                'txtPassword': self.password
-            })
-            response = br.open(
-                'http://www.ambito.com/diario/no-cache/login/x_login_cabezal.asp',
-                data=postdata,
-                timeout=self.timeout
-            )
-            sessiondata = response.read()
-            prog = re.compile(
-                r"^(?P<session_id>\d+?),(?P<username>.+?),(?P<email>.+?),.*?"
-            )
-            m = prog.match(sessiondata)
-            if m:
-                self.session_id = m.group('session_id')
-            # br.set_debug_http(True)
-            # br.set_debug_responses(True)
-            # br.set_debug_redirects(True)
+            br.select_form(name='td_login')
+            br['login_user'] = self.username
+            br['login_pass'] = self.password
+            br.submit()
         return br
 
     def parse_index(self):
         soup = self.index_to_soup(self.INDEX)
         articles = []
         checker = []
-        rootitem = soup.find(attrs={'class': 'ei-dropdown'})
-        for feed_link in rootitem.findAll('a', href=True):
-            url = self.PREFIXDIARIO + feed_link['href']
-            title = self.tag_to_string(feed_link)
+        printed = soup.find('div', attrs={"class":"view-printed-edition"})
+        if printed:
+            dimg = printed.find('img', attrs={"class":"lightbox"})
+            if dimg:
+                self.cover_url = dimg['src']
+        for feed_link in soup.find_all('h2', attrs={"class": "title"}):
+            url = feed_link.a['href']
+            title = self.tag_to_string(feed_link.a)
             date = strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
             if url not in checker:
                 checker.append(url)
@@ -109,56 +93,5 @@ class Ambito_Financiero(BasicNewsRecipe):
                 })
         return [(self.title, articles)]
 
-    def preprocess_raw_html(self, raw_html, url):
-        if self.session_id:
-            l, s, r = url.rpartition('.html')
-            o, s1, artid = l.rpartition('_')
-            postdata = urlencode({'id': artid, 'id_session': self.session_id})
-            response = self.browser.open(
-                'http://data.ambito.com/diario/cuerpo_noticia.asp',
-                data=postdata,
-                timeout=self.timeout
-            )
-            soup = BeautifulSoup(raw_html)
-            p = soup.find(id="cuerpo_noticia")
-            if p:
-                smallsoup = BeautifulSoup(response.read())
-                cfind = smallsoup.find('div', id="contenido_data")
-                if cfind:
-                    p.append(cfind)
-            return type(u'')(soup)
-        return raw_html
-
     def cleanup(self):
-        if self.session_id is not None:
-            postdata = urlencode({'session_id': self.session_id})
-            self.browser.open(
-                'http://www.ambito.com/diario/no-cache/login/x_logout.asp', data=postdata, timeout=self.timeout
-            )
-            self.session_id = None
-
-    def get_obfuscated_article(self, url):
-        result = None
-        count = 0
-        while (count < self.fetch_retries):
-            try:
-                response = self.browser.open(url, timeout=self.timeout)
-                html = response.read()
-                count = self.fetch_retries
-                l, s, r = url.rpartition('/')
-                artid, s1, r1 = r.partition('-')
-                tfile = PersistentTemporaryFile('_' + artid + '.html')
-                tfile.write(html)
-                tfile.close()
-                self.temp_files.append(tfile)
-                result = tfile.name
-            except:
-                self.info("Retrying download...")
-                count += 1
-        return result
-
-    def image_url_processor(self, baseurl, url):
-        result = url
-        if url.startswith('/'):
-            result = self.PREFIX + url
-        return result
+        self.browser.open(self.LOGOUT)
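
The rewritten login flow depends on the live markup of the usuarios.ambito.com page (the td_login form and its login_user/login_pass fields), so it is worth a smoke test against real credentials before merging. A minimal check using calibre's standard recipe-testing invocation, where USER and PASS are placeholders:

    ebook-convert ambito_financiero.recipe .epub --test -vv --username USER --password PASS

--test downloads only a couple of articles per feed, and -vv makes the log verbose enough to spot a failed form submission or a missing td_login form.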