This commit is contained in:
Kovid Goyal 2021-06-02 07:56:54 +05:30
parent da38b6653d
commit 1f8d39caf2
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -3,9 +3,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2011 - 2018, Darko Miletic <darko.miletic at gmail.com>' __copyright__ = '2011 - 2021, Darko Miletic <darko.miletic at gmail.com>'
''' '''
http://www.ambito.com/diario/ https://www.ambito.com/contenidos/edicion-impresa.html
''' '''
import time import time
@ -17,7 +17,6 @@ import re
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ptempfile import PersistentTemporaryFile
class Ambito_Financiero(BasicNewsRecipe): class Ambito_Financiero(BasicNewsRecipe):
@ -34,19 +33,14 @@ class Ambito_Financiero(BasicNewsRecipe):
language = 'es_AR' language = 'es_AR'
fetch_retries = 10 fetch_retries = 10
delay = 1 delay = 1
session_id = None
timeout = 8 timeout = 8
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
articles_are_obfuscated = True INDEX = 'https://www.ambito.com/contenidos/edicion-impresa.html'
temp_files = [] LOGIN = 'https://usuarios.ambito.com/singup'
PREFIX = 'http://www.ambito.com' LOGOUT = 'https://usuarios.ambito.com/logout'
PREFIXDIARIO = PREFIX + '/diario'
INDEX = PREFIX + '/diario/index.asp'
LOGIN = PREFIX + '/login/login_cabezal.asp'
extra_css = """ extra_css = """
body{font-family: Roboto, sans-serif;} body{font-family: Roboto, "Helvetica Neue", Arial, sans-serif;}
.titulo-noticia{font-family: "Roboto Condensed", sans-serif;} .title{font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif}
.dia{font-family: "Roboto Condensed", sans-serif; font-size: small;}
""" """
conversion_options = { conversion_options = {
@ -57,47 +51,37 @@ class Ambito_Financiero(BasicNewsRecipe):
} }
keep_only_tags = [ keep_only_tags = [
dict(name='h6', attrs={'class': lambda x: x and 'bajada' in x.split()}), dict(name='div', attrs={'class': 'detail-header-wrapper'}),
dict(name='span', attrs={'class': lambda x: x and 'dia' in x.split()}), dict(attrs={'class': lambda x: x and 'detail-body' in x.split()}),
dict(attrs={'class': lambda x: x and 'titulo-noticia' in x.split()}), ]
dict(attrs={'class': lambda x: x and 'foto-perfil-columnista' in x.split()}), remove_tags = [
dict(attrs={'class': lambda x: x and 'despliegue-noticia' in x.split()}), dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link', 'img']),
dict(attrs={"class": lambda x: x and 'detail-gallery' in x.split()})
] ]
remove_tags = [dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link'])]
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX) br.open(self.INDEX)
br.open(self.LOGIN)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
postdata = urlencode({ br.select_form(name='td_login')
'txtUser': self.username, br['login_user'] = self.username
'txtPassword': self.password br['login_pass'] = self.password
}) br.submit()
response = br.open(
'http://www.ambito.com/diario/no-cache/login/x_login_cabezal.asp',
data=postdata,
timeout=self.timeout
)
sessiondata = response.read()
prog = re.compile(
r"^(?P<status>\d+?),(?P<session_id>.+?),(?P<username>.+?),.*?"
)
m = prog.match(sessiondata)
if m:
self.session_id = m.group('session_id')
# br.set_debug_http(True)
# br.set_debug_responses(True)
# br.set_debug_redirects(True)
return br return br
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.INDEX) soup = self.index_to_soup(self.INDEX)
articles = [] articles = []
checker = [] checker = []
rootitem = soup.find(attrs={'class': 'ei-dropdown'}) printed = soup.find('div', attrs={"class":"view-printed-edition"})
for feed_link in rootitem.findAll('a', href=True): if printed:
url = self.PREFIXDIARIO + feed_link['href'] dimg = printed.find('img', attrs={"class":"lightbox"})
title = self.tag_to_string(feed_link) if dimg:
self.cover_url = dimg['src']
for feed_link in soup.find_all('h2', attrs={"class": "title"}):
url = feed_link.a['href']
title = self.tag_to_string(feed_link.a)
date = strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) date = strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
if url not in checker: if url not in checker:
checker.append(url) checker.append(url)
@ -109,56 +93,5 @@ class Ambito_Financiero(BasicNewsRecipe):
}) })
return [(self.title, articles)] return [(self.title, articles)]
def preprocess_raw_html(self, raw_html, url):
    """Splice the session-gated article body into the downloaded page.

    Without a ``session_id`` the page is returned untouched. Otherwise the
    page is parsed and, if it contains an element with id
    ``cuerpo_noticia``, the article id is taken from the URL (the text
    between the last ``_`` and the trailing ``.html``), posted together
    with the session id to the body endpoint, and the ``contenido_data``
    div of the response is appended to that element.

    :param raw_html: markup of the article page as downloaded.
    :param url: URL the page was fetched from.
    :return: ``raw_html`` unchanged when there is no session, otherwise
        the re-serialised page as a text string.
    """
    if not self.session_id:
        return raw_html
    soup = BeautifulSoup(raw_html)
    placeholder = soup.find(id="cuerpo_noticia")
    if placeholder:
        # Only contact the body endpoint when there is somewhere to put
        # the result; the request is pointless for pages without the
        # placeholder element.
        stem = url.rpartition('.html')[0]
        artid = stem.rpartition('_')[2]
        postdata = urlencode({'id': artid, 'id_session': self.session_id})
        response = self.browser.open(
            'http://data.ambito.com/diario/cuerpo_noticia.asp',
            data=postdata,
            timeout=self.timeout
        )
        body = BeautifulSoup(response.read()).find('div', id="contenido_data")
        if body:
            placeholder.append(body)
    # type(u'') is the text type on both Python 2 and Python 3
    return type(u'')(soup)
def cleanup(self): def cleanup(self):
if self.session_id is not None: self.browser.open(self.LOGOUT)
postdata = urlencode({'session_id': self.session_id})
self.browser.open(
'http://www.ambito.com/diario/no-cache/login/x_logout.asp', data=postdata, timeout=self.timeout
)
self.session_id = None
def get_obfuscated_article(self, url):
    """Download *url* into a persistent temporary file.

    The download is attempted up to ``self.fetch_retries`` times. Once a
    download succeeds the page is written to a temporary file whose
    suffix is built from the article id (the part of the last URL path
    segment before the first ``-``); the file object is remembered in
    ``self.temp_files``.

    :param url: address of the article page.
    :return: path of the temporary file, or ``None`` when every download
        attempt failed or the page could not be stored.
    """
    for _attempt in range(self.fetch_retries):
        try:
            html = self.browser.open(url, timeout=self.timeout).read()
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to abort the run.
            self.info("Retrying download...")
            continue
        artid = url.rpartition('/')[2].partition('-')[0]
        try:
            tfile = PersistentTemporaryFile('_' + artid + '.html')
            try:
                tfile.write(html)
            finally:
                # Close the handle even when the write fails
                tfile.close()
        except Exception:
            # A storage failure is not retried; the result is None
            self.info("Could not store downloaded article")
            return None
        self.temp_files.append(tfile)
        return tfile.name
    return None
def image_url_processor(self, baseurl, url):
    """Turn a site-relative image URL into an absolute one.

    :param baseurl: URL of the page containing the image (unused).
    :param url: image address as found in the page.
    :return: ``url`` prefixed with ``self.PREFIX`` when it starts with a
        single ``/``; prefixed with the scheme of ``self.PREFIX`` when it
        is protocol-relative (``//host/path``); otherwise unchanged.
    """
    if url.startswith('//'):
        # Protocol-relative reference: it already names a host, so only
        # the scheme is missing. Gluing PREFIX in front would yield
        # 'http://www.ambito.com//host/path'.
        return self.PREFIX.partition('//')[0] + url
    if url.startswith('/'):
        return self.PREFIX + url
    return url