__license__ = 'GPL v3'
__copyright__ = '2011, Darko Miletic '
'''
ambito.com/diario
'''

import time

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Ambito_Financiero(BasicNewsRecipe):
    title = 'Ambito Financiero'
    __author__ = 'Darko Miletic'
    description = 'Informacion Libre las 24 horas'
    publisher = 'Editorial Nefir S.A.'
    category = 'news, politics, economy, Argentina'
    no_stylesheets = True
    encoding = 'cp1252'
    masthead_url = 'http://www.ambito.com/diario/img/logo_af.gif'
    publication_type = 'newspaper'
    needs_subscription = 'optional'
    use_embedded_content = False
    language = 'es_AR'
    PREFIX = 'http://www.ambito.com'
    INDEX = PREFIX + '/diario/index.asp'
    LOGIN = PREFIX + '/diario/login/entrada.asp'
    extra_css = """
        body{font-family: "Trebuchet MS",Verdana,sans-serif}
        .volanta{font-size: small}
        .t2_portada{font-size: xx-large; font-family: Georgia,serif; color: #026698}
    """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    keep_only_tags = [dict(name='div', attrs={'align': 'justify'})]
    remove_tags = [dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'table', 'img'])]
    remove_attributes = ['align']

    def get_browser(self):
        # Log in only when credentials were supplied; the subscription is optional.
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(name='frmlogin')
            br['USER_NAME'] = self.username
            br['USER_PASS'] = self.password
            br.submit()
        return br

    def print_version(self, url):
        return url.replace('/diario/noticia.asp?', '/noticias/imprimir.asp?')

    def preprocess_html(self, soup):
        # Strip inline styles and flatten anchors to their plain text.
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('a'):
            text = item.string
            if text is None:
                text = self.tag_to_string(item)
            item.replaceWith(text)
        return soup

    def parse_index(self):
        # Scrape the front page for the cover image and article links,
        # deduplicating URLs as they are collected.
        soup = self.index_to_soup(self.INDEX)
        cover_item = soup.find('img', attrs={'class': 'fotodespliegue'})
        if cover_item:
            self.cover_url = self.PREFIX + cover_item['src']
        articles = []
        checker = []
        for feed_link in soup.findAll('a', attrs={'class': ['t0_portada', 't2_portada', 'bajada']}):
            url = self.PREFIX + feed_link['href']
            title = self.tag_to_string(feed_link)
            date = strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())
            if url not in checker:
                checker.append(url)
                articles.append({
                    'title': title,
                    'date': date,
                    'url': url,
                    'description': u''
                })
        return [(self.title, articles)]
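
# A minimal way to exercise this recipe from the command line, assuming a
# standard calibre install (the recipe filename below is hypothetical):
#
#   ebook-convert ambito_financiero.recipe output.epub --test -vv
#
# For the optional subscription, ebook-convert also accepts --username and
# --password, which populate the self.username/self.password values that
# get_browser() checks before attempting the login form.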