#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja '
'''
http://freeway.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the freeway.com.uy magazine.

    The article list is scraped from the magazine index page by
    ``parse_index`` / ``art_parse_section``; each downloaded page is then
    flattened by ``preprocess_html`` (table markup is turned into plain
    ``div`` elements).
    """

    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es_UY'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100

    # Only these parts of an article page are kept in the output.
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class': 'titulo_art_ppal'}),
        dict(name='img', attrs={'class': 'recuadro'}),
        dict(name='td', attrs={'class': 'txt_art_ppal'}),
    ]

    remove_tags = [
        dict(name=['object', 'link']),
    ]

    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    def parse_index(self):
        """Return the list of ``(section title, articles)`` feeds.

        Sections for which no article was found are left out.
        """
        feeds = []
        for title, url in [('Articulos', 'http://freeway.com.uy/revista/')]:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def art_parse_section(self, url):
        """Scrape the index page at ``url`` and return its articles.

        Each article is a dict with the keys ``title``, ``url``,
        ``description`` and ``date`` (always an empty string). An empty
        list is returned when the expected ``tbl_1`` container is missing
        from the page.
        """
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'id': 'tbl_1'})
        current_articles = []

        # find() yields None when the page layout no longer contains the
        # container; bail out instead of failing with AttributeError.
        if div is None:
            self.log('\t\tArticle container tbl_1 not found in', url)
            return current_articles

        for tag in div.findAllNext(attrs={'class': 'ancho_articulos'}):
            # NOTE(review): tags are selected by class 'ancho_articulos', so
            # this stop condition looks like it can rarely (if ever) fire --
            # confirm before removing.
            if tag.get('class') == 'link-list-heading':
                break
            for td in tag.findAll('td'):
                a = td.find('a', attrs={'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                # Use a separate name so the ``url`` parameter is not
                # overwritten while looping.
                link = a.get('href', False)
                if not link or not title:
                    continue
                # Site-relative links are made absolute.
                if link.startswith('/'):
                    link = 'http://freeway.com.uy' + link
                p = td.find('p', attrs={'class': 'txt_articulos'})
                description = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', link)
                self.log('\t\t\t', description)
                current_articles.append({
                    'title': title,
                    'url': link,
                    'description': description,
                    'date': '',
                })
        return current_articles

    def preprocess_html(self, soup):
        """Turn table markup into ``div`` elements and drop layout attributes.

        The soup is modified in place and returned. A document without a
        ``<body>`` is returned untouched.
        """
        attribs = [
            'style', 'font', 'valign',
            'colspan', 'width', 'height',
            'rowspan', 'summary', 'align',
            'cellspacing', 'cellpadding',
            'frames', 'rules', 'border',
        ]
        body = soup.body
        if body is None:
            # Nothing to rewrite; avoid AttributeError on None.
            return soup
        table_tags = [
            'table', 'td', 'tr', 'th', 'caption',
            'thead', 'tfoot', 'tbody', 'colgroup', 'col',
        ]
        for item in body.findAll(name=table_tags):
            item.name = 'div'
            for attrib in attribs:
                # NOTE(review): has_key is the legacy BeautifulSoup 3
                # spelling; newer BeautifulSoup releases call this has_attr.
                # Confirm which parser the host calibre ships before changing.
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        """Return the URL of the cover image.

        NOTE: the cover is hard-coded to a single fixed image (its file name
        refers to November 2010). A dynamic lookup was sketched here earlier
        but was left disabled and pointed at a different site, so it has
        been dropped.
        """
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'