mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-01 19:17:02 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			93 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			93 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| 
 | |
| __license__ = 'GPL v3'
 | |
| __author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
 | |
| '''
 | |
| http://freeway.com.uy
 | |
| '''
 | |
| 
 | |
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| 
 | |
| 
 | |
class General(BasicNewsRecipe):
    """Calibre recipe that downloads the Freeway magazine (freeway.com.uy).

    ``parse_index`` scrapes the magazine index page for article links,
    ``preprocess_html`` flattens table markup in each fetched page, and
    ``get_cover_url`` supplies a fixed cover image.
    """

    # Recipe metadata.
    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es_UY'
    timefmt = '[%a, %d %b, %Y]'

    # Download and clean-up options; their exact semantics are defined by
    # BasicNewsRecipe (see the calibre recipe API documentation).
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100

    # Tag selectors handed to BasicNewsRecipe for content filtering.
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class': 'titulo_art_ppal'}),
        dict(name='img', attrs={'class': 'recuadro'}),
        dict(name='td', attrs={'class': 'txt_art_ppal'}),
    ]
    remove_tags = [
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
                h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
                h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
                h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
                img {float:left; clear:both; margin:10px}
                p {font-family:Arial,Helvetica,sans-serif;}
                '''

    def parse_index(self):
        """Return the feeds of this recipe.

        Returns:
            A list of ``(section title, articles)`` tuples, where
            ``articles`` is the list produced by ``art_parse_section``.
            Sections for which no article was found are left out.
        """
        sections = [('Articulos', 'http://freeway.com.uy/revista/')]
        feeds = []
        for title, url in sections:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def art_parse_section(self, url):
        """Scrape one index page and collect the articles it links to.

        Args:
            url: Address of the index page; also used as the base against
                which relative article links are resolved.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (``date`` is always empty).
            The list is empty when the expected page structure is missing.
        """
        # Local import so the block does not depend on a file-level import.
        from urllib.parse import urljoin

        soup = self.index_to_soup(url)
        anchor = soup.find(attrs={'id': 'tbl_1'})
        if anchor is None:
            # The page layout changed or the download returned something
            # unexpected; report it instead of failing on ``None``.
            self.log('\t\tNo element with id "tbl_1" found in', url)
            return []

        current_articles = []
        for tag in anchor.findAllNext(attrs={'class': 'ancho_articulos'}):
            # The ``class`` attribute may be a plain string or a list of
            # class names depending on the parser; normalise to a list so
            # the stop marker is recognised either way.
            classes = tag.get('class') or []
            if isinstance(classes, str):
                classes = classes.split()
            if 'link-list-heading' in classes:
                break

            for td in tag.findAll('td'):
                a = td.find('a', attrs={'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                href = a.get('href')
                if not href or not title:
                    continue
                # Resolves site-relative ("/x") and page-relative ("x")
                # links alike; absolute links are returned unchanged.
                article_url = urljoin(url, href)

                p = td.find('p', attrs={'class': 'txt_articulos'})
                description = self.tag_to_string(p) if p is not None else ''

                self.log('\t\tFound article:', title)
                self.log('\t\t\t', article_url)
                self.log('\t\t\t', description)
                current_articles.append({
                    'title': title,
                    'url': article_url,
                    'description': description,
                    'date': '',
                })

        return current_articles

    def preprocess_html(self, soup):
        """Turn every table-related element into a plain ``div``.

        Layout attributes are stripped from the renamed elements as well.

        Args:
            soup: Parsed document of a fetched page.

        Returns:
            The same ``soup`` object, modified in place.
        """
        body = soup.body
        if body is None:
            # Nothing to flatten in a document without a <body>.
            return soup

        attribs = [
            'style', 'font', 'valign', 'colspan', 'width', 'height',
            'rowspan', 'summary', 'align', 'cellspacing', 'cellpadding',
            'frames', 'rules', 'border',
        ]
        table_tags = [
            'table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot',
            'tbody', 'colgroup', 'col',
        ]
        for item in body.findAll(name=table_tags):
            item.name = 'div'
            for attrib in attribs:
                # Assign first so the key is guaranteed to exist, which
                # makes the deletion safe whether or not it was present.
                item[attrib] = ''
                del item[attrib]
        return soup

    def get_cover_url(self):
        """Return the address of the cover image.

        NOTE(review): the address is hard-coded and, judging by its file
        name, points at the November 2010 issue; it does not follow newer
        issues.
        """
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'
 |