mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-31 10:37:00 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			82 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			82 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from calibre.web.feeds.news import BasicNewsRecipe
 | |
| 
 | |
| 
 | |
class TheGrid(BasicNewsRecipe):
    """Recipe that downloads the latest issue of The Grid (Toronto).

    The home page (``INDEX``) is used to locate both the cover image and the
    link to the latest issue; the issue page is then split into one feed per
    section.
    """

    #: The title to use for the ebook
    title = u'The Grid'

    #: A couple of lines that describe the content this recipe downloads.
    #: This will be used primarily in a GUI that presents a list of recipes.
    description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                   'accessible voice for Toronto.')

    #: The author of this recipe
    __author__ = u'Yusuf W'

    #: The language that the news is in. Must be an ISO-639 code either
    #: two or three characters long
    language = 'en_CA'

    #: Publication type
    #: Set to newspaper, magazine or blog
    publication_type = 'newspaper'

    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebooks formats
    #: If True stylesheets are not downloaded and processed
    no_stylesheets = True

    #: Everything before/after the main content div is dropped, then the
    #: listed tags are removed from what remains of the downloaded HTML.
    remove_tags_before = dict(name='div', id='content')
    remove_tags_after = dict(name='div', id='content')
    remove_tags = [
        dict(name='div', attrs={'class': 'right-content pull-right'}),
        dict(name='div', attrs={'class': 'right-content'}),
        dict(name='div', attrs={'class': 'ftr-line'}),
        dict(name='div', attrs={'class': 'pull-right'}),
        dict(name='div', id='comments'),
        dict(name='div', id='tags')
    ]

    #: Keep only the specified tags and their children.
    # keep_only_tags        = [dict(name='div', id='content')]

    cover_margins = (0, 0, '#ffffff')

    #: Site home page. Treated as a constant: methods must not reassign it,
    #: because both get_cover_url() and parse_index() start from this page.
    INDEX = 'http://www.thegridto.com'

    def get_cover_url(self):
        """Return the URL of the latest issue's cover image.

        The image is taken from the first ``<img>`` inside the element with
        class ``article-block latest-issue`` on the home page.

        Returns ``None`` (no cover) when that element, its image, or the
        image's ``src`` attribute cannot be found, instead of raising.
        """
        soup = self.index_to_soup(self.INDEX)
        block = soup.find(attrs={'class': 'article-block latest-issue'})
        # Compare against None explicitly rather than relying on the
        # truthiness of parser tag objects.
        if block is None:
            return None
        img = block.find('img')
        if img is None:
            return None
        return img.get('src')

    def parse_index(self):
        """Build the feed list for the latest issue.

        Returns a list of ``(section, articles)`` tuples, one per section of
        the issue page that could be found, where ``articles`` is a list of
        dicts with the keys ``title``, ``url``, ``description`` and ``date``.

        Raises ``ValueError`` when the link to the latest issue cannot be
        located on the home page.
        """
        # Get the latest issue: the third link of the footer block on the
        # home page is taken to be the issue link.
        home = self.index_to_soup(self.INDEX)
        footer = home.find('div', attrs={'class': 'full-content stuff-ftr'})
        links = footer.findAll('a') if footer is not None else []
        if len(links) < 3:
            raise ValueError(
                'Could not find the latest issue link on ' + self.INDEX)

        # Keep the issue URL local. Writing it back to self.INDEX would make
        # a second parse_index() call build a malformed URL and would change
        # which page get_cover_url() reads.
        issue_url = self.INDEX + links[2]['href']
        issue = self.index_to_soup(issue_url)

        feeds = []
        for section in ('city', 'life', 'culture'):
            section_class = 'left-content article-listing ' + section + ' pull-left'
            listing = issue.find(attrs={'class': section_class})
            if listing is None:
                # Section absent from this issue: skip it rather than crash.
                continue

            articles = [
                {'title': self.tag_to_string(link), 'url': link['href'],
                 'description': '', 'date': ''}
                for link in listing.findAll(attrs={'class': 'post-title'})
            ]
            feeds.append((section, articles))
        return feeds
 |