mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 07:48:55 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			107 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			107 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env  python
 | |
| 
 | |
| __license__   = 'GPL v3'
 | |
| 
 | |
| '''
 | |
| www.canada.com
 | |
| '''
 | |
| 
 | |
| from calibre.web.feeds.recipes import BasicNewsRecipe
 | |
| 
 | |
| 
 | |
class CanWestPaper(BasicNewsRecipe):
    """Recipe for the "today's paper" edition of a canada.com newspaper.

    The paper that gets downloaded is selected by the ``title``,
    ``url_prefix`` and ``description`` attributes below. Exactly one of the
    three-line groups should be active at a time.
    """

    # Active selection: the Windsor Star. To switch papers, comment out
    # these three lines and un-comment one of the groups below.
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Navigation, sharing tools and other page furniture stripped from
    # the kept content.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Strip empty ``id`` attributes from every ``<div>`` in *soup*.

        Empty ids break the generated table of contents (the cause is not
        known), so they are removed before further processing.

        Returns the same soup object, modified in place.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Walks the ``section_title02`` and ``featurecontent`` divs in
        document order. A section-title div switches the current section;
        a feature-content div contributes one article to the current
        section. Articles seen before any section title go under 'News'.

        Returns a list of ``(section_name, articles)`` tuples in order of
        first appearance, omitting sections that collected no articles.
        Each article is a dict with the keys ``title``, ``url``, ``date``,
        ``description``, ``author`` and ``content``.
        """
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or
        # class="featurecontent".
        for divtag in soup.findAll(
                'div', attrs={'class': ["section_title02", "featurecontent"]}):
            # Depending on the parser version the class attribute is either
            # a plain string or a list of class names; normalise to a list.
            css_classes = divtag['class']
            if not isinstance(css_classes, (list, tuple)):
                css_classes = css_classes.split()

            if any(c.startswith('section_title') for c in css_classes):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A heading may repeat on the page; list each section once
                # so it is not emitted twice in the result.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(dict(
                title=title, url=url, date=pubdate,
                description=description, author=author, content=''))

        return [(name, articles[name]) for name in ans if name in articles]
 |