mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 07:48:55 -04:00 
			
		
		
		
	Fix Montreal Gazette
This commit is contained in:
		
							parent
							
								
									370bc1992c
								
							
						
					
					
						commit
						ca020832e5
					
				| @ -1,5 +1,4 @@ | |||||||
| #!/usr/bin/env  python | #!/usr/bin/env  python | ||||||
| # -*- coding: utf-8 -*- |  | ||||||
| 
 | 
 | ||||||
| __license__   = 'GPL v3' | __license__   = 'GPL v3' | ||||||
| 
 | 
 | ||||||
| @ -7,77 +6,21 @@ __license__   = 'GPL v3' | |||||||
| www.canada.com | www.canada.com | ||||||
| ''' | ''' | ||||||
| 
 | 
 | ||||||
| import re | from calibre.web.feeds.recipes import BasicNewsRecipe | ||||||
| from calibre.web.feeds.news import BasicNewsRecipe |  | ||||||
| from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class CanWestPaper(BasicNewsRecipe): | class CanWestPaper(BasicNewsRecipe): | ||||||
| 
 | 
 | ||||||
|     # un-comment the following four lines for the Victoria Times Colonist |     # un-comment the following three lines for the Montreal Gazette | ||||||
| ##    title = u'Victoria Times Colonist' |  | ||||||
| ##    url_prefix = 'http://www.timescolonist.com' |  | ||||||
| ##    description = u'News from Victoria, BC' |  | ||||||
| ##    fp_tag = 'CAN_TC' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Vancouver Province |  | ||||||
| ##    title = u'Vancouver Province' |  | ||||||
| ##    url_prefix = 'http://www.theprovince.com' |  | ||||||
| ##    description = u'News from Vancouver, BC' |  | ||||||
| ##    fp_tag = 'CAN_VP' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Vancouver Sun |  | ||||||
| ##    title = u'Vancouver Sun' |  | ||||||
| ##    url_prefix = 'http://www.vancouversun.com' |  | ||||||
| ##    description = u'News from Vancouver, BC' |  | ||||||
| ##    fp_tag = 'CAN_VS' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Edmonton Journal |  | ||||||
| ##    title = u'Edmonton Journal' |  | ||||||
| ##    url_prefix = 'http://www.edmontonjournal.com' |  | ||||||
| ##    description = u'News from Edmonton, AB' |  | ||||||
| ##    fp_tag = 'CAN_EJ' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Calgary Herald |  | ||||||
| ##    title = u'Calgary Herald' |  | ||||||
| ##    url_prefix = 'http://www.calgaryherald.com' |  | ||||||
| ##    description = u'News from Calgary, AB' |  | ||||||
| ##    fp_tag = 'CAN_CH' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Regina Leader-Post |  | ||||||
| ##    title = u'Regina Leader-Post' |  | ||||||
| ##    url_prefix = 'http://www.leaderpost.com' |  | ||||||
| ##    description = u'News from Regina, SK' |  | ||||||
| ##    fp_tag = '' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Saskatoon Star-Phoenix |  | ||||||
| ##    title = u'Saskatoon Star-Phoenix' |  | ||||||
| ##    url_prefix = 'http://www.thestarphoenix.com' |  | ||||||
| ##    description = u'News from Saskatoon, SK' |  | ||||||
| ##    fp_tag = '' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Windsor Star |  | ||||||
| ##    title = u'Windsor Star' |  | ||||||
| ##    url_prefix = 'http://www.windsorstar.com' |  | ||||||
| ##    description = u'News from Windsor, ON' |  | ||||||
| ##    fp_tag = 'CAN_' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Ottawa Citizen |  | ||||||
| ##    title = u'Ottawa Citizen' |  | ||||||
| ##    url_prefix = 'http://www.ottawacitizen.com' |  | ||||||
| ##    description = u'News from Ottawa, ON' |  | ||||||
| ##    fp_tag = 'CAN_OC' |  | ||||||
| 
 |  | ||||||
|     # un-comment the following four lines for the Montreal Gazette |  | ||||||
|     title = u'Montreal Gazette' |     title = u'Montreal Gazette' | ||||||
|     url_prefix = 'http://www.montrealgazette.com' |  | ||||||
|     description = u'News from Montreal, QC' |     description = u'News from Montreal, QC' | ||||||
|     fp_tag = 'CAN_MG' |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     language = 'en_CA' |     language = 'en_CA' | ||||||
|     __author__ = 'Nick Redding' |     __author__ = 'Nick Redding' | ||||||
|     no_stylesheets = True |     no_stylesheets = True | ||||||
|  |     auto_cleanup = True | ||||||
|  |     auto_cleanup_keep = '//*[@id="imageBox"]' | ||||||
|     timefmt = ' [%b %d]' |     timefmt = ' [%b %d]' | ||||||
|     extra_css = ''' |     extra_css = ''' | ||||||
|                 .timestamp {  font-size:xx-small; display: block; } |                 .timestamp {  font-size:xx-small; display: block; } | ||||||
| @ -87,135 +30,19 @@ class CanWestPaper(BasicNewsRecipe): | |||||||
|                 .byline { font-size:xx-small; } |                 .byline { font-size:xx-small; } | ||||||
|                 #photocaption { font-size: small; font-style: italic } |                 #photocaption { font-size: small; font-style: italic } | ||||||
|                 #photocredit { font-size: xx-small; }''' |                 #photocredit { font-size: xx-small; }''' | ||||||
|     keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] |  | ||||||
|     remove_tags = [{'class':'comments'}, |  | ||||||
|                    dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), |  | ||||||
|                    dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), |  | ||||||
|                    dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), |  | ||||||
|                    dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), |  | ||||||
|                    dict(name='div', attrs={'class':'rule_grey_solid'}), |  | ||||||
|                    dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     def get_cover_url(self): |  | ||||||
|         from datetime import timedelta, date |  | ||||||
|         if self.fp_tag=='': |  | ||||||
|             return None |  | ||||||
|         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' |  | ||||||
|         br = BasicNewsRecipe.get_browser() |  | ||||||
|         daysback=1 |  | ||||||
|         try: |  | ||||||
|             br.open(cover) |  | ||||||
|         except: |  | ||||||
|             while daysback<7: |  | ||||||
|                 cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' |  | ||||||
|                 br = BasicNewsRecipe.get_browser() |  | ||||||
|                 try: |  | ||||||
|                     br.open(cover) |  | ||||||
|                 except: |  | ||||||
|                     daysback = daysback+1 |  | ||||||
|                     continue |  | ||||||
|                 break |  | ||||||
|         if daysback==7: |  | ||||||
|             self.log("\nCover unavailable") |  | ||||||
|             cover = None |  | ||||||
|         return cover |  | ||||||
| 
 |  | ||||||
|     def fixChars(self,string): |  | ||||||
|         # Replace lsquo (\x91) |  | ||||||
|         fixed = re.sub("\x91","‘",string) |  | ||||||
|         # Replace rsquo (\x92) |  | ||||||
|         fixed = re.sub("\x92","’",fixed) |  | ||||||
|         # Replace ldquo (\x93) |  | ||||||
|         fixed = re.sub("\x93","“",fixed) |  | ||||||
|         # Replace rdquo (\x94) |  | ||||||
|         fixed = re.sub("\x94","”",fixed) |  | ||||||
|         # Replace ndash (\x96) |  | ||||||
|         fixed = re.sub("\x96","–",fixed) |  | ||||||
|         # Replace mdash (\x97) |  | ||||||
|         fixed = re.sub("\x97","—",fixed) |  | ||||||
|         fixed = re.sub("’","’",fixed) |  | ||||||
|         return fixed |  | ||||||
| 
 |  | ||||||
|     def massageNCXText(self, description): |  | ||||||
|         # Kindle TOC descriptions won't render certain characters |  | ||||||
|         if description: |  | ||||||
|             massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) |  | ||||||
|             # Replace '&' with '&' |  | ||||||
|             massaged = re.sub("&","&", massaged) |  | ||||||
|             return self.fixChars(massaged) |  | ||||||
|         else: |  | ||||||
|             return description |  | ||||||
| 
 |  | ||||||
|     def populate_article_metadata(self, article, soup, first): |  | ||||||
|         if first: |  | ||||||
|             picdiv = soup.find('body').find('img') |  | ||||||
|             if picdiv is not None: |  | ||||||
|                 self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) |  | ||||||
|         xtitle = article.text_summary.strip() |  | ||||||
|         if len(xtitle) == 0: |  | ||||||
|             desc = soup.find('meta',attrs={'property':'og:description'}) |  | ||||||
|             if desc is not None: |  | ||||||
|                 article.summary = article.text_summary = desc['content'] |  | ||||||
| 
 |  | ||||||
|     def strip_anchors(self,soup): |  | ||||||
|         paras = soup.findAll(True) |  | ||||||
|         for para in paras: |  | ||||||
|             aTags = para.findAll('a') |  | ||||||
|             for a in aTags: |  | ||||||
|                 if a.img is None: |  | ||||||
|                     a.replaceWith(a.renderContents().decode('cp1252','replace')) |  | ||||||
|         return soup |  | ||||||
| 
 |  | ||||||
|     def preprocess_html(self, soup): |  | ||||||
|         return self.strip_anchors(soup) |  | ||||||
|      |      | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def parse_index(self): |     feeds          = [ | ||||||
|         soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') | ('News',  | ||||||
|  |  'http://rss.canada.com/get/?F297'), | ||||||
|  |  ('Sports',  | ||||||
|  |  'http://rss.canada.com/get/?F299'), | ||||||
|  |  ('Entertainment',  | ||||||
|  |  'http://rss.canada.com/get/?F7366'), | ||||||
|  |  ('Business',  | ||||||
|  |  'http://rss.canada.com/get/?F6939'), | ||||||
|  | ] | ||||||
| 
 | 
 | ||||||
|         articles = {} |  | ||||||
|         key = 'News' |  | ||||||
|         ans = ['News'] |  | ||||||
| 
 | 
 | ||||||
|         # Find each instance of class="sectiontitle", class="featurecontent" |  | ||||||
|         for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): |  | ||||||
|                 #self.log(" div class = %s" % divtag['class']) |  | ||||||
|                 if divtag['class'].startswith('section_title'): |  | ||||||
|                     # div contains section title |  | ||||||
|                     if not divtag.h3: |  | ||||||
|                         continue |  | ||||||
|                     key = self.tag_to_string(divtag.h3,False) |  | ||||||
|                     ans.append(key) |  | ||||||
|                     self.log("Section name %s" % key) |  | ||||||
|                     continue |  | ||||||
|                 # div contains article data |  | ||||||
|                 h1tag = divtag.find('h1') |  | ||||||
|                 if not h1tag: |  | ||||||
|                     continue |  | ||||||
|                 atag = h1tag.find('a',href=True) |  | ||||||
|                 if not atag: |  | ||||||
|                     continue |  | ||||||
|                 url = self.url_prefix+'/news/todays-paper/'+atag['href'] |  | ||||||
|                 #self.log("Section %s" % key) |  | ||||||
|                 #self.log("url %s" % url) |  | ||||||
|                 title = self.tag_to_string(atag,False) |  | ||||||
|                 #self.log("title %s" % title) |  | ||||||
|                 pubdate = '' |  | ||||||
|                 description = '' |  | ||||||
|                 ptag = divtag.find('p'); |  | ||||||
|                 if ptag: |  | ||||||
|                     description = self.tag_to_string(ptag,False) |  | ||||||
|                     #self.log("description %s" % description) |  | ||||||
|                 author = '' |  | ||||||
|                 autag = divtag.find('h4') |  | ||||||
|                 if autag: |  | ||||||
|                     author = self.tag_to_string(autag,False) |  | ||||||
|                     #self.log("author %s" % author) |  | ||||||
|                 if not articles.has_key(key): |  | ||||||
|                     articles[key] = [] |  | ||||||
|                 articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) |  | ||||||
|   |   | ||||||
|         ans = [(key, articles[key]) for key in ans if articles.has_key(key)] |  | ||||||
|         return ans |  | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user