#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'

'''
www.canada.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    """Recipe that builds an e-book from a paper's "today's paper" index page.

    Exactly one group of ``title`` / ``url_prefix`` / ``description`` /
    ``fp_tag`` assignments below should be active; it selects which paper
    is fetched. ``fp_tag`` is the file-name stem used by ``get_cover_url``;
    an empty ``fp_tag`` disables the cover lookup.
    """

    # un-comment the following four lines for the Victoria Times Colonist
    # title = u'Victoria Times Colonist'
    # url_prefix = 'http://www.timescolonist.com'
    # description = u'News from Victoria, BC'
    # fp_tag = 'CAN_TC'

    # un-comment the following four lines for the Vancouver Province
    # title = u'Vancouver Province'
    # url_prefix = 'http://www.theprovince.com'
    # description = u'News from Vancouver, BC'
    # fp_tag = 'CAN_VP'

    # un-comment the following four lines for the Vancouver Sun
    # title = u'Vancouver Sun'
    # url_prefix = 'http://www.vancouversun.com'
    # description = u'News from Vancouver, BC'
    # fp_tag = 'CAN_VS'

    # un-comment the following four lines for the Edmonton Journal
    # title = u'Edmonton Journal'
    # url_prefix = 'http://www.edmontonjournal.com'
    # description = u'News from Edmonton, AB'
    # fp_tag = 'CAN_EJ'

    # un-comment the following four lines for the Calgary Herald
    # title = u'Calgary Herald'
    # url_prefix = 'http://www.calgaryherald.com'
    # description = u'News from Calgary, AB'
    # fp_tag = 'CAN_CH'

    # un-comment the following four lines for the Regina Leader-Post
    # title = u'Regina Leader-Post'
    # url_prefix = 'http://www.leaderpost.com'
    # description = u'News from Regina, SK'
    # fp_tag = ''

    # un-comment the following four lines for the Saskatoon Star-Phoenix
    # title = u'Saskatoon Star-Phoenix'
    # url_prefix = 'http://www.thestarphoenix.com'
    # description = u'News from Saskatoon, SK'
    # fp_tag = ''

    # un-comment the following four lines for the Windsor Star
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'
    # NOTE(review): every other non-empty fp_tag in this file carries a
    # two-letter paper suffix (CAN_TC, CAN_VP, ...); 'CAN_' looks incomplete
    # -- confirm the intended value. As written the cover lookup is attempted
    # with this stem.
    fp_tag = 'CAN_'

    # un-comment the following four lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    # fp_tag = 'CAN_OC'

    # un-comment the following four lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'
    # fp_tag = 'CAN_MG'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture removed from what remains.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def get_cover_url(self):
        """Return the URL of a front-page image, or None if none can be opened.

        The URL is built from the day of the month and ``fp_tag``. Today's
        image is tried first, then each of the six previous days; the first
        URL that opens without raising is returned. If ``fp_tag`` is empty
        no lookup is attempted.
        """
        from datetime import timedelta, date
        if not self.fp_tag:
            return None

        br = BasicNewsRecipe.get_browser(self)
        today = date.today()
        for daysback in range(7):
            day_of_month = (today - timedelta(days=daysback)).day
            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                str(day_of_month) + '/lg/' + self.fp_tag + '.jpg'
            try:
                br.open(cover)
            except Exception:
                # Best effort: any fetch failure just means "try an older
                # day". Exception (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                continue
            return cover

        self.log("\nCover unavailable")
        return None

    def fixChars(self, string):
        """Return ``string`` with six C1 control characters replaced.

        Each of \\x91, \\x92, \\x93, \\x94, \\x96 and \\x97 is replaced by
        the typographic character listed next to it below.
        """
        replacements = (
            ("\x91", "‘"),  # lsquo
            ("\x92", "’"),  # rsquo
            ("\x93", "“"),  # ldquo
            ("\x94", "”"),  # rdquo
            ("\x96", "–"),  # ndash
            ("\x97", "—"),  # mdash
        )
        fixed = string
        for bad, good in replacements:
            fixed = fixed.replace(bad, good)
        return fixed

    def massageNCXText(self, description):
        """Return ``description`` unchanged."""
        return description

    def populate_article_metadata(self, article, soup, first):
        """Attach a TOC thumbnail and a fallback summary to ``article``.

        When ``first`` is true, the first ``<img>`` inside ``<body>`` (if
        any, and if it has a ``src``) is registered through
        ``add_toc_thumbnail`` after a ``links\\link<digits>\\`` fragment is
        stripped from its ``src``. Independently of ``first``, if
        ``article.text_summary`` is blank the ``og:description`` meta tag's
        ``content`` is used for both ``summary`` and ``text_summary``.
        """
        if first:
            body = soup.find('body')
            picdiv = body.find('img') if body is not None else None
            # Guard against a missing <body> or an <img> without src rather
            # than raising from inside metadata population.
            src = picdiv.get('src') if picdiv is not None else None
            if src:
                self.add_toc_thumbnail(article, re.sub(
                    r'links\\link\d+\\', '', src))

        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property': 'og:description'})
            content = desc.get('content') if desc is not None else None
            if content is not None:
                article.summary = article.text_summary = content

    def strip_anchors(self, soup):
        """Replace every ``<a>`` that contains no ``<img>`` with its contents.

        Anchors that wrap an image are left in place. ``soup`` is modified
        in place and also returned.
        """
        # A single document-wide search is enough; there is no need to
        # repeat the search from every tag in the tree.
        for a in soup.findAll('a'):
            if a.img is None:
                # NOTE(review): the rendered bytes are decoded as cp1252;
                # confirm this matches the encoding renderContents() emits.
                a.replaceWith(a.renderContents().decode(
                    'cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        """Return ``soup`` after ``strip_anchors`` has been applied to it."""
        return self.strip_anchors(soup)

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Returns a list of ``(section_name, articles)`` tuples in the order
        sections were first seen, containing only sections that have at
        least one article. Each article is a dict with ``title``, ``url``,
        ``date``, ``description``, ``author`` and ``content`` keys.
        Articles seen before any section heading are filed under 'News'.
        """
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            # self.log(" div class = %s" % divtag['class'])
            # ''.join copes with 'class' being either a string or a list.
            if ''.join(divtag['class']).startswith('section_title'):
                # div contains section title
                if divtag.h3 is None:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A repeated heading must not add the section a second time,
                # otherwise it would appear twice in the returned index.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if h1tag is None:
                continue
            atag = h1tag.find('a', href=True)
            if atag is None:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            # self.log("Section %s" % key)
            # self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            # self.log("title %s" % title)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag is not None:
                description = self.tag_to_string(ptag, False)
                # self.log("description %s" % description)

            author = ''
            autag = divtag.find('h4')
            if autag is not None:
                author = self.tag_to_string(autag, False)
                # self.log("author %s" % author)

            articles.setdefault(key, []).append(dict(
                title=title, url=url, date=pubdate,
                description=description, author=author, content=''))

        ans = [(keyl, articles[keyl]) for keyl in ans if keyl in articles]
        return ans