#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Reconstructed from a whitespace-collapsed git patch:
#   commit:  bf6c5695b3eb844a6fab519d248b03eb3af5ae9a
#   from:    Kovid Goyal, Tue, 7 Feb 2012 11:53:16 +0530
#   subject: "Vancouver Provice [sic] and Windsor Star by Nick Redding"
#
# The patch adds this recipe twice:
#   recipes/vancouver_province.recipe  (the configuration active below)
#   recipes/windsor_star.recipe        (identical, except that the
#                                       'Windsor Star' block of the paper
#                                       menu is the un-commented one)

__license__ = 'GPL v3'

'''
www.canada.com
'''

import re
import string

from calibre import strftime
# Both import paths for BasicNewsRecipe appeared in the original file; the
# later binding is the one in effect.
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, Tag


class CanWestPaper(BasicNewsRecipe):
    """Recipe that downloads the "today's paper" index of one newspaper site.

    The paper is selected by un-commenting exactly one four-line block
    (``title``, ``url_prefix``, ``description``, ``fp_tag``) in the menu
    below.  ``url_prefix`` is the site root used by ``parse_index``;
    ``fp_tag`` is the file name stem used by ``get_cover_url`` (an empty
    ``fp_tag`` disables the cover lookup).
    """

    # un-comment the following four lines for the Victoria Times Colonist
##    title = u'Victoria Times Colonist'
##    url_prefix = 'http://www.timescolonist.com'
##    description = u'News from Victoria, BC'
##    fp_tag = 'CAN_TC'

    # un-comment the following four lines for the Vancouver Province
    title = u'Vancouver Province'
    url_prefix = 'http://www.theprovince.com'
    description = u'News from Vancouver, BC'
    fp_tag = 'CAN_VP'

    # un-comment the following four lines for the Vancouver Sun
##    title = u'Vancouver Sun'
##    url_prefix = 'http://www.vancouversun.com'
##    description = u'News from Vancouver, BC'
##    fp_tag = 'CAN_VS'

    # un-comment the following four lines for the Edmonton Journal
##    title = u'Edmonton Journal'
##    url_prefix = 'http://www.edmontonjournal.com'
##    description = u'News from Edmonton, AB'
##    fp_tag = 'CAN_EJ'

    # un-comment the following four lines for the Calgary Herald
##    title = u'Calgary Herald'
##    url_prefix = 'http://www.calgaryherald.com'
##    description = u'News from Calgary, AB'
##    fp_tag = 'CAN_CH'

    # un-comment the following four lines for the Regina Leader-Post
##    title = u'Regina Leader-Post'
##    url_prefix = 'http://www.leaderpost.com'
##    description = u'News from Regina, SK'
##    fp_tag = ''

    # un-comment the following four lines for the Saskatoon Star-Phoenix
##    title = u'Saskatoon Star-Phoenix'
##    url_prefix = 'http://www.thestarphoenix.com'
##    description = u'News from Saskatoon, SK'
##    fp_tag = ''

    # un-comment the following four lines for the Windsor Star
    # (this is the block that is active in recipes/windsor_star.recipe)
    # NOTE(review): 'CAN_' looks like an incomplete tag -- confirm, or use ''
    # to skip the cover lookup as the Regina/Saskatoon entries do.
##    title = u'Windsor Star'
##    url_prefix = 'http://www.windsorstar.com'
##    description = u'News from Windsor, ON'
##    fp_tag = 'CAN_'

    # un-comment the following four lines for the Ottawa Citizen
##    title = u'Ottawa Citizen'
##    url_prefix = 'http://www.ottawacitizen.com'
##    description = u'News from Ottawa, ON'
##    fp_tag = 'CAN_OC'

    # un-comment the following four lines for the Montreal Gazette
##    title = u'Montreal Gazette'
##    url_prefix = 'http://www.montrealgazette.com'
##    description = u'News from Montreal, QC'
##    fp_tag = 'CAN_MG'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    # (bad, good) text replacements applied in order by fixChars().
    _char_fixes = (
        (u'\x91', u'\u2018'),       # lsquo
        (u'\x92', u'\u2019'),       # rsquo
        (u'\x93', u'\u201c'),       # ldquo
        (u'\x94', u'\u201d'),       # rdquo
        (u'\x96', u'\u2013'),       # ndash
        (u'\x97', u'\u2014'),       # mdash
        # The extracted source had the same character on both sides of this
        # substitution (a no-op); the numeric entity is restored here.
        # NOTE(review): confirm against the upstream recipe.
        (u'&#x2019;', u'\u2019'),
    )

    def preprocess_html(self, soup):
        """Clean one article's soup before conversion and return it.

        The original class defined ``preprocess_html`` twice, so the second
        definition replaced the first and the empty-id clean-up never ran.
        Both steps are performed here.
        """
        # delete empty id attributes--they screw up the TOC for unknown reasons
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return self.strip_anchors(soup)

    def get_cover_url(self):
        """Return the URL of the newest front-page image that can be opened.

        Tries today's day-of-month first, then up to six earlier days.
        Returns None when ``fp_tag`` is empty or none of the seven URLs can
        be opened (the latter is logged as "Cover unavailable").
        """
        from datetime import date, timedelta
        if not self.fp_tag:
            return None
        br = self.get_browser()
        today = date.today()
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
            try:
                br.open(cover)
            except Exception:
                # Best effort: any failure to open means "try an older day".
                continue
            return cover
        self.log("\nCover unavailable")
        return None

    def fixChars(self, string):
        """Return *string* with the ``_char_fixes`` substitutions applied.

        The C1 code points \\x91-\\x97 are replaced by the typographic
        quotes and dashes named in ``_char_fixes``.  *string* is expected to
        be text (unicode), which is what ``massageNCXText`` passes in.
        """
        fixed = string
        for bad, good in self._char_fixes:
            fixed = fixed.replace(bad, good)
        return fixed

    def massageNCXText(self, description):
        """Return *description* prepared for use in the TOC.

        Falsy input (None or '') is returned unchanged.
        """
        # Kindle TOC descriptions won't render certain characters
        if not description:
            return description
        massaged = unicode(BeautifulStoneSoup(
            description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&' with its numeric entity.  The extracted source had a
        # bare '&' as the replacement (a no-op).
        # NOTE(review): confirm '&#38;' against the upstream recipe.
        massaged = re.sub("&", "&#38;", massaged)
        return self.fixChars(massaged)

    def populate_article_metadata(self, article, soup, first):
        """Fill in the TOC thumbnail and a missing summary for *article*.

        When *first* is true, the first <img> inside <body> (if any) is
        registered as the TOC thumbnail.  When the article has no summary
        text, the page's ``og:description`` meta content is used instead.
        """
        if first:
            body = soup.find('body')
            picdiv = body.find('img') if body is not None else None
            src = picdiv.get('src') if picdiv is not None else None
            if src:
                self.add_toc_thumbnail(
                    article, re.sub(r'links\\link\d+\\', '', src))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property': 'og:description'})
            content = desc.get('content') if desc is not None else None
            if content is not None:
                article.summary = article.text_summary = content

    def strip_anchors(self, soup):
        """Replace every <a> that contains no <img> by its rendered contents.

        Anchors wrapping an image are left in place.  Returns *soup*.
        """
        # One pass over the anchors; the original searched for anchors again
        # under every tag of the document.
        for a in soup.findAll('a'):
            if a.img is None:
                # NOTE(review): the contents are decoded as cp1252, as in the
                # original; confirm this matches renderContents()'s encoding.
                a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

    def parse_index(self):
        """Build the section/article list from the "today's paper" index.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples in
        the order the sections first appear.  Articles found before any
        section title are filed under 'News'.  Sections without articles are
        omitted.
        """
        index_base = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(index_base + 'index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll(
                'div', attrs={'class': ["section_title02", "featurecontent"]}):
            css_class = divtag.get('class') or ''
            if isinstance(css_class, (list, tuple)):
                # some parsers expose a multi-valued class as a list
                css_class = ' '.join(css_class)
            if css_class.startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                if key not in ans:
                    # a repeated title must not list the section twice
                    ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = index_base + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            articles.setdefault(key, []).append(dict(
                title=title, url=url, date=pubdate,
                description=description, author=author, content=''))

        return [(section, articles[section])
                for section in ans if section in articles]