Vancouver Province and Windsor Star by Nick Redding

2025-08-30 23:00:21 -04:00 · 2012-02-07 11:53:16 +05:30 · 2012-02-07 11:53:16 +05:30 · bf6c5695b3
commit bf6c5695b3
parent b24480e749
2 changed files with 466 additions and 0 deletions
--- a/recipes/vancouver_province.recipe
+++ b/recipes/vancouver_province.recipe
@@ -0,0 +1,233 @@
+#!/usr/bin/env  python
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+    title = u'Vancouver Province'
+    url_prefix = 'http://www.theprovince.com'
+    description = u'News from Vancouver, BC'
+    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'
+
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp {  font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large;  font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+                   dict(name='div', attrs={'class':'rule_grey_solid'}),
+                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+    def get_cover_url(self):
+        from datetime import timedelta, datetime, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&#38;'
+            massaged = re.sub("&","&#38;", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if len(xtitle) == 0:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+                #self.log(" div class = %s" % divtag['class'])
+                if divtag['class'].startswith('section_title'):
+                    # div contains section title
+                    if not divtag.h3:
+                        continue
+                    key = self.tag_to_string(divtag.h3,False)
+                    ans.append(key)
+                    self.log("Section name %s" % key)
+                    continue
+                # div contains article data
+                h1tag = divtag.find('h1')
+                if not h1tag:
+                    continue
+                atag = h1tag.find('a',href=True)
+                if not atag:
+                    continue
+                url = self.url_prefix+'/news/todays-paper/'+atag['href']
+                #self.log("Section %s" % key)
+                #self.log("url %s" % url)
+                title = self.tag_to_string(atag,False)
+                #self.log("title %s" % title)
+                pubdate = ''
+                description = ''
+                ptag = divtag.find('p');
+                if ptag:
+                    description = self.tag_to_string(ptag,False)
+                    #self.log("description %s" % description)
+                author = ''
+                autag = divtag.find('h4')
+                if autag:
+                    author = self.tag_to_string(autag,False)
+                    #self.log("author %s" % author)
+                if not articles.has_key(key):
+                    articles[key] = []
+                articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans
--- a/recipes/windsor_star.recipe
+++ b/recipes/windsor_star.recipe
@@ -0,0 +1,233 @@
+#!/usr/bin/env  python
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+
+'''
+www.canada.com
+'''
+
+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+
+
+class CanWestPaper(BasicNewsRecipe):
+
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+    title = u'Windsor Star'
+    url_prefix = 'http://www.windsorstar.com'
+    description = u'News from Windsor, ON'
+    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'
+
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'
+
+
+    language = 'en_CA'
+    __author__ = 'Nick Redding'
+    no_stylesheets = True
+    timefmt = ' [%b %d]'
+    extra_css = '''
+                .timestamp {  font-size:xx-small; display: block; }
+                #storyheader { font-size: medium; }
+                #storyheader h1 { font-size: x-large; }
+                #storyheader h2 { font-size: large;  font-style: italic; }
+                .byline { font-size:xx-small; }
+                #photocaption { font-size: small; font-style: italic }
+                #photocredit { font-size: xx-small; }'''
+    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
+    remove_tags = [{'class':'comments'},
+                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
+                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
+                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
+                   dict(name='div', attrs={'class':'rule_grey_solid'}),
+                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
+
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        return soup
+
+    def get_cover_url(self):
+        from datetime import timedelta, datetime, date
+        if self.fp_tag=='':
+            return None
+        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
+        br = BasicNewsRecipe.get_browser()
+        daysback=1
+        try:
+            br.open(cover)
+        except:
+            while daysback<7:
+                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
+                br = BasicNewsRecipe.get_browser()
+                try:
+                    br.open(cover)
+                except:
+                    daysback = daysback+1
+                    continue
+                break
+        if daysback==7:
+            self.log("\nCover unavailable")
+            cover = None
+        return cover
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","‘",string)
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","’",fixed)
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","“",fixed)
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","”",fixed)
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","–",fixed)
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","—",fixed)
+        fixed = re.sub("&#x2019;","’",fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&#38;'
+            massaged = re.sub("&","&#38;", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def populate_article_metadata(self, article, soup, first):
+        if first:
+            picdiv = soup.find('body').find('img')
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if len(xtitle) == 0:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        return self.strip_anchors(soup)
+
+
+
+    def parse_index(self):
+        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
+
+        articles = {}
+        key = 'News'
+        ans = ['News']
+
+        # Find each instance of class="section_title02", class="featurecontent"
+        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
+                #self.log(" div class = %s" % divtag['class'])
+                if divtag['class'].startswith('section_title'):
+                    # div contains section title
+                    if not divtag.h3:
+                        continue
+                    key = self.tag_to_string(divtag.h3,False)
+                    ans.append(key)
+                    self.log("Section name %s" % key)
+                    continue
+                # div contains article data
+                h1tag = divtag.find('h1')
+                if not h1tag:
+                    continue
+                atag = h1tag.find('a',href=True)
+                if not atag:
+                    continue
+                url = self.url_prefix+'/news/todays-paper/'+atag['href']
+                #self.log("Section %s" % key)
+                #self.log("url %s" % url)
+                title = self.tag_to_string(atag,False)
+                #self.log("title %s" % title)
+                pubdate = ''
+                description = ''
+                ptag = divtag.find('p');
+                if ptag:
+                    description = self.tag_to_string(ptag,False)
+                    #self.log("description %s" % description)
+                author = ''
+                autag = divtag.find('h4')
+                if autag:
+                    author = self.tag_to_string(autag,False)
+                    #self.log("author %s" % author)
+                if not articles.has_key(key):
+                    articles[key] = []
+                articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return ans