Updated Postmedia publications

2026-05-30 10:35:20 -04:00 · 2012-02-07 11:44:14 +05:30
parent b51079b26a
commit d4d7d2e13f
8 changed files with 1070 additions and 152 deletions
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,45 +7,81 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Calgary Herald
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'
+    fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
    __author__ = 'Nick Redding'
-    encoding = 'latin1'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
@@ -72,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the most recent available front-page image.
+
+        Builds a newseum.org URL from ``self.fp_tag`` and a day of the month,
+        trying today first and then each of the previous six days.  Returns
+        the first URL the browser opens without raising, or None when
+        ``fp_tag`` is empty or all seven candidates fail (in which case
+        "Cover unavailable" is logged).
+        """
+        from datetime import timedelta, date
+        if self.fp_tag == '':
+            # No front-page tag configured for this paper: no cover.
+            return None
+        # Fix the reference date once so all candidates are consistent even
+        # if the loop runs across midnight.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
+                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
+            # Use a fresh browser for every attempt.
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover)
+            except Exception:
+                # This day's image could not be opened; step back one day.
+                # (Exception, not a bare except, so Ctrl-C still propagates.)
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return ``string`` with quote/dash codes swapped for real characters.
+
+        The codes 0x91-0x94, 0x96 and 0x97 and the literal text '&#x2019;'
+        are substituted one after another, in the order listed below.
+        """
+        substitutions = (
+            ("\x91", "‘"),      # lsquo
+            ("\x92", "’"),      # rsquo
+            ("\x93", "“"),      # ldquo
+            ("\x94", "”"),      # rdquo
+            ("\x96", "–"),      # ndash
+            ("\x97", "—"),      # mdash
+            ("&#x2019;", "’"),  # numeric entity spelled out in the text
+        )
+        result = string
+        for pattern, replacement in substitutions:
+            result = re.sub(pattern, replacement, result)
+        return result
+
+    def massageNCXText(self, description):
+        """Clean up an article description for use in the TOC.
+
+        A falsy ``description`` (None or empty) is returned unchanged.
+        Otherwise the text is parsed with BeautifulStoneSoup so that HTML
+        entities are converted, and the result is run through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among the module-level imports visible in
+        # this recipe, so import it here; without this the call below fails
+        # with a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): this substitution replaces '&' with '&' and therefore
+        # changes nothing; the replacement was presumably meant to be an
+        # entity such as '&#38;' -- confirm the intent before changing it.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to ``article``.
+
+        When ``first`` is true, the first <img> found inside <body> (if any)
+        is registered through ``add_toc_thumbnail``.  When the article's
+        ``text_summary`` is blank, the page's og:description meta tag (if
+        present) supplies both ``summary`` and ``text_summary``.
+        """
+        if first:
+            body = soup.find('body')
+            # A page without a <body> has no thumbnail candidate; skip it
+            # rather than failing with an AttributeError on None.
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                # Drop any 'links\linkN\' prefix from the image path.
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        if not article.text_summary.strip():
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace anchors that contain no <img> with their rendered contents.
+
+        Anchors with an image inside are left untouched.  ``soup`` is
+        modified in place and also returned.
+        """
+        # All tags in the document, collected before anything is replaced.
+        paras = soup.findAll(True)
+        for para in paras:
+            # NOTE(review): each tag's descendants are searched again, so the
+            # same anchors can be visited more than once; presumably a single
+            # soup.findAll('a') would be enough -- confirm before simplifying.
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    # Swap the anchor for its inner markup, decoded as cp1252
+                    # with undecodable bytes replaced.
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return ``soup`` after passing it through ``strip_anchors``."""
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -98,9 +209,7 @@ class CanWestPaper(BasicNewsRecipe):
                atag = h1tag.find('a',href=True)
                if not atag:
                    continue
-                url = atag['href']
-                if not url.startswith('http:'):
-                    url = self.url_prefix+'/news/todays-paper/'+atag['href']
+                url = self.url_prefix+'/news/todays-paper/'+atag['href']
                #self.log("Section %s" % key)
                #self.log("url %s" % url)
                title = self.tag_to_string(atag,False)
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,45 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Edmonton Journal
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
    title = u'Edmonton Journal'
    url_prefix = 'http://www.edmontonjournal.com'
    description = u'News from Edmonton, AB'
+    fp_tag = 'CAN_EJ'

-    # un-comment the following three lines for the Calgary Herald
-    #title = u'Calgary Herald'
-    #url_prefix = 'http://www.calgaryherald.com'
-    #description = u'News from Calgary, AB'
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -76,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the most recent available front-page image.
+
+        Builds a newseum.org URL from ``self.fp_tag`` and a day of the month,
+        trying today first and then each of the previous six days.  Returns
+        the first URL the browser opens without raising, or None when
+        ``fp_tag`` is empty or all seven candidates fail (in which case
+        "Cover unavailable" is logged).
+        """
+        from datetime import timedelta, date
+        if self.fp_tag == '':
+            # No front-page tag configured for this paper: no cover.
+            return None
+        # Fix the reference date once so all candidates are consistent even
+        # if the loop runs across midnight.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
+                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
+            # Use a fresh browser for every attempt.
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover)
+            except Exception:
+                # This day's image could not be opened; step back one day.
+                # (Exception, not a bare except, so Ctrl-C still propagates.)
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return ``string`` with quote/dash codes swapped for real characters.
+
+        The codes 0x91-0x94, 0x96 and 0x97 and the literal text '&#x2019;'
+        are substituted one after another, in the order listed below.
+        """
+        substitutions = (
+            ("\x91", "‘"),      # lsquo
+            ("\x92", "’"),      # rsquo
+            ("\x93", "“"),      # ldquo
+            ("\x94", "”"),      # rdquo
+            ("\x96", "–"),      # ndash
+            ("\x97", "—"),      # mdash
+            ("&#x2019;", "’"),  # numeric entity spelled out in the text
+        )
+        result = string
+        for pattern, replacement in substitutions:
+            result = re.sub(pattern, replacement, result)
+        return result
+
+    def massageNCXText(self, description):
+        """Clean up an article description for use in the TOC.
+
+        A falsy ``description`` (None or empty) is returned unchanged.
+        Otherwise the text is parsed with BeautifulStoneSoup so that HTML
+        entities are converted, and the result is run through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among the module-level imports visible in
+        # this recipe, so import it here; without this the call below fails
+        # with a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): this substitution replaces '&' with '&' and therefore
+        # changes nothing; the replacement was presumably meant to be an
+        # entity such as '&#38;' -- confirm the intent before changing it.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to ``article``.
+
+        When ``first`` is true, the first <img> found inside <body> (if any)
+        is registered through ``add_toc_thumbnail``.  When the article's
+        ``text_summary`` is blank, the page's og:description meta tag (if
+        present) supplies both ``summary`` and ``text_summary``.
+        """
+        if first:
+            body = soup.find('body')
+            # A page without a <body> has no thumbnail candidate; skip it
+            # rather than failing with an AttributeError on None.
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                # Drop any 'links\linkN\' prefix from the image path.
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        if not article.text_summary.strip():
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace anchors that contain no <img> with their rendered contents.
+
+        Anchors with an image inside are left untouched.  ``soup`` is
+        modified in place and also returned.
+        """
+        # All tags in the document, collected before anything is replaced.
+        paras = soup.findAll(True)
+        for para in paras:
+            # NOTE(review): each tag's descendants are searched again, so the
+            # same anchors can be visited more than once; presumably a single
+            # soup.findAll('a') would be enough -- confirm before simplifying.
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    # Swap the anchor for its inner markup, decoded as cp1252
+                    # with undecodable bytes replaced.
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return ``soup`` after passing it through ``strip_anchors``."""
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,15 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Montreal Gazette
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'
+
+    # un-comment the following four lines for the Montreal Gazette
    title = u'Montreal Gazette'
    url_prefix = 'http://www.montrealgazette.com'
    description = u'News from Montreal, QC'
+    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -46,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the most recent available front-page image.
+
+        Builds a newseum.org URL from ``self.fp_tag`` and a day of the month,
+        trying today first and then each of the previous six days.  Returns
+        the first URL the browser opens without raising, or None when
+        ``fp_tag`` is empty or all seven candidates fail (in which case
+        "Cover unavailable" is logged).
+        """
+        from datetime import timedelta, date
+        if self.fp_tag == '':
+            # No front-page tag configured for this paper: no cover.
+            return None
+        # Fix the reference date once so all candidates are consistent even
+        # if the loop runs across midnight.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
+                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
+            # Use a fresh browser for every attempt.
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover)
+            except Exception:
+                # This day's image could not be opened; step back one day.
+                # (Exception, not a bare except, so Ctrl-C still propagates.)
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return ``string`` with quote/dash codes swapped for real characters.
+
+        The codes 0x91-0x94, 0x96 and 0x97 and the literal text '&#x2019;'
+        are substituted one after another, in the order listed below.
+        """
+        substitutions = (
+            ("\x91", "‘"),      # lsquo
+            ("\x92", "’"),      # rsquo
+            ("\x93", "“"),      # ldquo
+            ("\x94", "”"),      # rdquo
+            ("\x96", "–"),      # ndash
+            ("\x97", "—"),      # mdash
+            ("&#x2019;", "’"),  # numeric entity spelled out in the text
+        )
+        result = string
+        for pattern, replacement in substitutions:
+            result = re.sub(pattern, replacement, result)
+        return result
+
+    def massageNCXText(self, description):
+        """Clean up an article description for use in the TOC.
+
+        A falsy ``description`` (None or empty) is returned unchanged.
+        Otherwise the text is parsed with BeautifulStoneSoup so that HTML
+        entities are converted, and the result is run through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among the module-level imports visible in
+        # this recipe, so import it here; without this the call below fails
+        # with a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): this substitution replaces '&' with '&' and therefore
+        # changes nothing; the replacement was presumably meant to be an
+        # entity such as '&#38;' -- confirm the intent before changing it.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to ``article``.
+
+        When ``first`` is true, the first <img> found inside <body> (if any)
+        is registered through ``add_toc_thumbnail``.  When the article's
+        ``text_summary`` is blank, the page's og:description meta tag (if
+        present) supplies both ``summary`` and ``text_summary``.
+        """
+        if first:
+            body = soup.find('body')
+            # A page without a <body> has no thumbnail candidate; skip it
+            # rather than failing with an AttributeError on None.
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                # Drop any 'links\linkN\' prefix from the image path.
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        if not article.text_summary.strip():
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace anchors that contain no <img> with their rendered contents.
+
+        Anchors with an image inside are left untouched.  ``soup`` is
+        modified in place and also returned.
+        """
+        # All tags in the document, collected before anything is replaced.
+        paras = soup.findAll(True)
+        for para in paras:
+            # NOTE(review): each tag's descendants are searched again, so the
+            # same anchors can be visited more than once; presumably a single
+            # soup.findAll('a') would be enough -- confirm before simplifying.
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    # Swap the anchor for its inner markup, decoded as cp1252
+                    # with undecodable bytes replaced.
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return ``soup`` after passing it through ``strip_anchors``."""
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,20 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Ottawa Citizen
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'
+
+    # un-comment the following four lines for the Ottawa Citizen
    title = u'Ottawa Citizen'
    url_prefix = 'http://www.ottawacitizen.com'
    description = u'News from Ottawa, ON'
+    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -51,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the most recent available front-page image.
+
+        Builds a newseum.org URL from ``self.fp_tag`` and a day of the month,
+        trying today first and then each of the previous six days.  Returns
+        the first URL the browser opens without raising, or None when
+        ``fp_tag`` is empty or all seven candidates fail (in which case
+        "Cover unavailable" is logged).
+        """
+        from datetime import timedelta, date
+        if self.fp_tag == '':
+            # No front-page tag configured for this paper: no cover.
+            return None
+        # Fix the reference date once so all candidates are consistent even
+        # if the loop runs across midnight.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
+                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
+            # Use a fresh browser for every attempt.
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover)
+            except Exception:
+                # This day's image could not be opened; step back one day.
+                # (Exception, not a bare except, so Ctrl-C still propagates.)
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return ``string`` with quote/dash codes swapped for real characters.
+
+        The codes 0x91-0x94, 0x96 and 0x97 and the literal text '&#x2019;'
+        are substituted one after another, in the order listed below.
+        """
+        substitutions = (
+            ("\x91", "‘"),      # lsquo
+            ("\x92", "’"),      # rsquo
+            ("\x93", "“"),      # ldquo
+            ("\x94", "”"),      # rdquo
+            ("\x96", "–"),      # ndash
+            ("\x97", "—"),      # mdash
+            ("&#x2019;", "’"),  # numeric entity spelled out in the text
+        )
+        result = string
+        for pattern, replacement in substitutions:
+            result = re.sub(pattern, replacement, result)
+        return result
+
+    def massageNCXText(self, description):
+        """Clean up an article description for use in the TOC.
+
+        A falsy ``description`` (None or empty) is returned unchanged.
+        Otherwise the text is parsed with BeautifulStoneSoup so that HTML
+        entities are converted, and the result is run through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among the module-level imports visible in
+        # this recipe, so import it here; without this the call below fails
+        # with a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): this substitution replaces '&' with '&' and therefore
+        # changes nothing; the replacement was presumably meant to be an
+        # entity such as '&#38;' -- confirm the intent before changing it.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to ``article``.
+
+        When ``first`` is true, the first <img> found inside <body> (if any)
+        is registered through ``add_toc_thumbnail``.  When the article's
+        ``text_summary`` is blank, the page's og:description meta tag (if
+        present) supplies both ``summary`` and ``text_summary``.
+        """
+        if first:
+            body = soup.find('body')
+            # A page without a <body> has no thumbnail candidate; skip it
+            # rather than failing with an AttributeError on None.
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                # Drop any 'links\linkN\' prefix from the image path.
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        if not article.text_summary.strip():
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace anchors that contain no <img> with their rendered contents.
+
+        Anchors with an image inside are left untouched.  ``soup`` is
+        modified in place and also returned.
+        """
+        # All tags in the document, collected before anything is replaced.
+        paras = soup.findAll(True)
+        for para in paras:
+            # NOTE(review): each tag's descendants are searched again, so the
+            # same anchors can be visited more than once; presumably a single
+            # soup.findAll('a') would be enough -- confirm before simplifying.
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    # Swap the anchor for its inner markup, decoded as cp1252
+                    # with undecodable bytes replaced.
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return ``soup`` after passing it through ``strip_anchors``."""
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,35 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Regina Leader-Post
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
    title = u'Regina Leader-Post'
    url_prefix = 'http://www.leaderpost.com'
    description = u'News from Regina, SK'
+    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -66,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the most recent available front-page image.
+
+        Builds a newseum.org URL from ``self.fp_tag`` and a day of the month,
+        trying today first and then each of the previous six days.  Returns
+        the first URL the browser opens without raising, or None when
+        ``fp_tag`` is empty or all seven candidates fail (in which case
+        "Cover unavailable" is logged).
+        """
+        from datetime import timedelta, date
+        if self.fp_tag == '':
+            # No front-page tag configured for this paper: no cover.
+            return None
+        # Fix the reference date once so all candidates are consistent even
+        # if the loop runs across midnight.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
+                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
+            # Use a fresh browser for every attempt.
+            br = BasicNewsRecipe.get_browser()
+            try:
+                br.open(cover)
+            except Exception:
+                # This day's image could not be opened; step back one day.
+                # (Exception, not a bare except, so Ctrl-C still propagates.)
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return ``string`` with quote/dash codes swapped for real characters.
+
+        The codes 0x91-0x94, 0x96 and 0x97 and the literal text '&#x2019;'
+        are substituted one after another, in the order listed below.
+        """
+        substitutions = (
+            ("\x91", "‘"),      # lsquo
+            ("\x92", "’"),      # rsquo
+            ("\x93", "“"),      # ldquo
+            ("\x94", "”"),      # rdquo
+            ("\x96", "–"),      # ndash
+            ("\x97", "—"),      # mdash
+            ("&#x2019;", "’"),  # numeric entity spelled out in the text
+        )
+        result = string
+        for pattern, replacement in substitutions:
+            result = re.sub(pattern, replacement, result)
+        return result
+
+    def massageNCXText(self, description):
+        """Clean up an article description for use in the TOC.
+
+        A falsy ``description`` (None or empty) is returned unchanged.
+        Otherwise the text is parsed with BeautifulStoneSoup so that HTML
+        entities are converted, and the result is run through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among the module-level imports visible in
+        # this recipe, so import it here; without this the call below fails
+        # with a NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): this substitution replaces '&' with '&' and therefore
+        # changes nothing; the replacement was presumably meant to be an
+        # entity such as '&#38;' -- confirm the intent before changing it.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to ``article``.
+
+        When ``first`` is true, the first <img> found inside <body> (if any)
+        is registered through ``add_toc_thumbnail``.  When the article's
+        ``text_summary`` is blank, the page's og:description meta tag (if
+        present) supplies both ``summary`` and ``text_summary``.
+        """
+        if first:
+            body = soup.find('body')
+            # A page without a <body> has no thumbnail candidate; skip it
+            # rather than failing with an AttributeError on None.
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                # Drop any 'links\linkN\' prefix from the image path.
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        if not article.text_summary.strip():
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace anchors that contain no <img> with their rendered contents.
+
+        Anchors with an image inside are left untouched.  ``soup`` is
+        modified in place and also returned.
+        """
+        # All tags in the document, collected before anything is replaced.
+        paras = soup.findAll(True)
+        for para in paras:
+            # NOTE(review): each tag's descendants are searched again, so the
+            # same anchors can be visited more than once; presumably a single
+            # soup.findAll('a') would be enough -- confirm before simplifying.
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    # Swap the anchor for its inner markup, decoded as cp1252
+                    # with undecodable bytes replaced.
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return ``soup`` after passing it through ``strip_anchors``."""
+        stripped = self.strip_anchors(soup)
+        return stripped
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,30 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'
+
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'
+
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'
+
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''
+
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'
+    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -61,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the newest reachable cover image, or None.
+
+        Returns None immediately when ``fp_tag`` is empty.  Otherwise the
+        image on webmedia.newseum.org for today's day-of-month is probed,
+        then each of the previous six days; the first URL that opens
+        without error is returned.  If none opens, "Cover unavailable" is
+        logged and None is returned.
+        """
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        br = BasicNewsRecipe.get_browser()
+        # Fix the reference date once so every attempt counts back from the same day.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(day)+'/lg/'+self.fp_tag+'.jpg'
+            try:
+                br.open(cover)
+            except Exception:
+                # Best-effort probe: any fetch failure means "try an older day".
+                # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return *string* with stray punctuation codes swapped for real punctuation.
+
+        The code points \\x91-\\x97 (the slots Windows-1252 uses for curly
+        quotes and dashes) and the literal entity text ``&#x2019;`` are
+        replaced by the corresponding typographic characters.
+        """
+        # (pattern, replacement) pairs, applied strictly in this order
+        substitutions = (
+            ("\x91","‘"),        # lsquo
+            ("\x92","’"),        # rsquo
+            ("\x93","“"),        # ldquo
+            ("\x94","”"),        # rdquo
+            ("\x96","–"),        # ndash
+            ("\x97","—"),        # mdash
+            ("&#x2019;","’"),    # numeric entity left in the text
+        )
+        fixed = string
+        for pattern, replacement in substitutions:
+            fixed = re.sub(pattern,replacement,fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        """Return *description* cleaned up for use in a table-of-contents entry.
+
+        A falsy description (``None`` or an empty string) is returned
+        unchanged.  Otherwise HTML entities are converted by
+        BeautifulStoneSoup and the result is passed through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among this recipe's module-level imports,
+        # so import it here; without this the call below raises NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): as written this replaces '&' with itself (a no-op); the
+        # intended entity form was presumably lost in transit -- confirm.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to *article*.
+
+        When *first* is true, the first ``<img>`` inside ``<body>`` (if any)
+        is registered through ``add_toc_thumbnail``, after removing any
+        ``links\\linkN\\`` fragment from its ``src``.  If the article's
+        ``text_summary`` is blank, the page's ``og:description`` meta content
+        is used for both ``summary`` and ``text_summary``.
+        """
+        if first:
+            # find() returns None when the document has no <body>; guard it so
+            # such a page skips the thumbnail instead of raising AttributeError.
+            body = soup.find('body')
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if not xtitle:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace every image-less ``<a>`` found under a tag with its contents.
+
+        Each tag in *soup* is searched for anchors; an anchor that contains
+        no ``<img>`` is replaced by its rendered contents decoded as cp1252
+        (undecodable bytes are substituted).  Returns the same *soup*.
+        """
+        for tag in soup.findAll(True):
+            for anchor in tag.findAll('a'):
+                if anchor.img is not None:
+                    # keep linked images intact
+                    continue
+                anchor.replaceWith(anchor.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return *soup* after running it through ``strip_anchors``."""
+        cleaned = self.strip_anchors(soup)
+        return cleaned
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,50 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Vancouver Sun
+    # un-comment the following four lines for the Victoria Times Colonist
+##    title = u'Victoria Times Colonist'
+##    url_prefix = 'http://www.timescolonist.com'
+##    description = u'News from Victoria, BC'
+##    fp_tag = 'CAN_TC'
+
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'
+
+    # un-comment the following four lines for the Vancouver Sun
    title = u'Vancouver Sun'
    url_prefix = 'http://www.vancouversun.com'
    description = u'News from Vancouver, BC'
+    fp_tag = 'CAN_VS'

-    # un-comment the following three lines for the Edmonton Journal
-    #title = u'Edmonton Journal'
-    #url_prefix = 'http://www.edmontonjournal.com'
-    #description = u'News from Edmonton, AB'
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'

-    # un-comment the following three lines for the Calgary Herald
-    #title = u'Calgary Herald'
-    #url_prefix = 'http://www.calgaryherald.com'
-    #description = u'News from Calgary, AB'
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -81,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the newest reachable cover image, or None.
+
+        Returns None immediately when ``fp_tag`` is empty.  Otherwise the
+        image on webmedia.newseum.org for today's day-of-month is probed,
+        then each of the previous six days; the first URL that opens
+        without error is returned.  If none opens, "Cover unavailable" is
+        logged and None is returned.
+        """
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        br = BasicNewsRecipe.get_browser()
+        # Fix the reference date once so every attempt counts back from the same day.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(day)+'/lg/'+self.fp_tag+'.jpg'
+            try:
+                br.open(cover)
+            except Exception:
+                # Best-effort probe: any fetch failure means "try an older day".
+                # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return *string* with stray punctuation codes swapped for real punctuation.
+
+        The code points \\x91-\\x97 (the slots Windows-1252 uses for curly
+        quotes and dashes) and the literal entity text ``&#x2019;`` are
+        replaced by the corresponding typographic characters.
+        """
+        # (pattern, replacement) pairs, applied strictly in this order
+        substitutions = (
+            ("\x91","‘"),        # lsquo
+            ("\x92","’"),        # rsquo
+            ("\x93","“"),        # ldquo
+            ("\x94","”"),        # rdquo
+            ("\x96","–"),        # ndash
+            ("\x97","—"),        # mdash
+            ("&#x2019;","’"),    # numeric entity left in the text
+        )
+        fixed = string
+        for pattern, replacement in substitutions:
+            fixed = re.sub(pattern,replacement,fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        """Return *description* cleaned up for use in a table-of-contents entry.
+
+        A falsy description (``None`` or an empty string) is returned
+        unchanged.  Otherwise HTML entities are converted by
+        BeautifulStoneSoup and the result is passed through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among this recipe's module-level imports,
+        # so import it here; without this the call below raises NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): as written this replaces '&' with itself (a no-op); the
+        # intended entity form was presumably lost in transit -- confirm.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to *article*.
+
+        When *first* is true, the first ``<img>`` inside ``<body>`` (if any)
+        is registered through ``add_toc_thumbnail``, after removing any
+        ``links\\linkN\\`` fragment from its ``src``.  If the article's
+        ``text_summary`` is blank, the page's ``og:description`` meta content
+        is used for both ``summary`` and ``text_summary``.
+        """
+        if first:
+            # find() returns None when the document has no <body>; guard it so
+            # such a page skips the thumbnail instead of raising AttributeError.
+            body = soup.find('body')
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if not xtitle:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace every image-less ``<a>`` found under a tag with its contents.
+
+        Each tag in *soup* is searched for anchors; an anchor that contains
+        no ``<img>`` is replaced by its rendered contents decoded as cp1252
+        (undecodable bytes are substituted).  Returns the same *soup*.
+        """
+        for tag in soup.findAll(True):
+            for anchor in tag.findAll('a'):
+                if anchor.img is not None:
+                    # keep linked images intact
+                    continue
+                anchor.replaceWith(anchor.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return *soup* after running it through ``strip_anchors``."""
+        cleaned = self.strip_anchors(soup)
+        return cleaned
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@@ -1,4 +1,5 @@
 #!/usr/bin/env  python
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'

@@ -6,60 +7,77 @@ __license__   = 'GPL v3'
 www.canada.com
 '''

+import string, re
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+import string, re
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following three lines for the Victoria Times Colonist
+    # un-comment the following four lines for the Victoria Times Colonist
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
+    fp_tag = 'CAN_TC'

-    # un-comment the following three lines for the Vancouver Province
-    #title = u'Vancouver Province'
-    #url_prefix = 'http://www.theprovince.com'
-    #description = u'News from Vancouver, BC'
+    # un-comment the following four lines for the Vancouver Province
+##    title = u'Vancouver Province'
+##    url_prefix = 'http://www.theprovince.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VP'

-    # un-comment the following three lines for the Vancouver Sun
-    #title = u'Vancouver Sun'
-    #url_prefix = 'http://www.vancouversun.com'
-    #description = u'News from Vancouver, BC'
+    # un-comment the following four lines for the Vancouver Sun
+##    title = u'Vancouver Sun'
+##    url_prefix = 'http://www.vancouversun.com'
+##    description = u'News from Vancouver, BC'
+##    fp_tag = 'CAN_VS'

-    # un-comment the following three lines for the Edmonton Journal
-    #title = u'Edmonton Journal'
-    #url_prefix = 'http://www.edmontonjournal.com'
-    #description = u'News from Edmonton, AB'
+    # un-comment the following four lines for the Edmonton Journal
+##    title = u'Edmonton Journal'
+##    url_prefix = 'http://www.edmontonjournal.com'
+##    description = u'News from Edmonton, AB'
+##    fp_tag = 'CAN_EJ'

-    # un-comment the following three lines for the Calgary Herald
-    #title = u'Calgary Herald'
-    #url_prefix = 'http://www.calgaryherald.com'
-    #description = u'News from Calgary, AB'
+    # un-comment the following four lines for the Calgary Herald
+##    title = u'Calgary Herald'
+##    url_prefix = 'http://www.calgaryherald.com'
+##    description = u'News from Calgary, AB'
+##    fp_tag = 'CAN_CH'

-    # un-comment the following three lines for the Regina Leader-Post
-    #title = u'Regina Leader-Post'
-    #url_prefix = 'http://www.leaderpost.com'
-    #description = u'News from Regina, SK'
+    # un-comment the following four lines for the Regina Leader-Post
+##    title = u'Regina Leader-Post'
+##    url_prefix = 'http://www.leaderpost.com'
+##    description = u'News from Regina, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Saskatoon Star-Phoenix
-    #title = u'Saskatoon Star-Phoenix'
-    #url_prefix = 'http://www.thestarphoenix.com'
-    #description = u'News from Saskatoon, SK'
+    # un-comment the following four lines for the Saskatoon Star-Phoenix
+##    title = u'Saskatoon Star-Phoenix'
+##    url_prefix = 'http://www.thestarphoenix.com'
+##    description = u'News from Saskatoon, SK'
+##    fp_tag = ''

-    # un-comment the following three lines for the Windsor Star
-    #title = u'Windsor Star'
-    #url_prefix = 'http://www.windsorstar.com'
-    #description = u'News from Windsor, ON'
+    # un-comment the following four lines for the Windsor Star
+##    title = u'Windsor Star'
+##    url_prefix = 'http://www.windsorstar.com'
+##    description = u'News from Windsor, ON'
+##    fp_tag = 'CAN_'

-    # un-comment the following three lines for the Ottawa Citizen
-    #title = u'Ottawa Citizen'
-    #url_prefix = 'http://www.ottawacitizen.com'
-    #description = u'News from Ottawa, ON'
+    # un-comment the following four lines for the Ottawa Citizen
+##    title = u'Ottawa Citizen'
+##    url_prefix = 'http://www.ottawacitizen.com'
+##    description = u'News from Ottawa, ON'
+##    fp_tag = 'CAN_OC'

-    # un-comment the following three lines for the Montreal Gazette
-    #title = u'Montreal Gazette'
-    #url_prefix = 'http://www.montrealgazette.com'
-    #description = u'News from Montreal, QC'
+    # un-comment the following four lines for the Montreal Gazette
+##    title = u'Montreal Gazette'
+##    url_prefix = 'http://www.montrealgazette.com'
+##    description = u'News from Montreal, QC'
+##    fp_tag = 'CAN_MG'


    language = 'en_CA'
@@ -91,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
                del(div['id'])
        return soup

+    def get_cover_url(self):
+        """Return the URL of the newest reachable cover image, or None.
+
+        Returns None immediately when ``fp_tag`` is empty.  Otherwise the
+        image on webmedia.newseum.org for today's day-of-month is probed,
+        then each of the previous six days; the first URL that opens
+        without error is returned.  If none opens, "Cover unavailable" is
+        logged and None is returned.
+        """
+        from datetime import timedelta, date
+        if self.fp_tag=='':
+            return None
+        br = BasicNewsRecipe.get_browser()
+        # Fix the reference date once so every attempt counts back from the same day.
+        today = date.today()
+        for daysback in range(7):
+            day = (today - timedelta(days=daysback)).day
+            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(day)+'/lg/'+self.fp_tag+'.jpg'
+            try:
+                br.open(cover)
+            except Exception:
+                # Best-effort probe: any fetch failure means "try an older day".
+                # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
+                continue
+            return cover
+        self.log("\nCover unavailable")
+        return None
+
+    def fixChars(self,string):
+        """Return *string* with stray punctuation codes swapped for real punctuation.
+
+        The code points \\x91-\\x97 (the slots Windows-1252 uses for curly
+        quotes and dashes) and the literal entity text ``&#x2019;`` are
+        replaced by the corresponding typographic characters.
+        """
+        # (pattern, replacement) pairs, applied strictly in this order
+        substitutions = (
+            ("\x91","‘"),        # lsquo
+            ("\x92","’"),        # rsquo
+            ("\x93","“"),        # ldquo
+            ("\x94","”"),        # rdquo
+            ("\x96","–"),        # ndash
+            ("\x97","—"),        # mdash
+            ("&#x2019;","’"),    # numeric entity left in the text
+        )
+        fixed = string
+        for pattern, replacement in substitutions:
+            fixed = re.sub(pattern,replacement,fixed)
+        return fixed
+
+    def massageNCXText(self, description):
+        """Return *description* cleaned up for use in a table-of-contents entry.
+
+        A falsy description (``None`` or an empty string) is returned
+        unchanged.  Otherwise HTML entities are converted by
+        BeautifulStoneSoup and the result is passed through ``fixChars``.
+        """
+        # Kindle TOC descriptions won't render certain characters
+        if not description:
+            return description
+        # BeautifulStoneSoup is not among this recipe's module-level imports,
+        # so import it here; without this the call below raises NameError.
+        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+        # NOTE(review): as written this replaces '&' with itself (a no-op); the
+        # intended entity form was presumably lost in transit -- confirm.
+        massaged = re.sub("&","&", massaged)
+        return self.fixChars(massaged)
+
+    def populate_article_metadata(self, article, soup, first):
+        """Attach a TOC thumbnail and a fallback summary to *article*.
+
+        When *first* is true, the first ``<img>`` inside ``<body>`` (if any)
+        is registered through ``add_toc_thumbnail``, after removing any
+        ``links\\linkN\\`` fragment from its ``src``.  If the article's
+        ``text_summary`` is blank, the page's ``og:description`` meta content
+        is used for both ``summary`` and ``text_summary``.
+        """
+        if first:
+            # find() returns None when the document has no <body>; guard it so
+            # such a page skips the thumbnail instead of raising AttributeError.
+            body = soup.find('body')
+            picdiv = body.find('img') if body is not None else None
+            if picdiv is not None:
+                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
+        xtitle = article.text_summary.strip()
+        if not xtitle:
+            desc = soup.find('meta',attrs={'property':'og:description'})
+            if desc is not None:
+                article.summary = article.text_summary = desc['content']
+
+    def strip_anchors(self,soup):
+        """Replace every image-less ``<a>`` found under a tag with its contents.
+
+        Each tag in *soup* is searched for anchors; an anchor that contains
+        no ``<img>`` is replaced by its rendered contents decoded as cp1252
+        (undecodable bytes are substituted).  Returns the same *soup*.
+        """
+        for tag in soup.findAll(True):
+            for anchor in tag.findAll('a'):
+                if anchor.img is not None:
+                    # keep linked images intact
+                    continue
+                anchor.replaceWith(anchor.renderContents().decode('cp1252','replace'))
+        return soup
+
+    def preprocess_html(self, soup):
+        """Return *soup* after running it through ``strip_anchors``."""
+        cleaned = self.strip_anchors(soup)
+        return cleaned
+
+

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')