Updated various Canadian newspapers

This commit is contained in:
Kovid Goyal 2012-08-15 09:24:48 +05:30
parent 2f2787d0fa
commit 218a92de0e
6 changed files with 1446 additions and 405 deletions

View File

@ -1,35 +1,320 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CalgaryHerald(BasicNewsRecipe): import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe):
postmedia_index_pages = [
(u'Headlines',u'/index.html'),
(u'Ottawa & Area',u'/news/ottawa/index.html'),
(u'Vancouver',u'/news/vancouver/index.html'),
(u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following six lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
## logo_url = 'vplogo.jpg'
## fp_tag = 'CAN_TP'
# un-comment the following six lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
## logo_url = 'vslogo.jpg'
## fp_tag = 'CAN_VS'
# un-comment the following six lines for the Calgary Herald
title = u'Calgary Herald' title = u'Calgary Herald'
oldest_article = 3 url_prefix = 'http://www.calgaryherald.com'
max_articles_per_feed = 100 description = u'News from Calgary, AB'
std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
logo_url = 'chlogo.jpg'
fp_tag = 'CAN_CH'
feeds = [ # un-comment the following six lines for the Edmonton Journal
(u'News', u'http://rss.canada.com/get/?F233'), ## title = u'Edmonton Journal'
(u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'), ## url_prefix = 'http://www.edmontonjournal.com'
(u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'), ## description = u'News from Edmonton, AB'
(u'Politics', u'http://rss.canada.com/get/?F7551'), ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
(u'National', u'http://rss.canada.com/get/?F7552'), ## logo_url = 'ejlogo.jpg'
(u'World', u'http://rss.canada.com/get/?F7553'), ## fp_tag = 'CAN_EJ'
]
__author__ = 'rty'
publisher = 'Calgary Herald'
description = 'Calgary, Alberta, Canada'
category = 'News, Calgary, Alberta, Canada'
# un-comment the following six lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
## logo_url = 'oclogo.jpg'
## fp_tag = 'CAN_OC'
remove_javascript = True # un-comment the following six lines for the Montreal Gazette
use_embedded_content = False ## title = u'Montreal Gazette'
no_stylesheets = True ## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC'
## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
## logo_url = 'mglogo.jpg'
## fp_tag = 'CAN_MG'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding'
no_stylesheets = True
timefmt = ' [%b %d]'
encoding = 'utf-8' encoding = 'utf-8'
conversion_options = {'linearize_tables':True} extra_css = '''
##masthead_url = 'http://www.calgaryherald.com/index.html' .timestamp { font-size:xx-small; display: block; }
keep_only_tags = [ #storyheader { font-size: medium; }
dict(name='div', attrs={'id':'storyheader'}), #storyheader h1 { font-size: x-large; }
dict(name='div', attrs={'id':'storycontent'}) #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; }
#photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
.photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; font-weight: normal; }'''
] keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags_after = {'class':"story_tool_hr"}
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self):
    """Return the URL of today's Newseum front-page image for this paper.

    Tries today's image first, then walks back one day at a time (up to
    six days) until an image responds.  Returns None when no front page
    is available for the whole week.
    """
    from datetime import timedelta, date
    # The Newseum archive keys images by day-of-month plus the paper's
    # front-page tag (self.fp_tag, e.g. 'CAN_CH').
    for daysback in range(0, 7):
        day = (date.today() - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Image for this day not published (yet) -- try the day before.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def prepare_masthead_image(self, path_to_image, out_path):
    """Write the masthead image to out_path.

    On the Kindle Fire the stock masthead processing is skipped: the
    image is simply repainted onto a canvas of the same size.  Otherwise
    defer to the standard BasicNewsRecipe implementation.
    """
    if self.Kindle_Fire:
        # NOTE: the original also imported calibre.fit_image here but
        # never used it; the unused import has been dropped.
        from calibre.utils.magick import Image, create_canvas
        img = Image()
        img.open(path_to_image)
        width, height = img.size
        img2 = create_canvas(width, height)
        img2.compose(img)
        img2.save(out_path)
    else:
        BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
def fixChars(self, string):
    """Map cp1252 'smart punctuation' bytes to characters the Kindle TOC
    renderer can display, dropping the ones it cannot."""
    # lsquo (\x91) and rsquo (\x92) are dropped
    cleaned = string.replace("\x91", "").replace("\x92", "")
    # ldquo (\x93) / rdquo (\x94) become real curly quotes
    cleaned = cleaned.replace("\x93", "“").replace("\x94", "”")
    # ndash (\x96) is dropped; mdash (\x97) becomes an em dash
    cleaned = cleaned.replace("\x96", "").replace("\x97", "—")
    # stray numeric rsquo entity left behind by earlier processing
    return cleaned.replace("&#x2019;", "")
def massageNCXText(self, description):
    """Sanitize an article description for use in the Kindle NCX TOC.

    Decodes HTML entities, collapses '&amp;' to '&', and strips the
    cp1252 punctuation the Kindle renderer chokes on (via fixChars).
    Falsy input is returned unchanged.
    """
    # Kindle TOC descriptions won't render certain characters
    if description:
        # BUGFIX: BeautifulStoneSoup is not in this module's visible
        # imports (only BeautifulSoup and Tag are), which made this a
        # NameError at runtime -- import it locally.
        from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&amp;' with '&'.  The source showed an identity
        # substitution re.sub("&","&", ...), which is an HTML-unescaping
        # artifact of the original '&amp;' pattern.
        massaged = re.sub("&amp;", "&", massaged)
        return self.fixChars(massaged)
    else:
        return description
def populate_article_metadata(self, article, soup, first):
    # Fill in per-article metadata after the article HTML is fetched.
    # For the first article only, use its first <img> as the TOC thumbnail.
    if first:
        picdiv = soup.find('body').find('img')
        if picdiv is not None:
            # Strip Windows-style 'links\linkN\' path prefixes the feed
            # sometimes embeds in image src attributes.
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    # If the feed supplied no text summary, fall back to the page's
    # og:description meta tag.
    # NOTE(review): indentation was lost in this source; the summary
    # fallback is assumed to run for every article, not only the first.
    xtitle = article.text_summary.strip()
    if len(xtitle) == 0:
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self, soup):
    """Flatten text-only <a> tags into their rendered contents, leaving
    anchors that wrap images untouched.  Returns the modified soup."""
    for tag in soup.findAll(True):
        for anchor in tag.findAll('a'):
            # Keep image links; replace plain-text links with their text.
            if anchor.img is None:
                anchor.replaceWith(anchor.renderContents().decode('cp1252', 'replace'))
    return soup
def preprocess_html(self,soup):
    # Clean up an article page before conversion:
    #  1. drop empty id attributes (they break TOC generation),
    #  2. if the page is a photo gallery (storyphoto present but no
    #     storycontent), rebuild it as a flat list of <img>+caption divs,
    #  3. remove any remaining storyphoto containers,
    #  4. flatten text-only anchors.
    #delete empty id attributes--they screw up the TOC for unknown reasons
    divtags = soup.findAll('div',attrs={'id':''})
    if divtags:
        for div in divtags:
            del(div['id'])
    pgall = soup.find('div',attrs={'id':'storyphoto'})
    if pgall is not None: # photo gallery perhaps
        if (soup.find('div',attrs={'id':'storycontent'}) is None):
            # Gallery-only page: rebuild the related-thumbnail strip as
            # full-size images with photocaption paragraphs.
            allpics = Tag(soup,'div')
            first_img = pgall.find('div','storyimage')
            if first_img is not None:
                first_img.extract()
            tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
            if tlist is not None:
                for atag in tlist.findAll('a'):
                    img = Tag(soup,'img')
                    # Drop the resizing query string to get the full image.
                    srcpre, sep, srcpost = atag.img['src'].partition('?')
                    img['src'] = srcpre
                    pdesc = Tag(soup,'p')
                    pdesc.insert(0,atag.img['alt'])
                    pdesc['class']='photocaption'
                    div = Tag(soup,'div')
                    div.insert(0,pdesc)
                    div.insert(0,img)
                    allpics.append(div)
            pgall.replaceWith(allpics)
    # Remove any storyphoto blocks still present (regular article pages).
    # NOTE(review): indentation was lost in this source; this loop is
    # assumed to run at method level -- confirm against the original.
    for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
        pg.extract()
    return self.strip_anchors(soup)
def parse_index(self):
    # Build the recipe's section/article index by crawling each section
    # page listed in self.postmedia_index_pages (relative to
    # self.url_prefix).  Returns the usual [(section, [article dicts])].
    articles = {}   # section name -> list of article dicts
    ans = []        # section names, in crawl order

    def handle_article(adiv,key):
        # Extract one article (title/url/description) from a feature div
        # and append it to articles[key].  Skips external links,
        # duplicates (tracked in self.url_list) and video/gallery items.
        h1tag = adiv.h1
        if h1tag is not None:
            atag = h1tag.a
            if atag is not None:
                url = atag['href']
                if atag['href'].startswith('http'):
                    # Absolute links point off-site -- ignore them.
                    return
                elif atag['href'].startswith('/'):
                    url = self.url_prefix+atag['href']
                else:
                    url = self.url_prefix+'/'+atag['href']
                if url in self.url_list:
                    return
                self.url_list.append(url)
                title = self.tag_to_string(atag,False)
                # Non-text content is useless in an e-book build.
                if 'VIDEO' in title.upper():
                    return
                if 'GALLERY' in title.upper():
                    return
                if 'PHOTOS' in title.upper():
                    return
                dtag = adiv.find('div','content')
                description=''
                print("URL "+url)
                print("TITLE "+title)
                if dtag is not None:
                    stag = dtag.span
                    if stag is not None:
                        # A non-timestamp span holds the teaser text.
                        if stag['class'] != 'timestamp':
                            description = self.tag_to_string(stag,False)
                    else:
                        description = self.tag_to_string(dtag,False)
                    print("DESCRIPTION: "+description)
                if not articles.has_key(key):
                    articles[key] = []
                articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

    def parse_web_index(key, keyurl):
        # Fetch one section page and harvest its featured articles.
        # Network/parse failures silently skip the section.
        try:
            soup = self.index_to_soup(self.url_prefix+keyurl)
        except:
            return
        ans.append(key)
        mainsoup = soup.find('div','bodywrapper')
        footer = mainsoup.find(attrs={'id':'footerfeature'})
        if footer is not None:
            footer.extract()
        print("Section: "+key)
        # Horizontal slider stories come first; extract each so the
        # second pass does not see them again.
        for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
            handle_article(wdiv,key)
            wdiv.extract()
        for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
            for adiv in wdiv.findAll('div','featurecontent'):
                handle_article(adiv,key)

    for (k,url) in self.postmedia_index_pages:
        parse_web_index(k,url)
    # Keep only sections that actually produced articles.
    ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
    return ans

View File

@ -1,105 +1,141 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
www.canada.com www.canada.com
''' '''
import string, re
import re from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following four lines for the Victoria Times Colonist postmedia_index_pages = [
## title = u'Victoria Times Colonist' (u'Headlines',u'/index.html'),
## url_prefix = 'http://www.timescolonist.com' (u'Ottawa & Area',u'/news/ottawa/index.html'),
## description = u'News from Victoria, BC' (u'Vancouver',u'/news/vancouver/index.html'),
## fp_tag = 'CAN_TC' (u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following four lines for the Vancouver Province
# un-comment the following six lines for the Vancouver Province
## title = u'Vancouver Province' ## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com' ## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP' ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
## logo_url = 'vplogo.jpg'
## fp_tag = 'CAN_TP'
# un-comment the following four lines for the Vancouver Sun # un-comment the following six lines for the Vancouver Sun
## title = u'Vancouver Sun' ## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com' ## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
## logo_url = 'vslogo.jpg'
## fp_tag = 'CAN_VS' ## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal # un-comment the following six lines for the Calgary Herald
title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB'
fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald' ## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
## logo_url = 'chlogo.jpg'
## fp_tag = 'CAN_CH' ## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post # un-comment the following six lines for the Edmonton Journal
## title = u'Regina Leader-Post' title = u'Edmonton Journal'
## url_prefix = 'http://www.leaderpost.com' url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Regina, SK' description = u'News from Edmonton, AB'
## fp_tag = '' std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
logo_url = 'ejlogo.jpg'
fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Saskatoon Star-Phoenix # un-comment the following six lines for the Ottawa Citizen
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
## logo_url = 'oclogo.jpg'
## fp_tag = 'CAN_OC' ## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette # un-comment the following six lines for the Montreal Gazette
## title = u'Montreal Gazette' ## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
## logo_url = 'mglogo.jpg'
## fp_tag = 'CAN_MG' ## fp_tag = 'CAN_MG'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = ''' extra_css = '''
.timestamp { font-size:xx-small; display: block; } .timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; } #storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; } #storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; } #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; } .byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic } #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; }''' .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] #photocredit { font-size: xx-small; font-weight: normal; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags = [{'class':'comments'}, remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
daysback=1 daysback=1
@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe):
cover = None cover = None
return cover return cover
def prepare_masthead_image(self, path_to_image, out_path):
if self.Kindle_Fire:
from calibre import fit_image
from calibre.utils.magick import Image, create_canvas
img = Image()
img.open(path_to_image)
width, height = img.size
img2 = create_canvas(width, height)
img2.compose(img)
img2.save(out_path)
else:
BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
def fixChars(self,string): def fixChars(self,string):
# Replace lsquo (\x91) # Replace lsquo (\x91)
fixed = re.sub("\x91","",string) fixed = re.sub("\x91","",string)
@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe):
a.replaceWith(a.renderContents().decode('cp1252','replace')) a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self,soup): def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
pgall = soup.find('div',attrs={'id':'storyphoto'})
if pgall is not None: # photo gallery perhaps
if (soup.find('div',attrs={'id':'storycontent'}) is None):
allpics = Tag(soup,'div')
first_img = pgall.find('div','storyimage')
if first_img is not None:
first_img.extract()
tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
if tlist is not None:
for atag in tlist.findAll('a'):
img = Tag(soup,'img')
srcpre, sep, srcpost = atag.img['src'].partition('?')
img['src'] = srcpre
pdesc = Tag(soup,'p')
pdesc.insert(0,atag.img['alt'])
pdesc['class']='photocaption'
div = Tag(soup,'div')
div.insert(0,pdesc)
div.insert(0,img)
allpics.append(div)
pgall.replaceWith(allpics)
for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
pg.extract()
return self.strip_anchors(soup) return self.strip_anchors(soup)
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {} articles = {}
key = 'News' ans = []
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): def handle_article(adiv,key):
#self.log(" div class = %s" % divtag['class']) h1tag = adiv.h1
if divtag['class'].startswith('section_title'): if h1tag is not None:
# div contains section title atag = h1tag.a
if not divtag.h3: if atag is not None:
continue url = atag['href']
key = self.tag_to_string(divtag.h3,False) if atag['href'].startswith('http'):
ans.append(key) return
self.log("Section name %s" % key) elif atag['href'].startswith('/'):
continue url = self.url_prefix+atag['href']
# div contains article data else:
h1tag = divtag.find('h1') url = self.url_prefix+'/'+atag['href']
if not h1tag: if url in self.url_list:
continue return
atag = h1tag.find('a',href=True) self.url_list.append(url)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False) title = self.tag_to_string(atag,False)
#self.log("title %s" % title) if 'VIDEO' in title.upper():
pubdate = '' return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
dtag = adiv.find('div','content')
description='' description=''
ptag = divtag.find('p'); print("URL "+url)
if ptag: print("TITLE "+title)
description = self.tag_to_string(ptag,False) if dtag is not None:
#self.log("description %s" % description) stag = dtag.span
author = '' if stag is not None:
autag = divtag.find('h4') if stag['class'] != 'timestamp':
if autag: description = self.tag_to_string(stag,False)
author = self.tag_to_string(autag,False) else:
#self.log("author %s" % author) description = self.tag_to_string(dtag,False)
print("DESCRIPTION: "+description)
if not articles.has_key(key): if not articles.has_key(key):
articles[key] = [] articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
def parse_web_index(key, keyurl):
try:
soup = self.index_to_soup(self.url_prefix+keyurl)
except:
return
ans.append(key)
mainsoup = soup.find('div','bodywrapper')
footer = mainsoup.find(attrs={'id':'footerfeature'})
if footer is not None:
footer.extract()
print("Section: "+key)
for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
handle_article(wdiv,key)
wdiv.extract()
for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
for adiv in wdiv.findAll('div','featurecontent'):
handle_article(adiv,key)
for (k,url) in self.postmedia_index_pages:
parse_web_index(k,url)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans return ans

View File

@ -1,48 +1,320 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette postmedia_index_pages = [
(u'Headlines',u'/index.html'),
(u'Ottawa & Area',u'/news/ottawa/index.html'),
(u'Vancouver',u'/news/vancouver/index.html'),
(u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following six lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
## logo_url = 'vplogo.jpg'
## fp_tag = 'CAN_TP'
# un-comment the following six lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
## logo_url = 'vslogo.jpg'
## fp_tag = 'CAN_VS'
# un-comment the following six lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
## logo_url = 'chlogo.jpg'
## fp_tag = 'CAN_CH'
# un-comment the following six lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
## logo_url = 'ejlogo.jpg'
## fp_tag = 'CAN_EJ'
# un-comment the following six lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
## logo_url = 'oclogo.jpg'
## fp_tag = 'CAN_OC'
# un-comment the following six lines for the Montreal Gazette
title = u'Montreal Gazette' title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC' description = u'News from Montreal, QC'
std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
logo_url = 'mglogo.jpg'
fp_tag = 'CAN_MG'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
auto_cleanup = True
auto_cleanup_keep = '//*[@id="imageBox"]'
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = ''' extra_css = '''
.timestamp { font-size:xx-small; display: block; } .timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; } #storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; } #storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; } #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; } .byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic } #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; }''' .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; font-weight: normal; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self):
    """Return the URL of today's Newseum front-page image for this paper.

    Tries today's image first, then walks back one day at a time (up to
    six days) until an image responds.  Returns None when no front page
    is available for the whole week.
    """
    from datetime import timedelta, date
    # The Newseum archive keys images by day-of-month plus the paper's
    # front-page tag (self.fp_tag, e.g. 'CAN_MG').
    for daysback in range(0, 7):
        day = (date.today() - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Image for this day not published (yet) -- try the day before.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def prepare_masthead_image(self, path_to_image, out_path):
    """Write the masthead image to out_path.

    On the Kindle Fire the source image is composed onto a fresh canvas
    of the same size; otherwise the base-class behaviour is used.
    """
    if not self.Kindle_Fire:
        BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
        return
    from calibre import fit_image
    from calibre.utils.magick import Image, create_canvas
    source = Image()
    source.open(path_to_image)
    w, h = source.size
    canvas = create_canvas(w, h)
    canvas.compose(source)
    canvas.save(out_path)
def fixChars(self,string):
    """Map cp1252 'smart punctuation' code points in *string* to
    replacements the Kindle TOC can render.

    NOTE(review): \x91, \x92, \x96 and the &#x2019; entity are replaced
    with the empty string below -- confirm the intended single-quote and
    en-dash replacement characters were not lost in transit.
    """
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    # Replace the numeric right-single-quote entity as well.
    fixed = re.sub("&#x2019;","",fixed)
    return fixed
def massageNCXText(self, description):
    """Sanitise an article description for the Kindle NCX table of
    contents, which cannot render certain entities/characters."""
    # Kindle TOC descriptions won't render certain characters
    if description:
        # NOTE(review): BeautifulStoneSoup is not among the visible
        # imports (only BeautifulSoup and Tag) -- verify it is in scope.
        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&' with '&'
        # NOTE(review): this substitution replaces '&' with itself; the
        # pattern was probably '&amp;' before HTML un-escaping -- confirm.
        massaged = re.sub("&","&", massaged)
        return self.fixChars(massaged)
    else:
        return description
def populate_article_metadata(self, article, soup, first):
    """On an article's first page, register a TOC thumbnail from the
    first body image and, when the text summary is empty, fall back to
    the page's og:description meta tag."""
    if not first:
        return
    thumb = soup.find('body').find('img')
    if thumb is not None:
        self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', thumb['src']))
    summary = article.text_summary.strip()
    if len(summary) == 0:
        meta = soup.find('meta', attrs={'property': 'og:description'})
        if meta is not None:
            article.summary = article.text_summary = meta['content']
def strip_anchors(self,soup):
    """Unwrap every <a> that does not contain an image, replacing it
    with its rendered contents so bare links do not reach the ebook."""
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is None:
                anchor.replaceWith(anchor.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self,soup):
    """Clean article HTML: drop empty div ids, rebuild photo-gallery
    pages from their thumbnails, and strip remaining photo blocks and
    anchors."""
    #delete empty id attributes--they screw up the TOC for unknown reasons
    divtags = soup.findAll('div',attrs={'id':''})
    if divtags:
        for div in divtags:
            del(div['id'])
    pgall = soup.find('div',attrs={'id':'storyphoto'})
    if pgall is not None: # photo gallery perhaps
        if (soup.find('div',attrs={'id':'storycontent'}) is None):
            # No story body: treat the page as a gallery and rebuild it
            # as a div of full-size images with their captions.
            allpics = Tag(soup,'div')
            first_img = pgall.find('div','storyimage')
            if first_img is not None:
                first_img.extract()
            tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
            if tlist is not None:
                for atag in tlist.findAll('a'):
                    img = Tag(soup,'img')
                    # Drop the query string to get the full-size image URL.
                    srcpre, sep, srcpost = atag.img['src'].partition('?')
                    img['src'] = srcpre
                    pdesc = Tag(soup,'p')
                    pdesc.insert(0,atag.img['alt'])
                    pdesc['class']='photocaption'
                    div = Tag(soup,'div')
                    div.insert(0,pdesc)
                    div.insert(0,img)
                    allpics.append(div)
            pgall.replaceWith(allpics)
    # Remove any photo blocks still present (regular articles keep text only).
    for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
        pg.extract()
    return self.strip_anchors(soup)
feeds = [ def parse_index(self):
('News',
'http://rss.canada.com/get/?F297'), articles = {}
('Sports', ans = []
'http://rss.canada.com/get/?F299'),
('Entertainment',
'http://rss.canada.com/get/?F7366'),
('Business',
'http://rss.canada.com/get/?F6939'),
]
def handle_article(adiv,key):
    # Extract one article link (h1 > a) from a section-page widget and
    # file it under section `key`.  Skips absolute (off-site) links,
    # duplicates already in self.url_list, and media-only items whose
    # title contains VIDEO/GALLERY/PHOTOS.
    h1tag = adiv.h1
    if h1tag is not None:
        atag = h1tag.a
        if atag is not None:
            url = atag['href']
            if atag['href'].startswith('http'):
                return
            elif atag['href'].startswith('/'):
                url = self.url_prefix+atag['href']
            else:
                url = self.url_prefix+'/'+atag['href']
            if url in self.url_list:
                # Already collected (possibly under another section).
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag,False)
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            dtag = adiv.find('div','content')
            description=''
            print("URL "+url)
            print("TITLE "+title)
            if dtag is not None:
                stag = dtag.span
                if stag is not None:
                    # Use the span text unless it is just the timestamp.
                    if stag['class'] != 'timestamp':
                        description = self.tag_to_string(stag,False)
                else:
                    description = self.tag_to_string(dtag,False)
                print("DESCRIPTION: "+description)
            # NOTE: dict.has_key is Python 2 only.
            if not articles.has_key(key):
                articles[key] = []
            articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
def parse_web_index(key, keyurl):
    # Parse one section index page, feeding each article widget to
    # handle_article.  Sections whose page fails to load are skipped
    # silently (deliberate best-effort fetch).
    try:
        soup = self.index_to_soup(self.url_prefix+keyurl)
    except:
        return
    ans.append(key)
    mainsoup = soup.find('div','bodywrapper')
    footer = mainsoup.find(attrs={'id':'footerfeature'})
    if footer is not None:
        footer.extract()
    print("Section: "+key)
    # Featured slider stories first (extracted so they are not re-seen),
    # then the regular feature widgets.
    for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
        handle_article(wdiv,key)
        wdiv.extract()
    for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
        for adiv in wdiv.findAll('div','featurecontent'):
            handle_article(adiv,key)
for (k,url) in self.postmedia_index_pages:
    parse_web_index(k,url)
# Keep only sections that produced articles, preserving page order.
# NOTE: dict.has_key is Python 2 only.
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans

View File

@ -1,105 +1,141 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
www.canada.com www.canada.com
''' '''
import string, re
import re from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following four lines for the Victoria Times Colonist postmedia_index_pages = [
## title = u'Victoria Times Colonist' (u'Headlines',u'/index.html'),
## url_prefix = 'http://www.timescolonist.com' (u'Ottawa & Area',u'/news/ottawa/index.html'),
## description = u'News from Victoria, BC' (u'Vancouver',u'/news/vancouver/index.html'),
## fp_tag = 'CAN_TC' (u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following four lines for the Vancouver Province
# un-comment the following six lines for the Vancouver Province
## title = u'Vancouver Province' ## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com' ## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP' ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
## logo_url = 'vplogo.jpg'
## fp_tag = 'CAN_TP'
# un-comment the following four lines for the Vancouver Sun # un-comment the following six lines for the Vancouver Sun
## title = u'Vancouver Sun' ## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com' ## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
## logo_url = 'vslogo.jpg'
## fp_tag = 'CAN_VS' ## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal # un-comment the following six lines for the Calgary Herald
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald' ## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
## logo_url = 'chlogo.jpg'
## fp_tag = 'CAN_CH' ## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post # un-comment the following six lines for the Edmonton Journal
## title = u'Regina Leader-Post' ## title = u'Edmonton Journal'
## url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Regina, SK' ## description = u'News from Edmonton, AB'
## fp_tag = '' ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
## logo_url = 'ejlogo.jpg'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Saskatoon Star-Phoenix # un-comment the following six lines for the Ottawa Citizen
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
title = u'Ottawa Citizen' title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com' url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON' description = u'News from Ottawa, ON'
std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
logo_url = 'oclogo.jpg'
fp_tag = 'CAN_OC' fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette # un-comment the following six lines for the Montreal Gazette
## title = u'Montreal Gazette' ## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
## logo_url = 'mglogo.jpg'
## fp_tag = 'CAN_MG' ## fp_tag = 'CAN_MG'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = ''' extra_css = '''
.timestamp { font-size:xx-small; display: block; } .timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; } #storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; } #storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; } #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; } .byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic } #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; }''' .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] #photocredit { font-size: xx-small; font-weight: normal; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags = [{'class':'comments'}, remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
daysback=1 daysback=1
@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe):
cover = None cover = None
return cover return cover
def prepare_masthead_image(self, path_to_image, out_path):
if self.Kindle_Fire:
from calibre import fit_image
from calibre.utils.magick import Image, create_canvas
img = Image()
img.open(path_to_image)
width, height = img.size
img2 = create_canvas(width, height)
img2.compose(img)
img2.save(out_path)
else:
BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
def fixChars(self,string): def fixChars(self,string):
# Replace lsquo (\x91) # Replace lsquo (\x91)
fixed = re.sub("\x91","",string) fixed = re.sub("\x91","",string)
@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe):
a.replaceWith(a.renderContents().decode('cp1252','replace')) a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self,soup): def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
pgall = soup.find('div',attrs={'id':'storyphoto'})
if pgall is not None: # photo gallery perhaps
if (soup.find('div',attrs={'id':'storycontent'}) is None):
allpics = Tag(soup,'div')
first_img = pgall.find('div','storyimage')
if first_img is not None:
first_img.extract()
tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
if tlist is not None:
for atag in tlist.findAll('a'):
img = Tag(soup,'img')
srcpre, sep, srcpost = atag.img['src'].partition('?')
img['src'] = srcpre
pdesc = Tag(soup,'p')
pdesc.insert(0,atag.img['alt'])
pdesc['class']='photocaption'
div = Tag(soup,'div')
div.insert(0,pdesc)
div.insert(0,img)
allpics.append(div)
pgall.replaceWith(allpics)
for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
pg.extract()
return self.strip_anchors(soup) return self.strip_anchors(soup)
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {} articles = {}
key = 'News' ans = []
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): def handle_article(adiv,key):
#self.log(" div class = %s" % divtag['class']) h1tag = adiv.h1
if divtag['class'].startswith('section_title'): if h1tag is not None:
# div contains section title atag = h1tag.a
if not divtag.h3: if atag is not None:
continue url = atag['href']
key = self.tag_to_string(divtag.h3,False) if atag['href'].startswith('http'):
ans.append(key) return
self.log("Section name %s" % key) elif atag['href'].startswith('/'):
continue url = self.url_prefix+atag['href']
# div contains article data else:
h1tag = divtag.find('h1') url = self.url_prefix+'/'+atag['href']
if not h1tag: if url in self.url_list:
continue return
atag = h1tag.find('a',href=True) self.url_list.append(url)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False) title = self.tag_to_string(atag,False)
#self.log("title %s" % title) if 'VIDEO' in title.upper():
pubdate = '' return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
dtag = adiv.find('div','content')
description='' description=''
ptag = divtag.find('p'); print("URL "+url)
if ptag: print("TITLE "+title)
description = self.tag_to_string(ptag,False) if dtag is not None:
#self.log("description %s" % description) stag = dtag.span
author = '' if stag is not None:
autag = divtag.find('h4') if stag['class'] != 'timestamp':
if autag: description = self.tag_to_string(stag,False)
author = self.tag_to_string(autag,False) else:
#self.log("author %s" % author) description = self.tag_to_string(dtag,False)
print("DESCRIPTION: "+description)
if not articles.has_key(key): if not articles.has_key(key):
articles[key] = [] articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
def parse_web_index(key, keyurl):
try:
soup = self.index_to_soup(self.url_prefix+keyurl)
except:
return
ans.append(key)
mainsoup = soup.find('div','bodywrapper')
footer = mainsoup.find(attrs={'id':'footerfeature'})
if footer is not None:
footer.extract()
print("Section: "+key)
for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
handle_article(wdiv,key)
wdiv.extract()
for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
for adiv in wdiv.findAll('div','featurecontent'):
handle_article(adiv,key)
for (k,url) in self.postmedia_index_pages:
parse_web_index(k,url)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans return ans

View File

@ -1,136 +1,320 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Province postmedia_index_pages = [
(u'Headlines',u'/index.html'),
(u'Ottawa & Area',u'/news/ottawa/index.html'),
(u'Vancouver',u'/news/vancouver/index.html'),
(u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following six lines for the Vancouver Province
title = u'Vancouver Province' title = u'Vancouver Province'
url_prefix = 'http://www.theprovince.com' url_prefix = 'http://www.theprovince.com'
description = u'News from Vancouver, BC' description = u'News from Vancouver, BC'
std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
logo_url = 'vplogo.jpg'
fp_tag = 'CAN_TP'
# un-comment the following three lines for the Vancouver Sun # un-comment the following six lines for the Vancouver Sun
#title = u'Vancouver Sun' ## title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com' ## url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
## logo_url = 'vslogo.jpg'
## fp_tag = 'CAN_VS'
# un-comment the following three lines for the Edmonton Journal # un-comment the following six lines for the Calgary Herald
#title = u'Edmonton Journal' ## title = u'Calgary Herald'
#url_prefix = 'http://www.edmontonjournal.com' ## url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Edmonton, AB' ## description = u'News from Calgary, AB'
## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
## logo_url = 'chlogo.jpg'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Calgary Herald # un-comment the following six lines for the Edmonton Journal
#title = u'Calgary Herald' ## title = u'Edmonton Journal'
#url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Calgary, AB' ## description = u'News from Edmonton, AB'
## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
## logo_url = 'ejlogo.jpg'
## fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Regina Leader-Post # un-comment the following six lines for the Ottawa Citizen
#title = u'Regina Leader-Post' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Regina, SK' ## description = u'News from Ottawa, ON'
## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
## logo_url = 'oclogo.jpg'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following six lines for the Montreal Gazette
#title = u'Saskatoon Star-Phoenix' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Montreal, QC'
## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
# un-comment the following three lines for the Windsor Star ## logo_url = 'mglogo.jpg'
#title = u'Windsor Star' ## fp_tag = 'CAN_MG'
#url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON'
# un-comment the following three lines for the Ottawa Citizen
#title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON'
# un-comment the following three lines for the Montreal Gazette
#title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = ''' extra_css = '''
.timestamp { font-size:xx-small; display: block; } .timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; } #storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; } #storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; } #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; } .byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic } #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; }''' .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] #photocredit { font-size: xx-small; font-weight: normal; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags = [{'class':'comments'}, remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self):
    """Return the URL of today's Newseum front-page image for this
    paper, walking back up to six days; None if unavailable.

    NOTE(review): the bare except clauses also trap SystemExit and
    KeyboardInterrupt; `datetime` is imported but unused.
    """
    from datetime import timedelta, datetime, date
    cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
    br = BasicNewsRecipe.get_browser()
    daysback=1
    try:
        br.open(cover)
    except:
        while daysback<7:
            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
            br = BasicNewsRecipe.get_browser()
            try:
                br.open(cover)
            except:
                daysback = daysback+1
                continue
            break
    if daysback==7:
        self.log("\nCover unavailable")
        cover = None
    return cover
def prepare_masthead_image(self, path_to_image, out_path):
    """Write the masthead to out_path; on the Kindle Fire, re-compose
    the image onto a same-size canvas instead of the default handling."""
    if self.Kindle_Fire:
        # fit_image is imported but unused here.
        from calibre import fit_image
        from calibre.utils.magick import Image, create_canvas
        img = Image()
        img.open(path_to_image)
        width, height = img.size
        img2 = create_canvas(width, height)
        img2.compose(img)
        img2.save(out_path)
    else:
        BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
def fixChars(self,string):
    """Map cp1252 'smart punctuation' code points to replacements the
    Kindle TOC can render.  NOTE(review): \x91, \x92, \x96 and &#x2019;
    map to the empty string here -- confirm replacements were not lost."""
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    fixed = re.sub("&#x2019;","",fixed)
    return fixed
def massageNCXText(self, description):
    """Sanitise a description for the Kindle NCX TOC.
    NOTE(review): BeautifulStoneSoup is not among the visible imports;
    the '&' -> '&' substitution was probably '&amp;' originally."""
    # Kindle TOC descriptions won't render certain characters
    if description:
        massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&' with '&'
        massaged = re.sub("&","&", massaged)
        return self.fixChars(massaged)
    else:
        return description
def populate_article_metadata(self, article, soup, first):
    """On an article's first page, register a TOC thumbnail from the
    first body image and backfill an empty summary from og:description."""
    if first:
        picdiv = soup.find('body').find('img')
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta',attrs={'property':'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Unwrap every <a> with no image inside, replacing it with its
    rendered contents so bare links do not survive into the ebook."""
    paras = soup.findAll(True)
    for para in paras:
        aTags = para.findAll('a')
        for a in aTags:
            if a.img is None:
                a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self,soup): def preprocess_html(self,soup):
#delete iempty id attributes--they screw up the TOC for unknow reasons #delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''}) divtags = soup.findAll('div',attrs={'id':''})
if divtags: if divtags:
for div in divtags: for div in divtags:
del(div['id']) del(div['id'])
return soup
pgall = soup.find('div',attrs={'id':'storyphoto'})
if pgall is not None: # photo gallery perhaps
if (soup.find('div',attrs={'id':'storycontent'}) is None):
allpics = Tag(soup,'div')
first_img = pgall.find('div','storyimage')
if first_img is not None:
first_img.extract()
tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
if tlist is not None:
for atag in tlist.findAll('a'):
img = Tag(soup,'img')
srcpre, sep, srcpost = atag.img['src'].partition('?')
img['src'] = srcpre
pdesc = Tag(soup,'p')
pdesc.insert(0,atag.img['alt'])
pdesc['class']='photocaption'
div = Tag(soup,'div')
div.insert(0,pdesc)
div.insert(0,img)
allpics.append(div)
pgall.replaceWith(allpics)
for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
pg.extract()
return self.strip_anchors(soup)
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {} articles = {}
key = 'News' ans = []
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): def handle_article(adiv,key):
#self.log(" div class = %s" % divtag['class']) h1tag = adiv.h1
if divtag['class'].startswith('section_title'): if h1tag is not None:
# div contains section title atag = h1tag.a
if not divtag.h3: if atag is not None:
continue url = atag['href']
key = self.tag_to_string(divtag.h3,False) if atag['href'].startswith('http'):
ans.append(key) return
self.log("Section name %s" % key) elif atag['href'].startswith('/'):
continue url = self.url_prefix+atag['href']
# div contains article data else:
h1tag = divtag.find('h1') url = self.url_prefix+'/'+atag['href']
if not h1tag: if url in self.url_list:
continue return
atag = h1tag.find('a',href=True) self.url_list.append(url)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False) title = self.tag_to_string(atag,False)
#self.log("title %s" % title) if 'VIDEO' in title.upper():
pubdate = '' return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
dtag = adiv.find('div','content')
description='' description=''
ptag = divtag.find('p'); print("URL "+url)
if ptag: print("TITLE "+title)
description = self.tag_to_string(ptag,False) if dtag is not None:
#self.log("description %s" % description) stag = dtag.span
author = '' if stag is not None:
autag = divtag.find('h4') if stag['class'] != 'timestamp':
if autag: description = self.tag_to_string(stag,False)
author = self.tag_to_string(autag,False) else:
#self.log("author %s" % author) description = self.tag_to_string(dtag,False)
print("DESCRIPTION: "+description)
if not articles.has_key(key): if not articles.has_key(key):
articles[key] = [] articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
def parse_web_index(key, keyurl):
try:
soup = self.index_to_soup(self.url_prefix+keyurl)
except:
return
ans.append(key)
mainsoup = soup.find('div','bodywrapper')
footer = mainsoup.find(attrs={'id':'footerfeature'})
if footer is not None:
footer.extract()
print("Section: "+key)
for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
handle_article(wdiv,key)
wdiv.extract()
for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
for adiv in wdiv.findAll('div','featurecontent'):
handle_article(adiv,key)
for (k,url) in self.postmedia_index_pages:
parse_web_index(k,url)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans return ans

View File

@ -1,105 +1,141 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
''' '''
www.canada.com www.canada.com
''' '''
import string, re
import re from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following four lines for the Victoria Times Colonist postmedia_index_pages = [
## title = u'Victoria Times Colonist' (u'Headlines',u'/index.html'),
## url_prefix = 'http://www.timescolonist.com' (u'Ottawa & Area',u'/news/ottawa/index.html'),
## description = u'News from Victoria, BC' (u'Vancouver',u'/news/vancouver/index.html'),
## fp_tag = 'CAN_TC' (u'Calgary',u'/news/calgary/index.html'),
(u'Edmonton',u'/news/edmonton/index.html'),
(u'Montreal',u'/news/montreal/index.html'),
(u'Fraser Valley',u'/news/fraser-valley/index.html'),
(u'British Columbia',u'/news/bc/index.html'),
(u'Alberta',u'/news/alberta/index.html'),
(u'Canada',u'/news/canada/index.html'),
(u'National',u'/news/national/index.html'),
(u'Politics',u'/news/politics/index.html'),
(u'Insight',u'/news/insight/index.html'),
(u'Special Reports',u'/news/specialreports/index.html'),
(u'Gangs',u'/news/gangs/index.html'),
(u'Education',u'/news/education/index.html'),
(u'Health',u'/news/health/index.html'),
(u'Environment',u'/news/environment/index.html'),
(u'World',u'/news/world/index.html'),
(u'Police Blotter',u'/news/crime-and-justice/index.html'),
(u'Crime',u'/news/blotter/index.html'),
(u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
(u'Diplomatica',u'/news/diplomatica/index.html'),
(u'Opinion',u'/opinion/index.html'),
(u'Columnists',u'/columnists/index.html'),
(u'Editorials',u'/opinion/editorials/index.html'),
(u'Letters',u'/opinion/letters/index.html'),
(u'Business',u'/business/index.html'),
(u'Sports',u'/sports/index.html'),
(u'Arts',u'/entertainment/index.html'),
(u'Life',u'/life/index.html'),
(u'Technology',u'/technology/index.html'),
(u'Travel',u'/travel/index.html'),
(u'Health',u'/health/index.html')
]
# un-comment the following four lines for the Vancouver Province
# un-comment the following six lines for the Vancouver Province
## title = u'Vancouver Province' ## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com' ## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP' ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
## logo_url = 'vplogo.jpg'
## fp_tag = 'CAN_TP'
# un-comment the following four lines for the Vancouver Sun # un-comment the following six lines for the Vancouver Sun
title = u'Vancouver Sun' title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com' url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC' description = u'News from Vancouver, BC'
std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
logo_url = 'vslogo.jpg'
fp_tag = 'CAN_VS' fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal # un-comment the following six lines for the Calgary Herald
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald' ## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
## logo_url = 'chlogo.jpg'
## fp_tag = 'CAN_CH' ## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post # un-comment the following six lines for the Edmonton Journal
## title = u'Regina Leader-Post' ## title = u'Edmonton Journal'
## url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Regina, SK' ## description = u'News from Edmonton, AB'
## fp_tag = '' ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
## logo_url = 'ejlogo.jpg'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Saskatoon Star-Phoenix # un-comment the following six lines for the Ottawa Citizen
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
## logo_url = 'oclogo.jpg'
## fp_tag = 'CAN_OC' ## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette # un-comment the following six lines for the Montreal Gazette
## title = u'Montreal Gazette' ## title = u'Montreal Gazette'
## url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
## description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
## logo_url = 'mglogo.jpg'
## fp_tag = 'CAN_MG' ## fp_tag = 'CAN_MG'
Kindle_Fire=False
masthead_url = std_logo_url
url_list = []
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
encoding = 'utf-8'
extra_css = ''' extra_css = '''
.timestamp { font-size:xx-small; display: block; } .timestamp { font-size:xx-small; display: block; }
#storyheader { font-size: medium; } #storyheader { font-size: medium; }
#storyheader h1 { font-size: x-large; } #storyheader h1 { font-size: x-large; }
#storyheader h2 { font-size: large; font-style: italic; } #storyheader h2 { font-size: small; font-style: italic; }
.byline { font-size:xx-small; } .byline { font-size:xx-small; }
#photocaption { font-size: small; font-style: italic } #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
#photocredit { font-size: xx-small; }''' .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] #photocredit { font-size: xx-small; font-weight: normal; }'''
keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
remove_tags = [{'class':'comments'}, remove_tags = [{'class':'comments'},
dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
dict(name='h2', attrs={'id':'photocredit'}),
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='div', attrs={'class':'rule_grey_solid'}),
dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
def get_cover_url(self): def get_cover_url(self):
from datetime import timedelta, date from datetime import timedelta, datetime, date
if self.fp_tag=='':
return None
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser() br = BasicNewsRecipe.get_browser()
daysback=1 daysback=1
@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe):
cover = None cover = None
return cover return cover
def prepare_masthead_image(self, path_to_image, out_path):
if self.Kindle_Fire:
from calibre import fit_image
from calibre.utils.magick import Image, create_canvas
img = Image()
img.open(path_to_image)
width, height = img.size
img2 = create_canvas(width, height)
img2.compose(img)
img2.save(out_path)
else:
BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
def fixChars(self,string): def fixChars(self,string):
# Replace lsquo (\x91) # Replace lsquo (\x91)
fixed = re.sub("\x91","",string) fixed = re.sub("\x91","",string)
@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe):
a.replaceWith(a.renderContents().decode('cp1252','replace')) a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup return soup
def preprocess_html(self,soup): def preprocess_html(self,soup):
#delete empty id attributes--they screw up the TOC for unknown reasons
divtags = soup.findAll('div',attrs={'id':''})
if divtags:
for div in divtags:
del(div['id'])
pgall = soup.find('div',attrs={'id':'storyphoto'})
if pgall is not None: # photo gallery perhaps
if (soup.find('div',attrs={'id':'storycontent'}) is None):
allpics = Tag(soup,'div')
first_img = pgall.find('div','storyimage')
if first_img is not None:
first_img.extract()
tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
if tlist is not None:
for atag in tlist.findAll('a'):
img = Tag(soup,'img')
srcpre, sep, srcpost = atag.img['src'].partition('?')
img['src'] = srcpre
pdesc = Tag(soup,'p')
pdesc.insert(0,atag.img['alt'])
pdesc['class']='photocaption'
div = Tag(soup,'div')
div.insert(0,pdesc)
div.insert(0,img)
allpics.append(div)
pgall.replaceWith(allpics)
for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
pg.extract()
return self.strip_anchors(soup) return self.strip_anchors(soup)
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
articles = {} articles = {}
key = 'News' ans = []
ans = ['News']
# Find each instance of class="sectiontitle", class="featurecontent"
for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): def handle_article(adiv,key):
#self.log(" div class = %s" % divtag['class']) h1tag = adiv.h1
if divtag['class'].startswith('section_title'): if h1tag is not None:
# div contains section title atag = h1tag.a
if not divtag.h3: if atag is not None:
continue url = atag['href']
key = self.tag_to_string(divtag.h3,False) if atag['href'].startswith('http'):
ans.append(key) return
self.log("Section name %s" % key) elif atag['href'].startswith('/'):
continue url = self.url_prefix+atag['href']
# div contains article data else:
h1tag = divtag.find('h1') url = self.url_prefix+'/'+atag['href']
if not h1tag: if url in self.url_list:
continue return
atag = h1tag.find('a',href=True) self.url_list.append(url)
if not atag:
continue
url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key)
#self.log("url %s" % url)
title = self.tag_to_string(atag,False) title = self.tag_to_string(atag,False)
#self.log("title %s" % title) if 'VIDEO' in title.upper():
pubdate = '' return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
dtag = adiv.find('div','content')
description='' description=''
ptag = divtag.find('p'); print("URL "+url)
if ptag: print("TITLE "+title)
description = self.tag_to_string(ptag,False) if dtag is not None:
#self.log("description %s" % description) stag = dtag.span
author = '' if stag is not None:
autag = divtag.find('h4') if stag['class'] != 'timestamp':
if autag: description = self.tag_to_string(stag,False)
author = self.tag_to_string(autag,False) else:
#self.log("author %s" % author) description = self.tag_to_string(dtag,False)
print("DESCRIPTION: "+description)
if not articles.has_key(key): if not articles.has_key(key):
articles[key] = [] articles[key] = []
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
def parse_web_index(key, keyurl):
try:
soup = self.index_to_soup(self.url_prefix+keyurl)
except:
return
ans.append(key)
mainsoup = soup.find('div','bodywrapper')
footer = mainsoup.find(attrs={'id':'footerfeature'})
if footer is not None:
footer.extract()
print("Section: "+key)
for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
handle_article(wdiv,key)
wdiv.extract()
for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
for adiv in wdiv.findAll('div','featurecontent'):
handle_article(adiv,key)
for (k,url) in self.postmedia_index_pages:
parse_web_index(k,url)
ans = [(key, articles[key]) for key in ans if articles.has_key(key)] ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans return ans