Updates to various PostMedia news sources

Kovid Goyal 2013-11-26 08:48:08 +05:30
commit 27c041ee62
parent ea23267f97
7 changed files with 387 additions and 571 deletions

View File

@@ -7,7 +7,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag


 class CanWestPaper(BasicNewsRecipe):
@@ -51,20 +51,20 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
     title = u'Calgary Herald'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,28 +51,28 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
     title = u'Edmonton Journal'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,44 +51,44 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
+    # title = u'Edmonton Journal'
+    # url_prefix = 'http://www.edmontonjournal.com'
+    # description = u'News from Edmonton, AB'
+    # std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    # logo_url = 'ejlogo.jpg'
+    # fp_tag = 'CAN_EJ'

     # un-comment the following six lines for the Ottawa Citizen
-    ## title = u'Ottawa Citizen'
-    ## url_prefix = 'http://www.ottawacitizen.com'
-    ## description = u'News from Ottawa, ON'
-    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
-    ## logo_url = 'oclogo.jpg'
-    ## fp_tag = 'CAN_OC'
+    # title = u'Ottawa Citizen'
+    # url_prefix = 'http://www.ottawacitizen.com'
+    # description = u'News from Ottawa, ON'
+    # std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
+    # logo_url = 'oclogo.jpg'
+    # fp_tag = 'CAN_OC'

     # un-comment the following six lines for the Montreal Gazette
     title = u'Montreal Gazette'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,36 +51,36 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
+    # title = u'Edmonton Journal'
+    # url_prefix = 'http://www.edmontonjournal.com'
+    # description = u'News from Edmonton, AB'
+    # std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    # logo_url = 'ejlogo.jpg'
+    # fp_tag = 'CAN_EJ'

     # un-comment the following six lines for the Ottawa Citizen
     title = u'Ottawa Citizen'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -1,314 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-
-'''
-www.canada.com
-'''
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
-
-
-class CanWestPaper(BasicNewsRecipe):
-
-    postmedia_index_pages = [
-        (u'Headlines',u'/index.html'),
-        (u'Ottawa & Area',u'/news/ottawa/index.html'),
-        (u'Vancouver',u'/news/vancouver/index.html'),
-        (u'Calgary',u'/news/calgary/index.html'),
-        (u'Edmonton',u'/news/edmonton/index.html'),
-        (u'Montreal',u'/news/montreal/index.html'),
-        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
-        (u'British Columbia',u'/news/bc/index.html'),
-        (u'Alberta',u'/news/alberta/index.html'),
-        (u'Canada',u'/news/canada/index.html'),
-        (u'National',u'/news/national/index.html'),
-        (u'Politics',u'/news/politics/index.html'),
-        (u'Insight',u'/news/insight/index.html'),
-        (u'Special Reports',u'/news/specialreports/index.html'),
-        (u'Gangs',u'/news/gangs/index.html'),
-        (u'Education',u'/news/education/index.html'),
-        (u'Health',u'/news/health/index.html'),
-        (u'Environment',u'/news/environment/index.html'),
-        (u'World',u'/news/world/index.html'),
-        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
-        (u'Crime',u'/news/blotter/index.html'),
-        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
-        (u'Diplomatica',u'/news/diplomatica/index.html'),
-        (u'Opinion',u'/opinion/index.html'),
-        (u'Columnists',u'/columnists/index.html'),
-        (u'Editorials',u'/opinion/editorials/index.html'),
-        (u'Letters',u'/opinion/letters/index.html'),
-        (u'Business',u'/business/index.html'),
-        (u'Sports',u'/sports/index.html'),
-        (u'Arts',u'/entertainment/index.html'),
-        (u'Life',u'/life/index.html'),
-        (u'Technology',u'/technology/index.html'),
-        (u'Travel',u'/travel/index.html'),
-        (u'Health',u'/health/index.html')
-    ]
-
-    # un-comment the following six lines for the Vancouver Province
-    title = u'Vancouver Province'
-    url_prefix = 'http://www.theprovince.com'
-    description = u'News from Vancouver, BC'
-    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    logo_url = 'vplogo.jpg'
-    fp_tag = 'CAN_TP'
-
-    # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
-
-    # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
-
-    # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
-
-    # un-comment the following six lines for the Ottawa Citizen
-    ## title = u'Ottawa Citizen'
-    ## url_prefix = 'http://www.ottawacitizen.com'
-    ## description = u'News from Ottawa, ON'
-    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
-    ## logo_url = 'oclogo.jpg'
-    ## fp_tag = 'CAN_OC'
-
-    # un-comment the following six lines for the Montreal Gazette
-    ## title = u'Montreal Gazette'
-    ## url_prefix = 'http://www.montrealgazette.com'
-    ## description = u'News from Montreal, QC'
-    ## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
-    ## logo_url = 'mglogo.jpg'
-    ## fp_tag = 'CAN_MG'
-
-    Kindle_Fire=False
-    masthead_url = std_logo_url
-    url_list = []
-
-    language = 'en_CA'
-    __author__ = 'Nick Redding'
-    no_stylesheets = True
-    timefmt = ' [%b %d]'
-    encoding = 'utf-8'
-    extra_css = '''
-                .timestamp { font-size:xx-small; display: block; }
-                #storyheader { font-size: medium; }
-                #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: small; font-style: italic; }
-                .byline { font-size:xx-small; }
-                #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
-                .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
-                #photocredit { font-size: xx-small; font-weight: normal; }'''
-
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-        dict(name='h2', attrs={'id':'photocredit'}),
-        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-        dict(name='div', attrs={'class':'rule_grey_solid'}),
-        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
-
-    def get_cover_url(self):
-        from datetime import timedelta, date
-        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
-        br = BasicNewsRecipe.get_browser(self)
-        daysback=1
-        try:
-            br.open(cover)
-        except:
-            while daysback<7:
-                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
-                br = BasicNewsRecipe.get_browser(self)
-                try:
-                    br.open(cover)
-                except:
-                    daysback = daysback+1
-                    continue
-                break
-        if daysback==7:
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
-
-    def prepare_masthead_image(self, path_to_image, out_path):
-        if self.Kindle_Fire:
-            from calibre.utils.magick import Image, create_canvas
-            img = Image()
-            img.open(path_to_image)
-            width, height = img.size
-            img2 = create_canvas(width, height)
-            img2.compose(img)
-            img2.save(out_path)
-        else:
-            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
-
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","‘",string)
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","’",fixed)
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","“",fixed)
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","”",fixed)
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","–",fixed)
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","—",fixed)
-        fixed = re.sub("&#x2019;","’",fixed)
-        return fixed
-
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&amp;' with '&'
-            massaged = re.sub("&amp;","&", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
-
-    def populate_article_metadata(self, article, soup, first):
-        if first:
-            picdiv = soup.find('body').find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
-        xtitle = article.text_summary.strip()
-        if len(xtitle) == 0:
-            desc = soup.find('meta',attrs={'property':'og:description'})
-            if desc is not None:
-                article.summary = article.text_summary = desc['content']
-
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
-
-    def preprocess_html(self,soup):
-        #delete empty id attributes--they screw up the TOC for unknown reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
-        pgall = soup.find('div',attrs={'id':'storyphoto'})
-        if pgall is not None: # photo gallery perhaps
-            if (soup.find('div',attrs={'id':'storycontent'}) is None):
-                allpics = Tag(soup,'div')
-                first_img = pgall.find('div','storyimage')
-                if first_img is not None:
-                    first_img.extract()
-                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
-                if tlist is not None:
-                    for atag in tlist.findAll('a'):
-                        img = Tag(soup,'img')
-                        srcpre, sep, srcpost = atag.img['src'].partition('?')
-                        img['src'] = srcpre
-                        pdesc = Tag(soup,'p')
-                        pdesc.insert(0,atag.img['alt'])
-                        pdesc['class']='photocaption'
-                        div = Tag(soup,'div')
-                        div.insert(0,pdesc)
-                        div.insert(0,img)
-                        allpics.append(div)
-                pgall.replaceWith(allpics)
-        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
-            pg.extract()
-        return self.strip_anchors(soup)
-
-    def parse_index(self):
-
-        articles = {}
-        ans = []
-
-        def handle_article(adiv,key):
-            h1tag = adiv.h1
-            if h1tag is not None:
-                atag = h1tag.a
-                if atag is not None:
-                    url = atag['href']
-                    if atag['href'].startswith('http'):
-                        return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
-                    if url in self.url_list:
-                        return
-                    self.url_list.append(url)
-                    title = self.tag_to_string(atag,False)
-                    if 'VIDEO' in title.upper():
-                        return
-                    if 'GALLERY' in title.upper():
-                        return
-                    if 'PHOTOS' in title.upper():
-                        return
-                    dtag = adiv.find('div','content')
-                    description=''
-                    print("URL "+url)
-                    print("TITLE "+title)
-                    if dtag is not None:
-                        stag = dtag.span
-                        if stag is not None:
-                            if stag['class'] != 'timestamp':
-                                description = self.tag_to_string(stag,False)
-                        else:
-                            description = self.tag_to_string(dtag,False)
-                    print("DESCRIPTION: "+description)
-                    if not articles.has_key(key):
-                        articles[key] = []
-                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-        def parse_web_index(key, keyurl):
-            try:
-                soup = self.index_to_soup(self.url_prefix+keyurl)
-            except:
-                return
-            ans.append(key)
-            mainsoup = soup.find('div','bodywrapper')
-            footer = mainsoup.find(attrs={'id':'footerfeature'})
-            if footer is not None:
-                footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
-
-        for (k,url) in self.postmedia_index_pages:
-            parse_web_index(k,url)
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans

View File

@@ -1,105 +1,138 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 '''
 www.canada.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag

 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following four lines for the Victoria Times Colonist
-    ## title = u'Victoria Times Colonist'
-    ## url_prefix = 'http://www.timescolonist.com'
-    ## description = u'News from Victoria, BC'
-    ## fp_tag = 'CAN_TC'
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        (u'Gangs',u'/news/gangs/index.html'),
+        (u'Education',u'/news/education/index.html'),
+        (u'Health',u'/news/health/index.html'),
+        (u'Environment',u'/news/environment/index.html'),
+        (u'World',u'/news/world/index.html'),
+        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
+        (u'Crime',u'/news/blotter/index.html'),
+        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
+        (u'Diplomatica',u'/news/diplomatica/index.html'),
+        (u'Opinion',u'/opinion/index.html'),
+        (u'Columnists',u'/columnists/index.html'),
+        (u'Editorials',u'/opinion/editorials/index.html'),
+        (u'Letters',u'/opinion/letters/index.html'),
+        (u'Business',u'/business/index.html'),
+        (u'Sports',u'/sports/index.html'),
+        (u'Arts',u'/entertainment/index.html'),
+        (u'Life',u'/life/index.html'),
+        (u'Technology',u'/technology/index.html'),
+        (u'Travel',u'/travel/index.html'),
+        (u'Health',u'/health/index.html')
+    ]

-    # un-comment the following four lines for the Vancouver Province
+    # un-comment the following six lines for the Vancouver Province
     title = u'Vancouver Province'
     url_prefix = 'http://www.theprovince.com'
     description = u'News from Vancouver, BC'
-    fp_tag = 'CAN_VP'
+    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    logo_url = 'vplogo.jpg'
+    fp_tag = 'CAN_TP'

-    # un-comment the following four lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## fp_tag = 'CAN_VS'
+    # un-comment the following six lines for the Vancouver Sun
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

-    # un-comment the following four lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
+    # un-comment the following six lines for the Calgary Herald
     ## title = u'Calgary Herald'
     ## url_prefix = 'http://www.calgaryherald.com'
     ## description = u'News from Calgary, AB'
+    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    ## logo_url = 'chlogo.jpg'
     ## fp_tag = 'CAN_CH'

-    # un-comment the following four lines for the Regina Leader-Post
-    ## title = u'Regina Leader-Post'
-    ## url_prefix = 'http://www.leaderpost.com'
-    ## description = u'News from Regina, SK'
-    ## fp_tag = ''
+    # un-comment the following six lines for the Edmonton Journal
+    ## title = u'Edmonton Journal'
+    ## url_prefix = 'http://www.edmontonjournal.com'
+    ## description = u'News from Edmonton, AB'
+    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    ## logo_url = 'ejlogo.jpg'
+    ## fp_tag = 'CAN_EJ'

-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-    ## title = u'Saskatoon Star-Phoenix'
-    ## url_prefix = 'http://www.thestarphoenix.com'
-    ## description = u'News from Saskatoon, SK'
-    ## fp_tag = ''
-
-    # un-comment the following four lines for the Windsor Star
-    ## title = u'Windsor Star'
-    ## url_prefix = 'http://www.windsorstar.com'
-    ## description = u'News from Windsor, ON'
-    ## fp_tag = 'CAN_'
-
-    # un-comment the following four lines for the Ottawa Citizen
+    # un-comment the following six lines for the Ottawa Citizen
     ## title = u'Ottawa Citizen'
     ## url_prefix = 'http://www.ottawacitizen.com'
     ## description = u'News from Ottawa, ON'
+    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
+    ## logo_url = 'oclogo.jpg'
     ## fp_tag = 'CAN_OC'

-    # un-comment the following four lines for the Montreal Gazette
+    # un-comment the following six lines for the Montreal Gazette
     ## title = u'Montreal Gazette'
     ## url_prefix = 'http://www.montrealgazette.com'
     ## description = u'News from Montreal, QC'
+    ## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
+    ## logo_url = 'mglogo.jpg'
     ## fp_tag = 'CAN_MG'

+    Kindle_Fire=False
+    masthead_url = std_logo_url
+    url_list = []
+
     language = 'en_CA'
     __author__ = 'Nick Redding'
     no_stylesheets = True
     timefmt = ' [%b %d]'
+    encoding = 'utf-8'
     extra_css = '''
                 .timestamp { font-size:xx-small; display: block; }
                 #storyheader { font-size: medium; }
                 #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: large; font-style: italic; }
+                #storyheader h2 { font-size: small; font-style: italic; }
                 .byline { font-size:xx-small; }
-                #photocaption { font-size: small; font-style: italic }
-                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+                #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                #photocredit { font-size: xx-small; font-weight: normal; }'''
+
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

     def get_cover_url(self):
         from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -120,6 +153,18 @@ class CanWestPaper(BasicNewsRecipe):
             cover = None
         return cover

+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -166,55 +211,110 @@ class CanWestPaper(BasicNewsRecipe):
                     a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup

-    def preprocess_html(self, soup):
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        pgall = soup.find('div',attrs={'id':'storyphoto'})
+        if pgall is not None: # photo gallery perhaps
+            if (soup.find('div',attrs={'id':'storycontent'}) is None):
+                allpics = Tag(soup,'div')
+                first_img = pgall.find('div','storyimage')
+                if first_img is not None:
+                    first_img.extract()
+                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
+                if tlist is not None:
+                    for atag in tlist.findAll('a'):
+                        img = Tag(soup,'img')
+                        srcpre, sep, srcpost = atag.img['src'].partition('?')
+                        img['src'] = srcpre
+                        pdesc = Tag(soup,'p')
+                        pdesc.insert(0,atag.img['alt'])
+                        pdesc['class']='photocaption'
+                        div = Tag(soup,'div')
+                        div.insert(0,pdesc)
+                        div.insert(0,img)
+                        allpics.append(div)
+                pgall.replaceWith(allpics)
+        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
+            pg.extract()
         return self.strip_anchors(soup)

     def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
         articles = {}
-        key = 'News'
-        ans = ['News']
-
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-            #self.log(" div class = %s" % divtag['class'])
-            if divtag['class'].startswith('section_title'):
-                # div contains section title
-                if not divtag.h3:
-                    continue
-                key = self.tag_to_string(divtag.h3,False)
-                ans.append(key)
-                self.log("Section name %s" % key)
-                continue
-            # div contains article data
-            h1tag = divtag.find('h1')
-            if not h1tag:
-                continue
-            atag = h1tag.find('a',href=True)
-            if not atag:
-                continue
-            url = self.url_prefix+'/news/todays-paper/'+atag['href']
-            #self.log("Section %s" % key)
-            #self.log("url %s" % url)
-            title = self.tag_to_string(atag,False)
-            #self.log("title %s" % title)
-            pubdate = ''
-            description = ''
-            ptag = divtag.find('p');
-            if ptag:
-                description = self.tag_to_string(ptag,False)
-                #self.log("description %s" % description)
-            author = ''
-            autag = divtag.find('h4')
-            if autag:
-                author = self.tag_to_string(autag,False)
-                #self.log("author %s" % author)
-            if not articles.has_key(key):
-                articles[key] = []
-            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+        ans = []
+
+        def handle_article(adiv,key):
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
+            if h1tag is not None:
+                atag = h1tag.a
+                if atag is not None:
+                    url = atag['href']
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
+                        return
+                    if url in self.url_list:
+                        print("Rejected dup "+url)
+                        return
+                    self.url_list.append(url)
+                    title = self.tag_to_string(atag,False)
+                    if 'VIDEO' in title.upper():
+                        return
+                    if 'GALLERY' in title.upper():
+                        return
+                    if 'PHOTOS' in title.upper():
+                        return
+                    dtag = adiv.find('div','content')
+                    description=''
+                    print("URL "+url)
+                    print("TITLE "+title)
+                    if dtag is not None:
+                        stag = dtag.span
+                        if stag is not None:
+                            if stag['class'] != 'timestamp':
+                                description = self.tag_to_string(stag,False)
+                        else:
+                            description = self.tag_to_string(dtag,False)
+                    print("DESCRIPTION: "+description)
+                    if not articles.has_key(key):
+                        articles[key] = []
+                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+        def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
+            try:
+                soup = self.index_to_soup(self.url_prefix+keyurl)
+            except:
+                print("Section: "+key+' NOT FOUND');
+                return
+            ans.append(key)
+            mainsoup = soup.find('div','bodywrapper')
+            footer = mainsoup.find(attrs={'id':'footerfeature'})
+            if footer is not None:
+                footer.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans

View File

@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)