From 5132aba5f0434b231cbfe3d5d02acf64d1433f6c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2010 21:01:43 -0700 Subject: [PATCH] New recipes for various CanWest Canadian news sources by Nick Redding --- resources/recipes/calgary_herald.recipe | 121 +++++++++++++++ resources/recipes/edmonton_journal.recipe | 126 ++++++++++++++++ resources/recipes/montreal_gazette.recipe | 96 ++++++++++++ resources/recipes/ottawa_citizen.recipe | 101 +++++++++++++ resources/recipes/regina_leader_post.recipe | 116 ++++++++++++++ .../recipes/saskatoon_star_phoenix.recipe | 111 ++++++++++++++ resources/recipes/vancouver_provice.recipe | 136 +++++++++++++++++ resources/recipes/vancouver_sun.recipe | 131 ++++++++++++++++ resources/recipes/vic_times.recipe | 141 ++++++++++++++++++ resources/recipes/windows_star.recipe | 106 +++++++++++++ 10 files changed, 1185 insertions(+) create mode 100644 resources/recipes/calgary_herald.recipe create mode 100644 resources/recipes/edmonton_journal.recipe create mode 100644 resources/recipes/montreal_gazette.recipe create mode 100644 resources/recipes/ottawa_citizen.recipe create mode 100644 resources/recipes/regina_leader_post.recipe create mode 100644 resources/recipes/saskatoon_star_phoenix.recipe create mode 100644 resources/recipes/vancouver_provice.recipe create mode 100644 resources/recipes/vancouver_sun.recipe create mode 100644 resources/recipes/vic_times.recipe create mode 100644 resources/recipes/windows_star.recipe diff --git a/resources/recipes/calgary_herald.recipe b/resources/recipes/calgary_herald.recipe new file mode 100644 index 0000000000..884a951d96 --- /dev/null +++ b/resources/recipes/calgary_herald.recipe @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Calgary Herald + title = u'Calgary Herald' + url_prefix = 
'http://www.calgaryherald.com' + description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + 
dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/edmonton_journal.recipe b/resources/recipes/edmonton_journal.recipe new file mode 100644 
index 0000000000..ac28b18f71 --- /dev/null +++ b/resources/recipes/edmonton_journal.recipe @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Edmonton Journal + title = u'Edmonton Journal' + url_prefix = 'http://www.edmontonjournal.com' + description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: 
italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = 
self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/montreal_gazette.recipe b/resources/recipes/montreal_gazette.recipe new file mode 100644 index 0000000000..3061cc37e4 --- /dev/null +++ b/resources/recipes/montreal_gazette.recipe @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Montreal Gazette + title = u'Montreal Gazette' + url_prefix = 'http://www.montrealgazette.com' + description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', 
attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/ottawa_citizen.recipe 
b/resources/recipes/ottawa_citizen.recipe new file mode 100644 index 0000000000..5465212d4c --- /dev/null +++ b/resources/recipes/ottawa_citizen.recipe @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Ottawa Citizen + title = u'Ottawa Citizen' + url_prefix = 'http://www.ottawacitizen.com' + description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = 
soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/regina_leader_post.recipe b/resources/recipes/regina_leader_post.recipe new file mode 100644 index 0000000000..9efec51848 --- /dev/null +++ b/resources/recipes/regina_leader_post.recipe @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # 
un-comment the following three lines for the Regina Leader-Post + title = u'Regina Leader-Post' + url_prefix = 'http://www.leaderpost.com' + description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', 
attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/saskatoon_star_phoenix.recipe b/resources/recipes/saskatoon_star_phoenix.recipe new file mode 100644 index 0000000000..25330478d4 --- /dev/null +++ 
b/resources/recipes/saskatoon_star_phoenix.recipe @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Saskatoon Star-Phoenix + title = u'Saskatoon Star-Phoenix' + url_prefix = 'http://www.thestarphoenix.com' + description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', 
attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_provice.recipe b/resources/recipes/vancouver_provice.recipe new file mode 100644 index 
0000000000..9375670c59 --- /dev/null +++ b/resources/recipes/vancouver_provice.recipe @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Province + title = u'Vancouver Province' + url_prefix = 'http://www.theprovince.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + 
__author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="sectiontitle", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + 
if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vancouver_sun.recipe b/resources/recipes/vancouver_sun.recipe new file mode 100644 index 0000000000..8f12869bf9 --- /dev/null +++ b/resources/recipes/vancouver_sun.recipe @@ -0,0 +1,131 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Vancouver Sun + title = u'Vancouver Sun' + url_prefix = 'http://www.vancouversun.com' + description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three 
lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete iempty id attributes--they screw up the TOC for unknow reasons + divtags = 
soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="section_title02", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/vic_times.recipe b/resources/recipes/vic_times.recipe new file mode 100644 index 0000000000..2dc8e96003 --- /dev/null +++ b/resources/recipes/vic_times.recipe @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three
lines for the Victoria Times Colonist + title = u'Victoria Times Colonist' + url_prefix = 'http://www.timescolonist.com' + description = u'News from Victoria, BC' + + # un-comment the following three lines for the Vancouver Province + #title = u'Vancouver Province' + #url_prefix = 'http://www.theprovince.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Vancouver Sun + #title = u'Vancouver Sun' + #url_prefix = 'http://www.vancouversun.com' + #description = u'News from Vancouver, BC' + + # un-comment the following three lines for the Edmonton Journal + #title = u'Edmonton Journal' + #url_prefix = 'http://www.edmontonjournal.com' + #description = u'News from Edmonton, AB' + + # un-comment the following three lines for the Calgary Herald + #title = u'Calgary Herald' + #url_prefix = 'http://www.calgaryherald.com' + #description = u'News from Calgary, AB' + + # un-comment the following three lines for the Regina Leader-Post + #title = u'Regina Leader-Post' + #url_prefix = 'http://www.leaderpost.com' + #description = u'News from Regina, SK' + + # un-comment the following three lines for the Saskatoon Star-Phoenix + #title = u'Saskatoon Star-Phoenix' + #url_prefix = 'http://www.thestarphoenix.com' + #description = u'News from Saskatoon, SK' + + # un-comment the following three lines for the Windsor Star + #title = u'Windsor Star' + #url_prefix = 'http://www.windsorstar.com' + #description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { 
font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="section_title02", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url =
self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' + description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans diff --git a/resources/recipes/windows_star.recipe b/resources/recipes/windows_star.recipe new file mode 100644 index 0000000000..4d34261bb7 --- /dev/null +++ b/resources/recipes/windows_star.recipe @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +''' +www.canada.com +''' + +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class CanWestPaper(BasicNewsRecipe): + + # un-comment the following three lines for the Windsor Star + title = u'Windsor Star' + url_prefix = 'http://www.windsorstar.com' + description = u'News from Windsor, ON' + + # un-comment the following three lines for the Ottawa Citizen + #title = u'Ottawa Citizen' + #url_prefix = 'http://www.ottawacitizen.com' + #description = u'News from Ottawa, ON' + + # un-comment the following three lines for the Montreal Gazette + #title = u'Montreal Gazette' + #url_prefix = 'http://www.montrealgazette.com' + #description = u'News from Montreal, QC' + + + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: large; font-style: italic; } + .byline { 
font-size:xx-small; } + #photocaption { font-size: small; font-style: italic } + #photocredit { font-size: xx-small; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + return soup + + + def parse_index(self): + soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') + + articles = {} + key = 'News' + ans = ['News'] + + # Find each instance of class="section_title02", class="featurecontent" + for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): + #self.log(" div class = %s" % divtag['class']) + if divtag['class'].startswith('section_title'): + # div contains section title + if not divtag.h3: + continue + key = self.tag_to_string(divtag.h3,False) + ans.append(key) + self.log("Section name %s" % key) + continue + # div contains article data + h1tag = divtag.find('h1') + if not h1tag: + continue + atag = h1tag.find('a',href=True) + if not atag: + continue + url = self.url_prefix+'/news/todays-paper/'+atag['href'] + #self.log("Section %s" % key) + #self.log("url %s" % url) + title = self.tag_to_string(atag,False) + #self.log("title %s" % title) + pubdate = '' +
description = '' + ptag = divtag.find('p'); + if ptag: + description = self.tag_to_string(ptag,False) + #self.log("description %s" % description) + author = '' + autag = divtag.find('h4') + if autag: + author = self.tag_to_string(autag,False) + #self.log("author %s" % author) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans