mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)

New recipes for various CanWest Canadian news sources by Nick Redding

This commit is contained in:
parent df019215ca
commit 5132aba5f0

resources/recipes/calgary_herald.recipe (new file, 121 lines)
@@ -0,0 +1,121 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Calgary Herald
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
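
Note: parse_index() above returns a list of (section, article-list) tuples that BasicNewsRecipe then fetches. A minimal sketch of the shape of that return value follows; every value in it is a hypothetical placeholder, since the real titles, URLs and descriptions are scraped from the paper's todays-paper index page.

# Sketch only: the structure parse_index() builds for this recipe.
# All literal values are hypothetical placeholders.
example_index = [
    ('News', [
        dict(title=u'Sample headline',
             url='http://www.calgaryherald.com/news/todays-paper/sample_story.html',
             date='',
             description=u'Teaser paragraph taken from the index page.',
             author=u'Staff reporter',
             content=''),
    ]),
]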

resources/recipes/edmonton_journal.recipe (new file, 126 lines)
@@ -0,0 +1,126 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Edmonton Journal
    title = u'Edmonton Journal'
    url_prefix = 'http://www.edmontonjournal.com'
    description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
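
Note: the preprocess_html() hook shared by these recipes strips empty id attributes before calibre builds the table of contents. Below is a standalone sketch of that same cleanup, assuming the BeautifulSoup 3 parser that calibre recipes of this era used; the recipe itself receives an already-parsed soup from calibre, so the import and sample HTML here are for illustration only.

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, assumed available as in calibre of this era

sample_html = '<div id=""><p>story text</p></div><div id="storycontent">kept as-is</div>'
soup = BeautifulSoup(sample_html)
for div in soup.findAll('div', attrs={'id': ''}):  # only divs whose id is the empty string
    del div['id']
print(soup)  # the empty id attribute is removed; id="storycontent" is untouched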

resources/recipes/montreal_gazette.recipe (new file, 96 lines)
@@ -0,0 +1,96 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Montreal Gazette
    title = u'Montreal Gazette'
    url_prefix = 'http://www.montrealgazette.com'
    description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/ottawa_citizen.recipe (new file, 101 lines)
@@ -0,0 +1,101 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Ottawa Citizen
    title = u'Ottawa Citizen'
    url_prefix = 'http://www.ottawacitizen.com'
    description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/regina_leader_post.recipe (new file, 116 lines)
@@ -0,0 +1,116 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Regina Leader-Post
    title = u'Regina Leader-Post'
    url_prefix = 'http://www.leaderpost.com'
    description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/saskatoon_star_phoenix.recipe (new file, 111 lines)
@@ -0,0 +1,111 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/vancouver_provice.recipe (new file, 136 lines)
@@ -0,0 +1,136 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Vancouver Province
    title = u'Vancouver Province'
    url_prefix = 'http://www.theprovince.com'
    description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Vancouver Sun
    #title = u'Vancouver Sun'
    #url_prefix = 'http://www.vancouversun.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/vancouver_sun.recipe (new file, 131 lines)
@@ -0,0 +1,131 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Vancouver Sun
    title = u'Vancouver Sun'
    url_prefix = 'http://www.vancouversun.com'
    description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/vic_times.recipe (new file, 141 lines)
@@ -0,0 +1,141 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Victoria Times Colonist
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'

    # un-comment the following three lines for the Vancouver Province
    #title = u'Vancouver Province'
    #url_prefix = 'http://www.theprovince.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Vancouver Sun
    #title = u'Vancouver Sun'
    #url_prefix = 'http://www.vancouversun.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans

resources/recipes/windows_star.recipe (new file, 106 lines)
@@ -0,0 +1,106 @@
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Windsor Star
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        # delete empty id attributes--they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
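
Note: all ten recipes in this commit share the same CanWestPaper class body; only the three per-paper lines (title, url_prefix, description) differ, selected by the comment blocks at the top of each file. For convenience, the per-paper settings collected in one place; the names and URLs are taken from the recipes above, while the dict itself and its name are illustrative only.

# Illustrative summary only -- not part of any recipe file above.
CANWEST_PAPERS = {
    u'Calgary Herald':          'http://www.calgaryherald.com',
    u'Edmonton Journal':        'http://www.edmontonjournal.com',
    u'Montreal Gazette':        'http://www.montrealgazette.com',
    u'Ottawa Citizen':          'http://www.ottawacitizen.com',
    u'Regina Leader-Post':      'http://www.leaderpost.com',
    u'Saskatoon Star-Phoenix':  'http://www.thestarphoenix.com',
    u'Vancouver Province':      'http://www.theprovince.com',
    u'Vancouver Sun':           'http://www.vancouversun.com',
    u'Victoria Times Colonist': 'http://www.timescolonist.com',
    u'Windsor Star':            'http://www.windsorstar.com',
}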