Updates to various PostMedia news sources

Kovid Goyal 2013-11-26 08:48:08 +05:30
commit 27c041ee62
parent ea23267f97
7 changed files with 387 additions and 571 deletions

View File

@@ -7,7 +7,7 @@ www.canada.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag


 class CanWestPaper(BasicNewsRecipe):
@@ -51,20 +51,20 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
     title = u'Calgary Herald'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,28 +51,28 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
     title = u'Edmonton Journal'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,44 +51,44 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
+    # title = u'Edmonton Journal'
+    # url_prefix = 'http://www.edmontonjournal.com'
+    # description = u'News from Edmonton, AB'
+    # std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    # logo_url = 'ejlogo.jpg'
+    # fp_tag = 'CAN_EJ'

     # un-comment the following six lines for the Ottawa Citizen
-    ## title = u'Ottawa Citizen'
-    ## url_prefix = 'http://www.ottawacitizen.com'
-    ## description = u'News from Ottawa, ON'
-    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
-    ## logo_url = 'oclogo.jpg'
-    ## fp_tag = 'CAN_OC'
+    # title = u'Ottawa Citizen'
+    # url_prefix = 'http://www.ottawacitizen.com'
+    # description = u'News from Ottawa, ON'
+    # std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
+    # logo_url = 'oclogo.jpg'
+    # fp_tag = 'CAN_OC'

     # un-comment the following six lines for the Montreal Gazette
     title = u'Montreal Gazette'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -51,36 +51,36 @@ class CanWestPaper(BasicNewsRecipe):
     # un-comment the following six lines for the Vancouver Province
-    ## title = u'Vancouver Province'
-    ## url_prefix = 'http://www.theprovince.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    ## logo_url = 'vplogo.jpg'
-    ## fp_tag = 'CAN_TP'
+    # title = u'Vancouver Province'
+    # url_prefix = 'http://www.theprovince.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    # logo_url = 'vplogo.jpg'
+    # fp_tag = 'CAN_TP'

     # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

     # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
+    # title = u'Calgary Herald'
+    # url_prefix = 'http://www.calgaryherald.com'
+    # description = u'News from Calgary, AB'
+    # std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    # logo_url = 'chlogo.jpg'
+    # fp_tag = 'CAN_CH'

     # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
+    # title = u'Edmonton Journal'
+    # url_prefix = 'http://www.edmontonjournal.com'
+    # description = u'News from Edmonton, AB'
+    # std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    # logo_url = 'ejlogo.jpg'
+    # fp_tag = 'CAN_EJ'

     # un-comment the following six lines for the Ottawa Citizen
     title = u'Ottawa Citizen'
@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)

View File

@@ -1,314 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-
-'''
-www.canada.com
-'''
-import re
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag
-
-
-class CanWestPaper(BasicNewsRecipe):
-
-    postmedia_index_pages = [
-        (u'Headlines',u'/index.html'),
-        (u'Ottawa & Area',u'/news/ottawa/index.html'),
-        (u'Vancouver',u'/news/vancouver/index.html'),
-        (u'Calgary',u'/news/calgary/index.html'),
-        (u'Edmonton',u'/news/edmonton/index.html'),
-        (u'Montreal',u'/news/montreal/index.html'),
-        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
-        (u'British Columbia',u'/news/bc/index.html'),
-        (u'Alberta',u'/news/alberta/index.html'),
-        (u'Canada',u'/news/canada/index.html'),
-        (u'National',u'/news/national/index.html'),
-        (u'Politics',u'/news/politics/index.html'),
-        (u'Insight',u'/news/insight/index.html'),
-        (u'Special Reports',u'/news/specialreports/index.html'),
-        (u'Gangs',u'/news/gangs/index.html'),
-        (u'Education',u'/news/education/index.html'),
-        (u'Health',u'/news/health/index.html'),
-        (u'Environment',u'/news/environment/index.html'),
-        (u'World',u'/news/world/index.html'),
-        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
-        (u'Crime',u'/news/blotter/index.html'),
-        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
-        (u'Diplomatica',u'/news/diplomatica/index.html'),
-        (u'Opinion',u'/opinion/index.html'),
-        (u'Columnists',u'/columnists/index.html'),
-        (u'Editorials',u'/opinion/editorials/index.html'),
-        (u'Letters',u'/opinion/letters/index.html'),
-        (u'Business',u'/business/index.html'),
-        (u'Sports',u'/sports/index.html'),
-        (u'Arts',u'/entertainment/index.html'),
-        (u'Life',u'/life/index.html'),
-        (u'Technology',u'/technology/index.html'),
-        (u'Travel',u'/travel/index.html'),
-        (u'Health',u'/health/index.html')
-    ]
-
-    # un-comment the following six lines for the Vancouver Province
-    title = u'Vancouver Province'
-    url_prefix = 'http://www.theprovince.com'
-    description = u'News from Vancouver, BC'
-    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
-    logo_url = 'vplogo.jpg'
-    fp_tag = 'CAN_TP'
-
-    # un-comment the following six lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
-    ## logo_url = 'vslogo.jpg'
-    ## fp_tag = 'CAN_VS'
-
-    # un-comment the following six lines for the Calgary Herald
-    ## title = u'Calgary Herald'
-    ## url_prefix = 'http://www.calgaryherald.com'
-    ## description = u'News from Calgary, AB'
-    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
-    ## logo_url = 'chlogo.jpg'
-    ## fp_tag = 'CAN_CH'
-
-    # un-comment the following six lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
-    ## logo_url = 'ejlogo.jpg'
-    ## fp_tag = 'CAN_EJ'
-
-    # un-comment the following six lines for the Ottawa Citizen
-    ## title = u'Ottawa Citizen'
-    ## url_prefix = 'http://www.ottawacitizen.com'
-    ## description = u'News from Ottawa, ON'
-    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
-    ## logo_url = 'oclogo.jpg'
-    ## fp_tag = 'CAN_OC'
-
-    # un-comment the following six lines for the Montreal Gazette
-    ## title = u'Montreal Gazette'
-    ## url_prefix = 'http://www.montrealgazette.com'
-    ## description = u'News from Montreal, QC'
-    ## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
-    ## logo_url = 'mglogo.jpg'
-    ## fp_tag = 'CAN_MG'
-
-    Kindle_Fire=False
-    masthead_url = std_logo_url
-    url_list = []
-
-    language = 'en_CA'
-    __author__ = 'Nick Redding'
-    no_stylesheets = True
-    timefmt = ' [%b %d]'
-    encoding = 'utf-8'
-    extra_css = '''
-                .timestamp { font-size:xx-small; display: block; }
-                #storyheader { font-size: medium; }
-                #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: small; font-style: italic; }
-                .byline { font-size:xx-small; }
-                #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
-                .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
-                #photocredit { font-size: xx-small; font-weight: normal; }'''
-
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
-        dict(name='h2', attrs={'id':'photocredit'}),
-        dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
-        dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
-        dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
-        dict(name='div', attrs={'class':'rule_grey_solid'}),
-        dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
-
-    def get_cover_url(self):
-        from datetime import timedelta, date
-        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
-        br = BasicNewsRecipe.get_browser(self)
-        daysback=1
-        try:
-            br.open(cover)
-        except:
-            while daysback<7:
-                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
-                br = BasicNewsRecipe.get_browser(self)
-                try:
-                    br.open(cover)
-                except:
-                    daysback = daysback+1
-                    continue
-                break
-        if daysback==7:
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
-
-    def prepare_masthead_image(self, path_to_image, out_path):
-        if self.Kindle_Fire:
-            from calibre.utils.magick import Image, create_canvas
-            img = Image()
-            img.open(path_to_image)
-            width, height = img.size
-            img2 = create_canvas(width, height)
-            img2.compose(img)
-            img2.save(out_path)
-        else:
-            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
-
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","‘",string)
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","’",fixed)
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","“",fixed)
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","”",fixed)
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","–",fixed)
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","—",fixed)
-        fixed = re.sub("&#x2019;","’",fixed)
-        return fixed
-
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&amp;' with '&'
-            massaged = re.sub("&amp;","&", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
-
-    def populate_article_metadata(self, article, soup, first):
-        if first:
-            picdiv = soup.find('body').find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
-        xtitle = article.text_summary.strip()
-        if len(xtitle) == 0:
-            desc = soup.find('meta',attrs={'property':'og:description'})
-            if desc is not None:
-                article.summary = article.text_summary = desc['content']
-
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
-
-    def preprocess_html(self,soup):
-        #delete empty id attributes--they screw up the TOC for unknown reasons
-        divtags = soup.findAll('div',attrs={'id':''})
-        if divtags:
-            for div in divtags:
-                del(div['id'])
-        pgall = soup.find('div',attrs={'id':'storyphoto'})
-        if pgall is not None: # photo gallery perhaps
-            if (soup.find('div',attrs={'id':'storycontent'}) is None):
-                allpics = Tag(soup,'div')
-                first_img = pgall.find('div','storyimage')
-                if first_img is not None:
-                    first_img.extract()
-                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
-                if tlist is not None:
-                    for atag in tlist.findAll('a'):
-                        img = Tag(soup,'img')
-                        srcpre, sep, srcpost = atag.img['src'].partition('?')
-                        img['src'] = srcpre
-                        pdesc = Tag(soup,'p')
-                        pdesc.insert(0,atag.img['alt'])
-                        pdesc['class']='photocaption'
-                        div = Tag(soup,'div')
-                        div.insert(0,pdesc)
-                        div.insert(0,img)
-                        allpics.append(div)
-                pgall.replaceWith(allpics)
-        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
-            pg.extract()
-        return self.strip_anchors(soup)
-
-    def parse_index(self):
-
-        articles = {}
-        ans = []
-
-        def handle_article(adiv,key):
-            h1tag = adiv.h1
-            if h1tag is not None:
-                atag = h1tag.a
-                if atag is not None:
-                    url = atag['href']
-                    if atag['href'].startswith('http'):
-                        return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
-                    if url in self.url_list:
-                        return
-                    self.url_list.append(url)
-                    title = self.tag_to_string(atag,False)
-                    if 'VIDEO' in title.upper():
-                        return
-                    if 'GALLERY' in title.upper():
-                        return
-                    if 'PHOTOS' in title.upper():
-                        return
-                    dtag = adiv.find('div','content')
-                    description=''
-                    print("URL "+url)
-                    print("TITLE "+title)
-                    if dtag is not None:
-                        stag = dtag.span
-                        if stag is not None:
-                            if stag['class'] != 'timestamp':
-                                description = self.tag_to_string(stag,False)
-                        else:
-                            description = self.tag_to_string(dtag,False)
-                    print("DESCRIPTION: "+description)
-                    if not articles.has_key(key):
-                        articles[key] = []
-                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
-
-        def parse_web_index(key, keyurl):
-            try:
-                soup = self.index_to_soup(self.url_prefix+keyurl)
-            except:
-                return
-            ans.append(key)
-            mainsoup = soup.find('div','bodywrapper')
-            footer = mainsoup.find(attrs={'id':'footerfeature'})
-            if footer is not None:
-                footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
-
-        for (k,url) in self.postmedia_index_pages:
-            parse_web_index(k,url)
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return ans

View File

@@ -1,105 +1,138 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
 '''
 www.canada.com
 '''
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, Tag

 class CanWestPaper(BasicNewsRecipe):

-    # un-comment the following four lines for the Victoria Times Colonist
-    ## title = u'Victoria Times Colonist'
-    ## url_prefix = 'http://www.timescolonist.com'
-    ## description = u'News from Victoria, BC'
-    ## fp_tag = 'CAN_TC'
+    postmedia_index_pages = [
+        (u'Headlines',u'/index.html'),
+        (u'Ottawa & Area',u'/news/ottawa/index.html'),
+        (u'Vancouver',u'/news/vancouver/index.html'),
+        (u'Calgary',u'/news/calgary/index.html'),
+        (u'Edmonton',u'/news/edmonton/index.html'),
+        (u'Montreal',u'/news/montreal/index.html'),
+        (u'Fraser Valley',u'/news/fraser-valley/index.html'),
+        (u'British Columbia',u'/news/bc/index.html'),
+        (u'Alberta',u'/news/alberta/index.html'),
+        (u'Canada',u'/news/canada/index.html'),
+        (u'National',u'/news/national/index.html'),
+        (u'Politics',u'/news/politics/index.html'),
+        (u'Insight',u'/news/insight/index.html'),
+        (u'Special Reports',u'/news/specialreports/index.html'),
+        (u'Gangs',u'/news/gangs/index.html'),
+        (u'Education',u'/news/education/index.html'),
+        (u'Health',u'/news/health/index.html'),
+        (u'Environment',u'/news/environment/index.html'),
+        (u'World',u'/news/world/index.html'),
+        (u'Police Blotter',u'/news/crime-and-justice/index.html'),
+        (u'Crime',u'/news/blotter/index.html'),
+        (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'),
+        (u'Diplomatica',u'/news/diplomatica/index.html'),
+        (u'Opinion',u'/opinion/index.html'),
+        (u'Columnists',u'/columnists/index.html'),
+        (u'Editorials',u'/opinion/editorials/index.html'),
+        (u'Letters',u'/opinion/letters/index.html'),
+        (u'Business',u'/business/index.html'),
+        (u'Sports',u'/sports/index.html'),
+        (u'Arts',u'/entertainment/index.html'),
+        (u'Life',u'/life/index.html'),
+        (u'Technology',u'/technology/index.html'),
+        (u'Travel',u'/travel/index.html'),
+        (u'Health',u'/health/index.html')
+    ]

-    # un-comment the following four lines for the Vancouver Province
+    # un-comment the following six lines for the Vancouver Province
     title = u'Vancouver Province'
     url_prefix = 'http://www.theprovince.com'
     description = u'News from Vancouver, BC'
-    fp_tag = 'CAN_VP'
+    std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg'
+    logo_url = 'vplogo.jpg'
+    fp_tag = 'CAN_TP'

-    # un-comment the following four lines for the Vancouver Sun
-    ## title = u'Vancouver Sun'
-    ## url_prefix = 'http://www.vancouversun.com'
-    ## description = u'News from Vancouver, BC'
-    ## fp_tag = 'CAN_VS'
+    # un-comment the following six lines for the Vancouver Sun
+    # title = u'Vancouver Sun'
+    # url_prefix = 'http://www.vancouversun.com'
+    # description = u'News from Vancouver, BC'
+    # std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg'
+    # logo_url = 'vslogo.jpg'
+    # fp_tag = 'CAN_VS'

-    # un-comment the following four lines for the Edmonton Journal
-    ## title = u'Edmonton Journal'
-    ## url_prefix = 'http://www.edmontonjournal.com'
-    ## description = u'News from Edmonton, AB'
-    ## fp_tag = 'CAN_EJ'
-
-    # un-comment the following four lines for the Calgary Herald
+    # un-comment the following six lines for the Calgary Herald
     ## title = u'Calgary Herald'
     ## url_prefix = 'http://www.calgaryherald.com'
     ## description = u'News from Calgary, AB'
+    ## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg'
+    ## logo_url = 'chlogo.jpg'
     ## fp_tag = 'CAN_CH'

-    # un-comment the following four lines for the Regina Leader-Post
-    ## title = u'Regina Leader-Post'
-    ## url_prefix = 'http://www.leaderpost.com'
-    ## description = u'News from Regina, SK'
-    ## fp_tag = ''
+    # un-comment the following six lines for the Edmonton Journal
+    ## title = u'Edmonton Journal'
+    ## url_prefix = 'http://www.edmontonjournal.com'
+    ## description = u'News from Edmonton, AB'
+    ## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg'
+    ## logo_url = 'ejlogo.jpg'
+    ## fp_tag = 'CAN_EJ'

-    # un-comment the following four lines for the Saskatoon Star-Phoenix
-    ## title = u'Saskatoon Star-Phoenix'
-    ## url_prefix = 'http://www.thestarphoenix.com'
-    ## description = u'News from Saskatoon, SK'
-    ## fp_tag = ''
-
-    # un-comment the following four lines for the Windsor Star
-    ## title = u'Windsor Star'
-    ## url_prefix = 'http://www.windsorstar.com'
-    ## description = u'News from Windsor, ON'
-    ## fp_tag = 'CAN_'
-
-    # un-comment the following four lines for the Ottawa Citizen
+    # un-comment the following six lines for the Ottawa Citizen
     ## title = u'Ottawa Citizen'
     ## url_prefix = 'http://www.ottawacitizen.com'
     ## description = u'News from Ottawa, ON'
+    ## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg'
+    ## logo_url = 'oclogo.jpg'
     ## fp_tag = 'CAN_OC'

-    # un-comment the following four lines for the Montreal Gazette
+    # un-comment the following six lines for the Montreal Gazette
     ## title = u'Montreal Gazette'
     ## url_prefix = 'http://www.montrealgazette.com'
     ## description = u'News from Montreal, QC'
+    ## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg'
+    ## logo_url = 'mglogo.jpg'
     ## fp_tag = 'CAN_MG'

+    Kindle_Fire=False
+    masthead_url = std_logo_url
+    url_list = []
+
     language = 'en_CA'
     __author__ = 'Nick Redding'
     no_stylesheets = True
     timefmt = ' [%b %d]'
+    encoding = 'utf-8'
     extra_css = '''
                 .timestamp { font-size:xx-small; display: block; }
                 #storyheader { font-size: medium; }
                 #storyheader h1 { font-size: x-large; }
-                #storyheader h2 { font-size: large; font-style: italic; }
+                #storyheader h2 { font-size: small; font-style: italic; }
                 .byline { font-size:xx-small; }
-                #photocaption { font-size: small; font-style: italic }
-                #photocredit { font-size: xx-small; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+                #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
+                #photocredit { font-size: xx-small; font-weight: normal; }'''
+
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+        dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

     def get_cover_url(self):
         from datetime import timedelta, date
-        if self.fp_tag=='':
-            return None
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -120,6 +153,18 @@ class CanWestPaper(BasicNewsRecipe):
             cover = None
         return cover

+    def prepare_masthead_image(self, path_to_image, out_path):
+        if self.Kindle_Fire:
+            from calibre.utils.magick import Image, create_canvas
+            img = Image()
+            img.open(path_to_image)
+            width, height = img.size
+            img2 = create_canvas(width, height)
+            img2.compose(img)
+            img2.save(out_path)
+        else:
+            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -166,55 +211,110 @@ class CanWestPaper(BasicNewsRecipe):
                     a.replaceWith(a.renderContents().decode('cp1252','replace'))
         return soup

-    def preprocess_html(self, soup):
+    def preprocess_html(self,soup):
+        #delete empty id attributes--they screw up the TOC for unknown reasons
+        divtags = soup.findAll('div',attrs={'id':''})
+        if divtags:
+            for div in divtags:
+                del(div['id'])
+        pgall = soup.find('div',attrs={'id':'storyphoto'})
+        if pgall is not None: # photo gallery perhaps
+            if (soup.find('div',attrs={'id':'storycontent'}) is None):
+                allpics = Tag(soup,'div')
+                first_img = pgall.find('div','storyimage')
+                if first_img is not None:
+                    first_img.extract()
+                tlist = pgall.find('div',attrs={'id':'relatedthumbs'})
+                if tlist is not None:
+                    for atag in tlist.findAll('a'):
+                        img = Tag(soup,'img')
+                        srcpre, sep, srcpost = atag.img['src'].partition('?')
+                        img['src'] = srcpre
+                        pdesc = Tag(soup,'p')
+                        pdesc.insert(0,atag.img['alt'])
+                        pdesc['class']='photocaption'
+                        div = Tag(soup,'div')
+                        div.insert(0,pdesc)
+                        div.insert(0,img)
+                        allpics.append(div)
+                pgall.replaceWith(allpics)
+        for pg in soup.findAll('div',attrs={'id':'storyphoto'}):
+            pg.extract()
         return self.strip_anchors(soup)

     def parse_index(self):
-        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
         articles = {}
-        key = 'News'
-        ans = ['News']
-
-        # Find each instance of class="sectiontitle", class="featurecontent"
-        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
-            #self.log(" div class = %s" % divtag['class'])
-            if divtag['class'].startswith('section_title'):
-                # div contains section title
-                if not divtag.h3:
-                    continue
-                key = self.tag_to_string(divtag.h3,False)
-                ans.append(key)
-                self.log("Section name %s" % key)
-                continue
-            # div contains article data
-            h1tag = divtag.find('h1')
-            if not h1tag:
-                continue
-            atag = h1tag.find('a',href=True)
-            if not atag:
-                continue
-            url = self.url_prefix+'/news/todays-paper/'+atag['href']
-            #self.log("Section %s" % key)
-            #self.log("url %s" % url)
-            title = self.tag_to_string(atag,False)
-            #self.log("title %s" % title)
-            pubdate = ''
-            description = ''
-            ptag = divtag.find('p');
-            if ptag:
-                description = self.tag_to_string(ptag,False)
-                #self.log("description %s" % description)
-            author = ''
-            autag = divtag.find('h4')
-            if autag:
-                author = self.tag_to_string(autag,False)
-                #self.log("author %s" % author)
-            if not articles.has_key(key):
-                articles[key] = []
-            articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
+        ans = []
+
+        def handle_article(adiv,key):
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
+            if h1tag is not None:
+                atag = h1tag.a
+                if atag is not None:
+                    url = atag['href']
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
+                        return
+                    if url in self.url_list:
+                        print("Rejected dup "+url)
+                        return
+                    self.url_list.append(url)
+                    title = self.tag_to_string(atag,False)
+                    if 'VIDEO' in title.upper():
+                        return
+                    if 'GALLERY' in title.upper():
+                        return
+                    if 'PHOTOS' in title.upper():
+                        return
+                    dtag = adiv.find('div','content')
+                    description=''
+                    print("URL "+url)
+                    print("TITLE "+title)
+                    if dtag is not None:
+                        stag = dtag.span
+                        if stag is not None:
+                            if stag['class'] != 'timestamp':
+                                description = self.tag_to_string(stag,False)
+                        else:
+                            description = self.tag_to_string(dtag,False)
+                    print("DESCRIPTION: "+description)
+                    if not articles.has_key(key):
+                        articles[key] = []
+                    articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))
+
+        def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
+            try:
+                soup = self.index_to_soup(self.url_prefix+keyurl)
+            except:
+                print("Section: "+key+' NOT FOUND');
+                return
+            ans.append(key)
+            mainsoup = soup.find('div','bodywrapper')
+            footer = mainsoup.find(attrs={'id':'footerfeature'})
+            if footer is not None:
+                footer.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)
+
+        for (k,url) in self.postmedia_index_pages:
+            parse_web_index(k,url)
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans

View File

@@ -117,15 +117,17 @@ class CanWestPaper(BasicNewsRecipe):
                 .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 #photocredit { font-size: xx-small; font-weight: normal; }'''
-    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})]
-    remove_tags = [{'class':'comments'},
-        dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
+    keep_only_tags = [dict(name='div', attrs={'id':re.compile('story')})]
+    remove_tags = [{'class':'comments'},{'class':'comment-intro'},{'class':'storytab'},
+        dict(name='div', attrs={'class':'section_title'}),dict(name='div', attrs={'class':'sharebar'}),dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
         dict(name='h2', attrs={'id':'photocredit'}),
         dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
         dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
         dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
         dict(name='div', attrs={'class':'rule_grey_solid'}),
+        dict(name='div', attrs={'id':'soundoff'}),
+        dict(name='div', attrs={'id':re.compile('flyer')}),
         dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
@@ -252,18 +254,23 @@ class CanWestPaper(BasicNewsRecipe):
         def handle_article(adiv,key):
-            h1tag = adiv.h1
+            if adiv.name=='h1' or adiv.name=='h3':
+                h1tag = adiv
+            else:
+                h1tag = adiv.h1
+            if h1tag is None:
+                h1tag = adiv.h3
             if h1tag is not None:
                 atag = h1tag.a
                 if atag is not None:
                     url = atag['href']
-                    if atag['href'].startswith('http'):
+                    if url.startswith('/'):
+                        url = self.url_prefix+url
+                    if not url.startswith(self.url_prefix):
+                        print("Rejected "+url)
                         return
-                    elif atag['href'].startswith('/'):
-                        url = self.url_prefix+atag['href']
-                    else:
-                        url = self.url_prefix+'/'+atag['href']
                     if url in self.url_list:
+                        print("Rejected dup "+url)
                         return
                     self.url_list.append(url)
                     title = self.tag_to_string(atag,False)
@@ -290,22 +297,21 @@ class CanWestPaper(BasicNewsRecipe):
                     articles[key].append(dict(title=title,url=url,date='',description=description,author='',content=''))

         def parse_web_index(key, keyurl):
+            print("Section: "+key+': '+self.url_prefix+keyurl)
             try:
                 soup = self.index_to_soup(self.url_prefix+keyurl)
             except:
+                print("Section: "+key+' NOT FOUND');
                 return
             ans.append(key)
             mainsoup = soup.find('div','bodywrapper')
             footer = mainsoup.find(attrs={'id':'footerfeature'})
             if footer is not None:
                 footer.extract()
-            print("Section: "+key)
-            for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}):
-                handle_article(wdiv,key)
-                wdiv.extract()
-            for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}):
-                for adiv in wdiv.findAll('div','featurecontent'):
-                    handle_article(adiv,key)
+            for wdiv in mainsoup.findAll(attrs={'class':['genericfeature']}):
+                wdiv.extract()
+            for wdiv in mainsoup.findAll(attrs={'class':['headline','featurecontent']}):
+                handle_article(wdiv,key)

         for (k,url) in self.postmedia_index_pages:
             parse_web_index(k,url)