Updated Postmedia publications

This commit is contained in:
Kovid Goyal 2012-02-07 11:44:14 +05:30
parent b51079b26a
commit d4d7d2e13f
8 changed files with 1070 additions and 152 deletions

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,45 +7,81 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Calgary Herald # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
title = u'Calgary Herald' title = u'Calgary Herald'
url_prefix = 'http://www.calgaryherald.com' url_prefix = 'http://www.calgaryherald.com'
description = u'News from Calgary, AB' description = u'News from Calgary, AB'
fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post # un-comment the following four lines for the Regina Leader-Post
#title = u'Regina Leader-Post' ## title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK' ## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix' ## title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
__author__ = 'Nick Redding' __author__ = 'Nick Redding'
encoding = 'latin1'
no_stylesheets = True no_stylesheets = True
timefmt = ' [%b %d]' timefmt = ' [%b %d]'
extra_css = ''' extra_css = '''
@ -72,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    A webmedia.newseum.org URL is built from ``self.fp_tag`` and a day of
    the month, starting with today and stepping back one day at a time
    (seven candidates in total) until one of them can be opened.

    Returns:
        The first URL that opens successfully, or None when ``fp_tag`` is
        empty or none of the seven candidate URLs could be fetched.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper.
        return None

    today = date.today()
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        # The URL is keyed only by day-of-month, so walk back by calendar day.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try the previous day".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Replace stray Windows-1252 punctuation code points in *string*.

    The code points \\x91-\\x97 handled here are the ones the original
    comments identify as lsquo, rsquo, ldquo, rdquo, ndash and mdash; each
    is replaced by the corresponding typographic character. The literal
    entity text ``&#x2019;`` is replaced by a right single quote as well.

    Returns the converted string; input without any of these sequences is
    returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in replacements:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up *description* for use in the NCX table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are converted via BeautifulStoneSoup and the result is passed
    through ``self.fixChars``.

    Returns *description* itself, untouched, when it is empty or None.
    """
    if not description:
        return description
    # Imported locally: the module-level import only brings in
    # BeautifulSoup and Tag, which would leave this name undefined.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with itself (a no-op);
    # one side was presumably an entity such as '&amp;' or '&#38;' -- confirm
    # against the upstream recipe before changing it.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    When *first* is true, the first <img> found under <body> is registered
    as the article's TOC thumbnail (with any ``links\\linkN\\`` prefix
    stripped from its ``src``). When the article's text summary is blank,
    the page's ``og:description`` meta tag, if present, is used for both
    ``summary`` and ``text_summary``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the summary fallback is assumed to run regardless of
    *first* -- confirm against the upstream recipe.
    """
    if first:
        body = soup.find('body')
        # Guard against documents without a <body>, which would otherwise
        # raise AttributeError on None.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every image-free <a> tag in *soup* with its rendered contents.

    Anchors that contain an image (``a.img`` is not None) are left alone.
    Returns the same *soup* object that was passed in.
    """
    # A single document-wide search replaces the earlier "for every tag,
    # find its anchors" scan, which revisited each anchor once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 --
            # confirm this matches the encoding renderContents() emits.
            a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
@ -98,8 +209,6 @@ class CanWestPaper(BasicNewsRecipe):
atag = h1tag.find('a',href=True) atag = h1tag.find('a',href=True)
if not atag: if not atag:
continue continue
url = atag['href']
if not url.startswith('http:'):
url = self.url_prefix+'/news/todays-paper/'+atag['href'] url = self.url_prefix+'/news/todays-paper/'+atag['href']
#self.log("Section %s" % key) #self.log("Section %s" % key)
#self.log("url %s" % url) #self.log("url %s" % url)

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,45 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Edmonton Journal # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
title = u'Edmonton Journal' title = u'Edmonton Journal'
url_prefix = 'http://www.edmontonjournal.com' url_prefix = 'http://www.edmontonjournal.com'
description = u'News from Edmonton, AB' description = u'News from Edmonton, AB'
fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald # un-comment the following four lines for the Calgary Herald
#title = u'Calgary Herald' ## title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post # un-comment the following four lines for the Regina Leader-Post
#title = u'Regina Leader-Post' ## title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK' ## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix' ## title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -76,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    A webmedia.newseum.org URL is built from ``self.fp_tag`` and a day of
    the month, starting with today and stepping back one day at a time
    (seven candidates in total) until one of them can be opened.

    Returns:
        The first URL that opens successfully, or None when ``fp_tag`` is
        empty or none of the seven candidate URLs could be fetched.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper.
        return None

    today = date.today()
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        # The URL is keyed only by day-of-month, so walk back by calendar day.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try the previous day".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Replace stray Windows-1252 punctuation code points in *string*.

    The code points \\x91-\\x97 handled here are the ones the original
    comments identify as lsquo, rsquo, ldquo, rdquo, ndash and mdash; each
    is replaced by the corresponding typographic character. The literal
    entity text ``&#x2019;`` is replaced by a right single quote as well.

    Returns the converted string; input without any of these sequences is
    returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in replacements:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up *description* for use in the NCX table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are converted via BeautifulStoneSoup and the result is passed
    through ``self.fixChars``.

    Returns *description* itself, untouched, when it is empty or None.
    """
    if not description:
        return description
    # Imported locally: the module-level import only brings in
    # BeautifulSoup and Tag, which would leave this name undefined.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with itself (a no-op);
    # one side was presumably an entity such as '&amp;' or '&#38;' -- confirm
    # against the upstream recipe before changing it.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    When *first* is true, the first <img> found under <body> is registered
    as the article's TOC thumbnail (with any ``links\\linkN\\`` prefix
    stripped from its ``src``). When the article's text summary is blank,
    the page's ``og:description`` meta tag, if present, is used for both
    ``summary`` and ``text_summary``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the summary fallback is assumed to run regardless of
    *first* -- confirm against the upstream recipe.
    """
    if first:
        body = soup.find('body')
        # Guard against documents without a <body>, which would otherwise
        # raise AttributeError on None.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every image-free <a> tag in *soup* with its rendered contents.

    Anchors that contain an image (``a.img`` is not None) are left alone.
    Returns the same *soup* object that was passed in.
    """
    # A single document-wide search replaces the earlier "for every tag,
    # find its anchors" scan, which revisited each anchor once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 --
            # confirm this matches the encoding renderContents() emits.
            a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,15 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
## title = u'Ottawa Citizen'
## url_prefix = 'http://www.ottawacitizen.com'
## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following four lines for the Montreal Gazette
title = u'Montreal Gazette' title = u'Montreal Gazette'
url_prefix = 'http://www.montrealgazette.com' url_prefix = 'http://www.montrealgazette.com'
description = u'News from Montreal, QC' description = u'News from Montreal, QC'
fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -46,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    A webmedia.newseum.org URL is built from ``self.fp_tag`` and a day of
    the month, starting with today and stepping back one day at a time
    (seven candidates in total) until one of them can be opened.

    Returns:
        The first URL that opens successfully, or None when ``fp_tag`` is
        empty or none of the seven candidate URLs could be fetched.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper.
        return None

    today = date.today()
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        # The URL is keyed only by day-of-month, so walk back by calendar day.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try the previous day".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Replace stray Windows-1252 punctuation code points in *string*.

    The code points \\x91-\\x97 handled here are the ones the original
    comments identify as lsquo, rsquo, ldquo, rdquo, ndash and mdash; each
    is replaced by the corresponding typographic character. The literal
    entity text ``&#x2019;`` is replaced by a right single quote as well.

    Returns the converted string; input without any of these sequences is
    returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in replacements:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up *description* for use in the NCX table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are converted via BeautifulStoneSoup and the result is passed
    through ``self.fixChars``.

    Returns *description* itself, untouched, when it is empty or None.
    """
    if not description:
        return description
    # Imported locally: the module-level import only brings in
    # BeautifulSoup and Tag, which would leave this name undefined.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with itself (a no-op);
    # one side was presumably an entity such as '&amp;' or '&#38;' -- confirm
    # against the upstream recipe before changing it.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    When *first* is true, the first <img> found under <body> is registered
    as the article's TOC thumbnail (with any ``links\\linkN\\`` prefix
    stripped from its ``src``). When the article's text summary is blank,
    the page's ``og:description`` meta tag, if present, is used for both
    ``summary`` and ``text_summary``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the summary fallback is assumed to run regardless of
    *first* -- confirm against the upstream recipe.
    """
    if first:
        body = soup.find('body')
        # Guard against documents without a <body>, which would otherwise
        # raise AttributeError on None.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every image-free <a> tag in *soup* with its rendered contents.

    Anchors that contain an image (``a.img`` is not None) are left alone.
    Returns the same *soup* object that was passed in.
    """
    # A single document-wide search replaces the earlier "for every tag,
    # find its anchors" scan, which revisited each anchor once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 --
            # confirm this matches the encoding renderContents() emits.
            a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,20 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
## title = u'Saskatoon Star-Phoenix'
## url_prefix = 'http://www.thestarphoenix.com'
## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following four lines for the Windsor Star
## title = u'Windsor Star'
## url_prefix = 'http://www.windsorstar.com'
## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following four lines for the Ottawa Citizen
title = u'Ottawa Citizen' title = u'Ottawa Citizen'
url_prefix = 'http://www.ottawacitizen.com' url_prefix = 'http://www.ottawacitizen.com'
description = u'News from Ottawa, ON' description = u'News from Ottawa, ON'
fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -51,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    A webmedia.newseum.org URL is built from ``self.fp_tag`` and a day of
    the month, starting with today and stepping back one day at a time
    (seven candidates in total) until one of them can be opened.

    Returns:
        The first URL that opens successfully, or None when ``fp_tag`` is
        empty or none of the seven candidate URLs could be fetched.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper.
        return None

    today = date.today()
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        # The URL is keyed only by day-of-month, so walk back by calendar day.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try the previous day".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Replace stray Windows-1252 punctuation code points in *string*.

    The code points \\x91-\\x97 handled here are the ones the original
    comments identify as lsquo, rsquo, ldquo, rdquo, ndash and mdash; each
    is replaced by the corresponding typographic character. The literal
    entity text ``&#x2019;`` is replaced by a right single quote as well.

    Returns the converted string; input without any of these sequences is
    returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in replacements:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up *description* for use in the NCX table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are converted via BeautifulStoneSoup and the result is passed
    through ``self.fixChars``.

    Returns *description* itself, untouched, when it is empty or None.
    """
    if not description:
        return description
    # Imported locally: the module-level import only brings in
    # BeautifulSoup and Tag, which would leave this name undefined.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with itself (a no-op);
    # one side was presumably an entity such as '&amp;' or '&#38;' -- confirm
    # against the upstream recipe before changing it.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    When *first* is true, the first <img> found under <body> is registered
    as the article's TOC thumbnail (with any ``links\\linkN\\`` prefix
    stripped from its ``src``). When the article's text summary is blank,
    the page's ``og:description`` meta tag, if present, is used for both
    ``summary`` and ``text_summary``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the summary fallback is assumed to run regardless of
    *first* -- confirm against the upstream recipe.
    """
    if first:
        body = soup.find('body')
        # Guard against documents without a <body>, which would otherwise
        # raise AttributeError on None.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every image-free <a> tag in *soup* with its rendered contents.

    Anchors that contain an image (``a.img`` is not None) are left alone.
    Returns the same *soup* object that was passed in.
    """
    # A single document-wide search replaces the earlier "for every tag,
    # find its anchors" scan, which revisited each anchor once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 --
            # confirm this matches the encoding renderContents() emits.
            a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,35 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Regina Leader-Post # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
title = u'Regina Leader-Post' title = u'Regina Leader-Post'
url_prefix = 'http://www.leaderpost.com' url_prefix = 'http://www.leaderpost.com'
description = u'News from Regina, SK' description = u'News from Regina, SK'
fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix' ## title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -66,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent front-page image for this paper.

    A webmedia.newseum.org URL is built from ``self.fp_tag`` and a day of
    the month, starting with today and stepping back one day at a time
    (seven candidates in total) until one of them can be opened.

    Returns:
        The first URL that opens successfully, or None when ``fp_tag`` is
        empty or none of the seven candidate URLs could be fetched.
    """
    from datetime import date, timedelta

    if self.fp_tag == '':
        # No front-page tag configured for this paper.
        return None

    today = date.today()
    br = BasicNewsRecipe.get_browser()
    for daysback in range(7):
        # The URL is keyed only by day-of-month, so walk back by calendar day.
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        try:
            br.open(cover)
        except Exception:
            # Best effort: a failed fetch just means "try the previous day".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            continue
        return cover

    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Replace stray Windows-1252 punctuation code points in *string*.

    The code points \\x91-\\x97 handled here are the ones the original
    comments identify as lsquo, rsquo, ldquo, rdquo, ndash and mdash; each
    is replaced by the corresponding typographic character. The literal
    entity text ``&#x2019;`` is replaced by a right single quote as well.

    Returns the converted string; input without any of these sequences is
    returned unchanged.
    """
    replacements = (
        ("\x91", "‘"),       # lsquo
        ("\x92", "’"),       # rsquo
        ("\x93", "“"),       # ldquo
        ("\x94", "”"),       # rdquo
        ("\x96", "–"),       # ndash
        ("\x97", "—"),       # mdash
        ("&#x2019;", "’"),   # numeric entity for rsquo
    )
    fixed = string
    for pattern, replacement in replacements:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up *description* for use in the NCX table of contents.

    Kindle TOC descriptions won't render certain characters, so HTML
    entities are converted via BeautifulStoneSoup and the result is passed
    through ``self.fixChars``.

    Returns *description* itself, untouched, when it is empty or None.
    """
    if not description:
        return description
    # Imported locally: the module-level import only brings in
    # BeautifulSoup and Tag, which would leave this name undefined.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with itself (a no-op);
    # one side was presumably an entity such as '&amp;' or '&#38;' -- confirm
    # against the upstream recipe before changing it.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to *article*.

    When *first* is true, the first <img> found under <body> is registered
    as the article's TOC thumbnail (with any ``links\\linkN\\`` prefix
    stripped from its ``src``). When the article's text summary is blank,
    the page's ``og:description`` meta tag, if present, is used for both
    ``summary`` and ``text_summary``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the summary fallback is assumed to run regardless of
    *first* -- confirm against the upstream recipe.
    """
    if first:
        body = soup.find('body')
        # Guard against documents without a <body>, which would otherwise
        # raise AttributeError on None.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace every image-free <a> tag in *soup* with its rendered contents.

    Anchors that contain an image (``a.img`` is not None) are left alone.
    Returns the same *soup* object that was passed in.
    """
    # A single document-wide search replaces the earlier "for every tag,
    # find its anchors" scan, which revisited each anchor once per ancestor.
    for a in soup.findAll('a'):
        if a.img is None:
            # NOTE(review): the rendered bytes are decoded as cp1252 --
            # confirm this matches the encoding renderContents() emits.
            a.replaceWith(a.renderContents().decode('cp1252','replace'))
    return soup
def preprocess_html(self, soup):
    """Return *soup* after passing it through ``self.strip_anchors``."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,30 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
## title = u'Vancouver Sun'
## url_prefix = 'http://www.vancouversun.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following four lines for the Edmonton Journal
## title = u'Edmonton Journal'
## url_prefix = 'http://www.edmontonjournal.com'
## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following four lines for the Calgary Herald
## title = u'Calgary Herald'
## url_prefix = 'http://www.calgaryherald.com'
## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following four lines for the Regina Leader-Post
## title = u'Regina Leader-Post'
## url_prefix = 'http://www.leaderpost.com'
## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following four lines for the Saskatoon Star-Phoenix
title = u'Saskatoon Star-Phoenix' title = u'Saskatoon Star-Phoenix'
url_prefix = 'http://www.thestarphoenix.com' url_prefix = 'http://www.thestarphoenix.com'
description = u'News from Saskatoon, SK' description = u'News from Saskatoon, SK'
fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -61,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent available front-page image.

    The URL is built from the day of the month and ``self.fp_tag``.
    Today's image is tried first, then each of the six preceding days.

    Returns None when ``fp_tag`` is empty, or when none of the seven
    candidate URLs can be opened (a message is logged in that case).
    """
    from datetime import date, timedelta

    if not self.fp_tag:
        return None
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        # Each attempt uses its own browser instance.
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Treat any failure to open the URL as a miss and try the
            # previous day; KeyboardInterrupt/SystemExit still propagate.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Map stray cp1252 punctuation code points to typographic characters.

    Each of \\x91-\\x94, \\x96 and \\x97, plus the literal entity text
    ``&#x2019;``, is replaced by the quote or dash it stands for.
    Returns the corrected string.
    """
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","‘",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","’",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","–",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    # Replace an unconverted rsquo entity
    fixed = re.sub("&#x2019;","’",fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up a description string for use in the table of contents.

    A falsy ``description`` (None or empty) is returned unchanged.
    Otherwise it is parsed with BeautifulStoneSoup using HTML entity
    conversion, turned back into text, and passed through
    ``self.fixChars``.
    """
    # Kindle TOC descriptions won't render certain characters
    if not description:
        return description
    # BeautifulStoneSoup is not among the names brought in by the visible
    # import lines, so import it here to avoid a NameError.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with '&', i.e. it is a
    # no-op; the replacement was presumably meant to be an escaped
    # entity - confirm before changing.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to ``article``.

    When ``first`` is true, the first <img> inside <body> (if any) is
    registered through ``self.add_toc_thumbnail`` after removing a
    ``links\\linkN\\`` prefix from its ``src``.

    When the article's ``text_summary`` is blank, both ``summary`` and
    ``text_summary`` are set from the ``og:description`` meta tag,
    if the document has one.
    """
    if first:
        body = soup.find('body')
        # A document without a <body> has no thumbnail candidate.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace image-less <a> tags with their rendered contents.

    Every tag in ``soup`` is visited and the anchors found beneath it are
    examined; an anchor that has no <img> is swapped for its rendered
    contents decoded as cp1252.  Returns the same ``soup`` object.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # Anchors wrapping an image are left untouched.
                continue
            text = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(text)
    return soup
def preprocess_html(self, soup):
    """Pass ``soup`` through ``self.strip_anchors`` and return the result."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,50 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Vancouver Sun # un-comment the following four lines for the Victoria Times Colonist
## title = u'Victoria Times Colonist'
## url_prefix = 'http://www.timescolonist.com'
## description = u'News from Victoria, BC'
## fp_tag = 'CAN_TC'
# un-comment the following four lines for the Vancouver Province
## title = u'Vancouver Province'
## url_prefix = 'http://www.theprovince.com'
## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following four lines for the Vancouver Sun
title = u'Vancouver Sun' title = u'Vancouver Sun'
url_prefix = 'http://www.vancouversun.com' url_prefix = 'http://www.vancouversun.com'
description = u'News from Vancouver, BC' description = u'News from Vancouver, BC'
fp_tag = 'CAN_VS'
# un-comment the following three lines for the Edmonton Journal # un-comment the following four lines for the Edmonton Journal
#title = u'Edmonton Journal' ## title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com' ## url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB' ## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald # un-comment the following four lines for the Calgary Herald
#title = u'Calgary Herald' ## title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post # un-comment the following four lines for the Regina Leader-Post
#title = u'Regina Leader-Post' ## title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK' ## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix' ## title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -81,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent available front-page image.

    The URL is built from the day of the month and ``self.fp_tag``.
    Today's image is tried first, then each of the six preceding days.

    Returns None when ``fp_tag`` is empty, or when none of the seven
    candidate URLs can be opened (a message is logged in that case).
    """
    from datetime import date, timedelta

    if not self.fp_tag:
        return None
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        # Each attempt uses its own browser instance.
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Treat any failure to open the URL as a miss and try the
            # previous day; KeyboardInterrupt/SystemExit still propagate.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Map stray cp1252 punctuation code points to typographic characters.

    Each of \\x91-\\x94, \\x96 and \\x97, plus the literal entity text
    ``&#x2019;``, is replaced by the quote or dash it stands for.
    Returns the corrected string.
    """
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","‘",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","’",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","–",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    # Replace an unconverted rsquo entity
    fixed = re.sub("&#x2019;","’",fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up a description string for use in the table of contents.

    A falsy ``description`` (None or empty) is returned unchanged.
    Otherwise it is parsed with BeautifulStoneSoup using HTML entity
    conversion, turned back into text, and passed through
    ``self.fixChars``.
    """
    # Kindle TOC descriptions won't render certain characters
    if not description:
        return description
    # BeautifulStoneSoup is not among the names brought in by the visible
    # import lines, so import it here to avoid a NameError.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with '&', i.e. it is a
    # no-op; the replacement was presumably meant to be an escaped
    # entity - confirm before changing.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to ``article``.

    When ``first`` is true, the first <img> inside <body> (if any) is
    registered through ``self.add_toc_thumbnail`` after removing a
    ``links\\linkN\\`` prefix from its ``src``.

    When the article's ``text_summary`` is blank, both ``summary`` and
    ``text_summary`` are set from the ``og:description`` meta tag,
    if the document has one.
    """
    if first:
        body = soup.find('body')
        # A document without a <body> has no thumbnail candidate.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace image-less <a> tags with their rendered contents.

    Every tag in ``soup`` is visited and the anchors found beneath it are
    examined; an anchor that has no <img> is swapped for its rendered
    contents decoded as cp1252.  Returns the same ``soup`` object.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # Anchors wrapping an image are left untouched.
                continue
            text = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(text)
    return soup
def preprocess_html(self, soup):
    """Pass ``soup`` through ``self.strip_anchors`` and return the result."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
@ -6,60 +7,77 @@ __license__ = 'GPL v3'
www.canada.com www.canada.com
''' '''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class CanWestPaper(BasicNewsRecipe): class CanWestPaper(BasicNewsRecipe):
# un-comment the following three lines for the Victoria Times Colonist # un-comment the following four lines for the Victoria Times Colonist
title = u'Victoria Times Colonist' title = u'Victoria Times Colonist'
url_prefix = 'http://www.timescolonist.com' url_prefix = 'http://www.timescolonist.com'
description = u'News from Victoria, BC' description = u'News from Victoria, BC'
fp_tag = 'CAN_TC'
# un-comment the following three lines for the Vancouver Province # un-comment the following four lines for the Vancouver Province
#title = u'Vancouver Province' ## title = u'Vancouver Province'
#url_prefix = 'http://www.theprovince.com' ## url_prefix = 'http://www.theprovince.com'
#description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VP'
# un-comment the following three lines for the Vancouver Sun # un-comment the following four lines for the Vancouver Sun
#title = u'Vancouver Sun' ## title = u'Vancouver Sun'
#url_prefix = 'http://www.vancouversun.com' ## url_prefix = 'http://www.vancouversun.com'
#description = u'News from Vancouver, BC' ## description = u'News from Vancouver, BC'
## fp_tag = 'CAN_VS'
# un-comment the following three lines for the Edmonton Journal # un-comment the following four lines for the Edmonton Journal
#title = u'Edmonton Journal' ## title = u'Edmonton Journal'
#url_prefix = 'http://www.edmontonjournal.com' ## url_prefix = 'http://www.edmontonjournal.com'
#description = u'News from Edmonton, AB' ## description = u'News from Edmonton, AB'
## fp_tag = 'CAN_EJ'
# un-comment the following three lines for the Calgary Herald # un-comment the following four lines for the Calgary Herald
#title = u'Calgary Herald' ## title = u'Calgary Herald'
#url_prefix = 'http://www.calgaryherald.com' ## url_prefix = 'http://www.calgaryherald.com'
#description = u'News from Calgary, AB' ## description = u'News from Calgary, AB'
## fp_tag = 'CAN_CH'
# un-comment the following three lines for the Regina Leader-Post # un-comment the following four lines for the Regina Leader-Post
#title = u'Regina Leader-Post' ## title = u'Regina Leader-Post'
#url_prefix = 'http://www.leaderpost.com' ## url_prefix = 'http://www.leaderpost.com'
#description = u'News from Regina, SK' ## description = u'News from Regina, SK'
## fp_tag = ''
# un-comment the following three lines for the Saskatoon Star-Phoenix # un-comment the following four lines for the Saskatoon Star-Phoenix
#title = u'Saskatoon Star-Phoenix' ## title = u'Saskatoon Star-Phoenix'
#url_prefix = 'http://www.thestarphoenix.com' ## url_prefix = 'http://www.thestarphoenix.com'
#description = u'News from Saskatoon, SK' ## description = u'News from Saskatoon, SK'
## fp_tag = ''
# un-comment the following three lines for the Windsor Star # un-comment the following four lines for the Windsor Star
#title = u'Windsor Star' ## title = u'Windsor Star'
#url_prefix = 'http://www.windsorstar.com' ## url_prefix = 'http://www.windsorstar.com'
#description = u'News from Windsor, ON' ## description = u'News from Windsor, ON'
## fp_tag = 'CAN_'
# un-comment the following three lines for the Ottawa Citizen # un-comment the following four lines for the Ottawa Citizen
#title = u'Ottawa Citizen' ## title = u'Ottawa Citizen'
#url_prefix = 'http://www.ottawacitizen.com' ## url_prefix = 'http://www.ottawacitizen.com'
#description = u'News from Ottawa, ON' ## description = u'News from Ottawa, ON'
## fp_tag = 'CAN_OC'
# un-comment the following three lines for the Montreal Gazette # un-comment the following four lines for the Montreal Gazette
#title = u'Montreal Gazette' ## title = u'Montreal Gazette'
#url_prefix = 'http://www.montrealgazette.com' ## url_prefix = 'http://www.montrealgazette.com'
#description = u'News from Montreal, QC' ## description = u'News from Montreal, QC'
## fp_tag = 'CAN_MG'
language = 'en_CA' language = 'en_CA'
@ -91,6 +109,80 @@ class CanWestPaper(BasicNewsRecipe):
del(div['id']) del(div['id'])
return soup return soup
def get_cover_url(self):
    """Return the URL of the most recent available front-page image.

    The URL is built from the day of the month and ``self.fp_tag``.
    Today's image is tried first, then each of the six preceding days.

    Returns None when ``fp_tag`` is empty, or when none of the seven
    candidate URLs can be opened (a message is logged in that case).
    """
    from datetime import date, timedelta

    if not self.fp_tag:
        return None
    today = date.today()
    for daysback in range(7):
        day = (today - timedelta(days=daysback)).day
        cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                 + str(day) + '/lg/' + self.fp_tag + '.jpg')
        # Each attempt uses its own browser instance.
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Treat any failure to open the URL as a miss and try the
            # previous day; KeyboardInterrupt/SystemExit still propagate.
            continue
        return cover
    self.log("\nCover unavailable")
    return None
def fixChars(self,string):
    """Map stray cp1252 punctuation code points to typographic characters.

    Each of \\x91-\\x94, \\x96 and \\x97, plus the literal entity text
    ``&#x2019;``, is replaced by the quote or dash it stands for.
    Returns the corrected string.
    """
    # Replace lsquo (\x91)
    fixed = re.sub("\x91","‘",string)
    # Replace rsquo (\x92)
    fixed = re.sub("\x92","’",fixed)
    # Replace ldquo (\x93)
    fixed = re.sub("\x93","“",fixed)
    # Replace rdquo (\x94)
    fixed = re.sub("\x94","”",fixed)
    # Replace ndash (\x96)
    fixed = re.sub("\x96","–",fixed)
    # Replace mdash (\x97)
    fixed = re.sub("\x97","—",fixed)
    # Replace an unconverted rsquo entity
    fixed = re.sub("&#x2019;","’",fixed)
    return fixed
def massageNCXText(self, description):
    """Clean up a description string for use in the table of contents.

    A falsy ``description`` (None or empty) is returned unchanged.
    Otherwise it is parsed with BeautifulStoneSoup using HTML entity
    conversion, turned back into text, and passed through
    ``self.fixChars``.
    """
    # Kindle TOC descriptions won't render certain characters
    if not description:
        return description
    # BeautifulStoneSoup is not among the names brought in by the visible
    # import lines, so import it here to avoid a NameError.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    # NOTE(review): as written this replaces '&' with '&', i.e. it is a
    # no-op; the replacement was presumably meant to be an escaped
    # entity - confirm before changing.
    massaged = re.sub("&","&", massaged)
    return self.fixChars(massaged)
def populate_article_metadata(self, article, soup, first):
    """Attach a TOC thumbnail and a fallback summary to ``article``.

    When ``first`` is true, the first <img> inside <body> (if any) is
    registered through ``self.add_toc_thumbnail`` after removing a
    ``links\\linkN\\`` prefix from its ``src``.

    When the article's ``text_summary`` is blank, both ``summary`` and
    ``text_summary`` are set from the ``og:description`` meta tag,
    if the document has one.
    """
    if first:
        body = soup.find('body')
        # A document without a <body> has no thumbnail candidate.
        picdiv = body.find('img') if body is not None else None
        if picdiv is not None:
            self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
    if not article.text_summary.strip():
        desc = soup.find('meta',attrs={'property':'og:description'})
        if desc is not None:
            article.summary = article.text_summary = desc['content']
def strip_anchors(self,soup):
    """Replace image-less <a> tags with their rendered contents.

    Every tag in ``soup`` is visited and the anchors found beneath it are
    examined; an anchor that has no <img> is swapped for its rendered
    contents decoded as cp1252.  Returns the same ``soup`` object.
    """
    for element in soup.findAll(True):
        for anchor in element.findAll('a'):
            if anchor.img is not None:
                # Anchors wrapping an image are left untouched.
                continue
            text = anchor.renderContents().decode('cp1252','replace')
            anchor.replaceWith(text)
    return soup
def preprocess_html(self, soup):
    """Pass ``soup`` through ``self.strip_anchors`` and return the result."""
    cleaned = self.strip_anchors(soup)
    return cleaned
def parse_index(self): def parse_index(self):
soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')