From 218a92de0ec8803f8b007207ede3d909e9fd42fa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 15 Aug 2012 09:24:48 +0530 Subject: [PATCH] Updated various Canadian newspapers --- recipes/calgary_herald.recipe | 355 ++++++++++++++++++++++++++++--- recipes/edmonton_journal.recipe | 278 ++++++++++++++++-------- recipes/montreal_gazette.recipe | 310 +++++++++++++++++++++++++-- recipes/ottawa_citizen.recipe | 278 ++++++++++++++++-------- recipes/vancouver_provice.recipe | 352 ++++++++++++++++++++++-------- recipes/vancouver_sun.recipe | 278 ++++++++++++++++-------- 6 files changed, 1446 insertions(+), 405 deletions(-) diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe index 12134bc9a4..74ec104463 100644 --- a/recipes/calgary_herald.recipe +++ b/recipes/calgary_herald.recipe @@ -1,35 +1,320 @@ -from calibre.web.feeds.news import BasicNewsRecipe - -class CalgaryHerald(BasicNewsRecipe): - title = u'Calgary Herald' - oldest_article = 3 - max_articles_per_feed = 100 - - feeds = [ - (u'News', u'http://rss.canada.com/get/?F233'), - (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'), - (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'), - (u'Politics', u'http://rss.canada.com/get/?F7551'), - (u'National', u'http://rss.canada.com/get/?F7552'), - (u'World', u'http://rss.canada.com/get/?F7553'), - ] - __author__ = 'rty' - pubisher = 'Calgary Herald' - description = 'Calgary, Alberta, Canada' - category = 'News, Calgary, Alberta, Canada' - - - remove_javascript = True - use_embedded_content = False - no_stylesheets = True - language = 'en_CA' - encoding = 'utf-8' - conversion_options = {'linearize_tables':True} - ##masthead_url = 'http://www.calgaryherald.com/index.html' - keep_only_tags = [ - dict(name='div', 
attrs={'id':'storyheader'}), - dict(name='div', attrs={'id':'storycontent'}) - - ] - remove_tags_after = {'class':"story_tool_hr"} - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +__license__ = 'GPL v3' + +''' +www.canada.com +''' +import string, re +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe + +import string, re +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag + + +class CanWestPaper(BasicNewsRecipe): + + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + 
(u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + + # un-comment the following six lines for the Vancouver Province +## title = u'Vancouver Province' +## url_prefix = 'http://www.theprovince.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald + title = u'Calgary Herald' + url_prefix = 'http://www.calgaryherald.com' + description = u'News from Calgary, AB' + std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' + logo_url = 'chlogo.jpg' + fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette +## title = u'Montreal Gazette' +## url_prefix = 'http://www.montrealgazette.com' +## description = u'News from Montreal, QC' +## std_logo_url = 
'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' +## fp_tag = 'CAN_MG' + + Kindle_Fire=False + masthead_url = std_logo_url + + url_list = [] + language = 'en_CA' + __author__ = 'Nick Redding' + no_stylesheets = True + timefmt = ' [%b %d]' + encoding = 'utf-8' + extra_css = ''' + .timestamp { font-size:xx-small; display: block; } + #storyheader { font-size: medium; } + #storyheader h1 { font-size: x-large; } + #storyheader h2 { font-size: small; font-style: italic; } + .byline { font-size:xx-small; } + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + + def get_cover_url(self): + from datetime import timedelta, datetime, date + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + daysback=1 + try: + br.open(cover) + except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - 
timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: + self.log("\nCover unavailable") + cover = None + return cover + + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + fixed = re.sub("’","’",fixed) + return fixed + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def populate_article_metadata(self, article, soup, first): + if first: + picdiv = soup.find('body').find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) + xtitle = article.text_summary.strip() + if len(xtitle) == 0: + desc = soup.find('meta',attrs={'property':'og:description'}) + if desc is not None: + article.summary = article.text_summary = desc['content'] + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in 
aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup + + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + return self.strip_anchors(soup) + + + + def parse_index(self): + + articles = {} + ans = [] + + + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description 
= self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans + diff --git a/recipes/edmonton_journal.recipe b/recipes/edmonton_journal.recipe index e0c02b7d83..85cc521a81 100644 --- a/recipes/edmonton_journal.recipe +++ b/recipes/edmonton_journal.recipe @@ -1,105 +1,141 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - -import re +import string, re +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup + +import string, re +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & 
Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the 
following six lines for the Vancouver Sun ## title = u'Vancouver Sun' ## url_prefix = 'http://www.vancouversun.com' ## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' ## fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal - title = u'Edmonton Journal' - url_prefix = 'http://www.edmontonjournal.com' - description = u'News from Edmonton, AB' - fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal + title = u'Edmonton Journal' + url_prefix = 'http://www.edmontonjournal.com' + description = u'News from Edmonton, AB' + std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' + logo_url = 'ejlogo.jpg' + fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen ## title = u'Ottawa Citizen' ## url_prefix = 
'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' ## fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), 
dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for 
atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if 
atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/montreal_gazette.recipe b/recipes/montreal_gazette.recipe index 4ebbdbc0a1..0e87322309 100644 --- a/recipes/montreal_gazette.recipe +++ b/recipes/montreal_gazette.recipe @@ -1,48 +1,320 @@ #!/usr/bin/env python - +# -*- coding: utf-8 -*- __license__ = 'GPL v3' ''' www.canada.com ''' +import string, re +from 
calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe +import string, re +from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following three lines for the Montreal Gazette + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + + # un-comment the following six lines for the Vancouver Province +## 
title = u'Vancouver Province' +## url_prefix = 'http://www.theprovince.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' + + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' + + # un-comment the following six lines for the Calgary Herald +## title = u'Calgary Herald' +## url_prefix = 'http://www.calgaryherald.com' +## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' +## fp_tag = 'CAN_CH' + + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' + + # un-comment the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' + + # un-comment the following six lines for the Montreal Gazette title = u'Montreal Gazette' + url_prefix = 'http://www.montrealgazette.com' description = u'News from Montreal, QC' + std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' + logo_url = 'mglogo.jpg' + fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - auto_cleanup = True - 
auto_cleanup_keep = '//*[@id="imageBox"]' - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + + remove_tags = [{'class':'comments'}, + dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), + dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), + dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), + dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), + dict(name='div', attrs={'class':'rule_grey_solid'}), + dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] - feeds = [ -('News', - 'http://rss.canada.com/get/?F297'), - ('Sports', - 'http://rss.canada.com/get/?F299'), - ('Entertainment', - 'http://rss.canada.com/get/?F7366'), - ('Business', - 'http://rss.canada.com/get/?F6939'), -] + def get_cover_url(self): + from datetime import timedelta, datetime, date + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + daysback=1 + 
try: + br.open(cover) + except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: + self.log("\nCover unavailable") + cover = None + return cover + + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + fixed = re.sub("’","’",fixed) + return fixed + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def populate_article_metadata(self, article, soup, first): + if first: + picdiv = soup.find('body').find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) + xtitle = article.text_summary.strip() + if len(xtitle) == 0: + desc = soup.find('meta',attrs={'property':'og:description'}) + if desc is not None: + article.summary = article.text_summary = 
desc['content'] + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup - + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + return self.strip_anchors(soup) + + + + def parse_index(self): + + articles = {} + ans = [] + + + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = 
dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) + ans = [(key, articles[key]) for key in ans if articles.has_key(key)] + return ans + diff --git a/recipes/ottawa_citizen.recipe b/recipes/ottawa_citizen.recipe index 32d5567d6d..5a53bbbab8 100644 --- a/recipes/ottawa_citizen.recipe +++ b/recipes/ottawa_citizen.recipe @@ -1,105 +1,141 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - -import re +import string, re +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup + +import string, re +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = 
u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 
'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun ## title = u'Vancouver Sun' ## url_prefix = 'http://www.vancouversun.com' ## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' ## fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa 
Citizen + # un-comment the following six lines for the Ottawa Citizen title = u'Ottawa Citizen' url_prefix = 'http://www.ottawacitizen.com' - description = u'News from Ottawa, ON' + description = u'News from Ottawa, ON' + std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' + logo_url = 'oclogo.jpg' fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), 
dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + 
tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + 
atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/vancouver_provice.recipe b/recipes/vancouver_provice.recipe index 9375670c59..1e41591a79 100644 --- a/recipes/vancouver_provice.recipe +++ b/recipes/vancouver_provice.recipe @@ -1,136 +1,320 @@ #!/usr/bin/env python - +# -*- coding: utf-8 -*- 
__license__ = 'GPL v3' ''' www.canada.com ''' +import string, re +from calibre import strftime +from calibre.web.feeds.news import BasicNewsRecipe +import string, re +from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following three lines for the Vancouver Province + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + (u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] + + 
+ # un-comment the following six lines for the Vancouver Province title = u'Vancouver Province' url_prefix = 'http://www.theprovince.com' description = u'News from Vancouver, BC' + std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' + logo_url = 'vplogo.jpg' + fp_tag = 'CAN_TP' - # un-comment the following three lines for the Vancouver Sun - #title = u'Vancouver Sun' - #url_prefix = 'http://www.vancouversun.com' - #description = u'News from Vancouver, BC' + # un-comment the following six lines for the Vancouver Sun +## title = u'Vancouver Sun' +## url_prefix = 'http://www.vancouversun.com' +## description = u'News from Vancouver, BC' +## std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' +## logo_url = 'vslogo.jpg' +## fp_tag = 'CAN_VS' - # un-comment the following three lines for the Edmonton Journal - #title = u'Edmonton Journal' - #url_prefix = 'http://www.edmontonjournal.com' - #description = u'News from Edmonton, AB' + # un-comment the following six lines for the Calgary Herald +## title = u'Calgary Herald' +## url_prefix = 'http://www.calgaryherald.com' +## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' +## fp_tag = 'CAN_CH' - # un-comment the following three lines for the Calgary Herald - #title = u'Calgary Herald' - #url_prefix = 'http://www.calgaryherald.com' - #description = u'News from Calgary, AB' + # un-comment the following six lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following three lines for the Regina Leader-Post - #title = u'Regina Leader-Post' - #url_prefix = 'http://www.leaderpost.com' - #description = u'News from Regina, SK' + # un-comment 
the following six lines for the Ottawa Citizen +## title = u'Ottawa Citizen' +## url_prefix = 'http://www.ottawacitizen.com' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' +## fp_tag = 'CAN_OC' - # un-comment the following three lines for the Saskatoon Star-Phoenix - #title = u'Saskatoon Star-Phoenix' - #url_prefix = 'http://www.thestarphoenix.com' - #description = u'News from Saskatoon, SK' - - # un-comment the following three lines for the Windsor Star - #title = u'Windsor Star' - #url_prefix = 'http://www.windsorstar.com' - #description = u'News from Windsor, ON' - - # un-comment the following three lines for the Ottawa Citizen - #title = u'Ottawa Citizen' - #url_prefix = 'http://www.ottawacitizen.com' - #description = u'News from Ottawa, ON' - - # un-comment the following three lines for the Montreal Gazette - #title = u'Montreal Gazette' - #url_prefix = 'http://www.montrealgazette.com' - #description = u'News from Montreal, QC' + # un-comment the following six lines for the Montreal Gazette +## title = u'Montreal Gazette' +## url_prefix = 'http://www.montrealgazette.com' +## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' +## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = 
[dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + + def get_cover_url(self): + from datetime import timedelta, datetime, date + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + daysback=1 + try: + br.open(cover) + except: + while daysback<7: + cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg' + br = BasicNewsRecipe.get_browser() + try: + br.open(cover) + except: + daysback = daysback+1 + continue + break + if daysback==7: + self.log("\nCover unavailable") + cover = None + return cover + + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = 
create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + + def fixChars(self,string): + # Replace lsquo (\x91) + fixed = re.sub("\x91","‘",string) + # Replace rsquo (\x92) + fixed = re.sub("\x92","’",fixed) + # Replace ldquo (\x93) + fixed = re.sub("\x93","“",fixed) + # Replace rdquo (\x94) + fixed = re.sub("\x94","”",fixed) + # Replace ndash (\x96) + fixed = re.sub("\x96","–",fixed) + # Replace mdash (\x97) + fixed = re.sub("\x97","—",fixed) + fixed = re.sub("’","’",fixed) + return fixed + + def massageNCXText(self, description): + # Kindle TOC descriptions won't render certain characters + if description: + massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) + return self.fixChars(massaged) + else: + return description + + def populate_article_metadata(self, article, soup, first): + if first: + picdiv = soup.find('body').find('img') + if picdiv is not None: + self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) + xtitle = article.text_summary.strip() + if len(xtitle) == 0: + desc = soup.find('meta',attrs={'property':'og:description'}) + if desc is not None: + article.summary = article.text_summary = desc['content'] + + def strip_anchors(self,soup): + paras = soup.findAll(True) + for para in paras: + aTags = para.findAll('a') + for a in aTags: + if a.img is None: + a.replaceWith(a.renderContents().decode('cp1252','replace')) + return soup + + def preprocess_html(self,soup): - #delete iempty id attributes--they screw up the TOC for unknow reasons + #delete empty id attributes--they screw up the TOC for unknown reasons divtags = soup.findAll('div',attrs={'id':''}) if divtags: for div in divtags: del(div['id']) - return soup + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if 
(soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() + return self.strip_anchors(soup) + + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - 
articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans + diff --git a/recipes/vancouver_sun.recipe b/recipes/vancouver_sun.recipe index 
98926e4ad8..4cc3c478e4 100644 --- a/recipes/vancouver_sun.recipe +++ b/recipes/vancouver_sun.recipe @@ -1,105 +1,141 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - __license__ = 'GPL v3' ''' www.canada.com ''' - -import re +import string, re +from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup + +import string, re +from calibre import strftime +from calibre.web.feeds.recipes import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class CanWestPaper(BasicNewsRecipe): - # un-comment the following four lines for the Victoria Times Colonist -## title = u'Victoria Times Colonist' -## url_prefix = 'http://www.timescolonist.com' -## description = u'News from Victoria, BC' -## fp_tag = 'CAN_TC' + postmedia_index_pages = [ + (u'Headlines',u'/index.html'), + (u'Ottawa & Area',u'/news/ottawa/index.html'), + (u'Vancouver',u'/news/vancouver/index.html'), + (u'Calgary',u'/news/calgary/index.html'), + (u'Edmonton',u'/news/edmonton/index.html'), + (u'Montreal',u'/news/montreal/index.html'), + (u'Fraser Valley',u'/news/fraser-valley/index.html'), + (u'British Columbia',u'/news/bc/index.html'), + (u'Alberta',u'/news/alberta/index.html'), + (u'Canada',u'/news/canada/index.html'), + (u'National',u'/news/national/index.html'), + (u'Politics',u'/news/politics/index.html'), + (u'Insight',u'/news/insight/index.html'), + (u'Special Reports',u'/news/specialreports/index.html'), + (u'Gangs',u'/news/gangs/index.html'), + (u'Education',u'/news/education/index.html'), + (u'Health',u'/news/health/index.html'), + (u'Environment',u'/news/environment/index.html'), + (u'World',u'/news/world/index.html'), + (u'Police Blotter',u'/news/crime-and-justice/index.html'), + (u'Crime',u'/news/blotter/index.html'), + (u'Around Town',u'/news/topic.html?t=keyword&q=Around+Town'), + (u'Diplomatica',u'/news/diplomatica/index.html'), + (u'Opinion',u'/opinion/index.html'), + 
(u'Columnists',u'/columnists/index.html'), + (u'Editorials',u'/opinion/editorials/index.html'), + (u'Letters',u'/opinion/letters/index.html'), + (u'Business',u'/business/index.html'), + (u'Sports',u'/sports/index.html'), + (u'Arts',u'/entertainment/index.html'), + (u'Life',u'/life/index.html'), + (u'Technology',u'/technology/index.html'), + (u'Travel',u'/travel/index.html'), + (u'Health',u'/health/index.html') + ] - # un-comment the following four lines for the Vancouver Province + + # un-comment the following six lines for the Vancouver Province ## title = u'Vancouver Province' ## url_prefix = 'http://www.theprovince.com' ## description = u'News from Vancouver, BC' -## fp_tag = 'CAN_VP' +## std_logo_url = 'http://www.theprovince.com/images/logo_theprovince.jpg' +## logo_url = 'vplogo.jpg' +## fp_tag = 'CAN_TP' - # un-comment the following four lines for the Vancouver Sun + # un-comment the following six lines for the Vancouver Sun title = u'Vancouver Sun' url_prefix = 'http://www.vancouversun.com' description = u'News from Vancouver, BC' + std_logo_url = 'http://www.vancouversun.com/images/logo_vancouversun.jpg' + logo_url = 'vslogo.jpg' fp_tag = 'CAN_VS' - # un-comment the following four lines for the Edmonton Journal -## title = u'Edmonton Journal' -## url_prefix = 'http://www.edmontonjournal.com' -## description = u'News from Edmonton, AB' -## fp_tag = 'CAN_EJ' - - # un-comment the following four lines for the Calgary Herald + # un-comment the following six lines for the Calgary Herald ## title = u'Calgary Herald' ## url_prefix = 'http://www.calgaryherald.com' ## description = u'News from Calgary, AB' +## std_logo_url = 'http://www.calgaryherald.com/images/logo_calgaryherald.jpg' +## logo_url = 'chlogo.jpg' ## fp_tag = 'CAN_CH' - # un-comment the following four lines for the Regina Leader-Post -## title = u'Regina Leader-Post' -## url_prefix = 'http://www.leaderpost.com' -## description = u'News from Regina, SK' -## fp_tag = '' + # un-comment the following six 
lines for the Edmonton Journal +## title = u'Edmonton Journal' +## url_prefix = 'http://www.edmontonjournal.com' +## description = u'News from Edmonton, AB' +## std_logo_url = 'http://www.edmontonjournal.com/images/logo_edmontonjournal.jpg' +## logo_url = 'ejlogo.jpg' +## fp_tag = 'CAN_EJ' - # un-comment the following four lines for the Saskatoon Star-Phoenix -## title = u'Saskatoon Star-Phoenix' -## url_prefix = 'http://www.thestarphoenix.com' -## description = u'News from Saskatoon, SK' -## fp_tag = '' - - # un-comment the following four lines for the Windsor Star -## title = u'Windsor Star' -## url_prefix = 'http://www.windsorstar.com' -## description = u'News from Windsor, ON' -## fp_tag = 'CAN_' - - # un-comment the following four lines for the Ottawa Citizen + # un-comment the following six lines for the Ottawa Citizen ## title = u'Ottawa Citizen' ## url_prefix = 'http://www.ottawacitizen.com' -## description = u'News from Ottawa, ON' +## description = u'News from Ottawa, ON' +## std_logo_url = 'http://www.ottawacitizen.com/images/logo_ottawacitizen.jpg' +## logo_url = 'oclogo.jpg' ## fp_tag = 'CAN_OC' - # un-comment the following four lines for the Montreal Gazette + # un-comment the following six lines for the Montreal Gazette ## title = u'Montreal Gazette' ## url_prefix = 'http://www.montrealgazette.com' ## description = u'News from Montreal, QC' +## std_logo_url = 'http://www.montrealgazette.com/images/logo_montrealgazette.jpg' +## logo_url = 'mglogo.jpg' ## fp_tag = 'CAN_MG' + Kindle_Fire=False + masthead_url = std_logo_url + url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True - timefmt = ' [%b %d]' + timefmt = ' [%b %d]' + encoding = 'utf-8' extra_css = ''' .timestamp { font-size:xx-small; display: block; } #storyheader { font-size: medium; } #storyheader h1 { font-size: x-large; } - #storyheader h2 { font-size: large; font-style: italic; } + #storyheader h2 { font-size: small; font-style: italic; } .byline { 
font-size:xx-small; } - #photocaption { font-size: small; font-style: italic } - #photocredit { font-size: xx-small; }''' - keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})] + #photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + .photocaption { font-size: xx-small; font-style: italic; font-weight: normal; } + #photocredit { font-size: xx-small; font-weight: normal; }''' + + keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'}),dict(name='div', attrs={'id':'storyphoto'})] + remove_tags = [{'class':'comments'}, dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}), + dict(name='h2', attrs={'id':'photocredit'}), dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}), dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}), dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}), dict(name='div', attrs={'class':'rule_grey_solid'}), dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})] + def get_cover_url(self): - from datetime import timedelta, date - if self.fp_tag=='': - return None + from datetime import timedelta, datetime, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg' br = BasicNewsRecipe.get_browser() daysback=1 @@ -120,6 +156,19 @@ class CanWestPaper(BasicNewsRecipe): cover = None return cover + def prepare_masthead_image(self, path_to_image, out_path): + if self.Kindle_Fire: + from calibre import fit_image + from calibre.utils.magick import Image, create_canvas + img = Image() + img.open(path_to_image) + width, height = img.size + img2 = create_canvas(width, height) + img2.compose(img) + img2.save(out_path) + else: + 
BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path) + def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) @@ -166,55 +215,106 @@ class CanWestPaper(BasicNewsRecipe): a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup - def preprocess_html(self, soup): + + def preprocess_html(self,soup): + #delete empty id attributes--they screw up the TOC for unknown reasons + divtags = soup.findAll('div',attrs={'id':''}) + if divtags: + for div in divtags: + del(div['id']) + + pgall = soup.find('div',attrs={'id':'storyphoto'}) + if pgall is not None: # photo gallery perhaps + if (soup.find('div',attrs={'id':'storycontent'}) is None): + allpics = Tag(soup,'div') + first_img = pgall.find('div','storyimage') + if first_img is not None: + first_img.extract() + tlist = pgall.find('div',attrs={'id':'relatedthumbs'}) + if tlist is not None: + for atag in tlist.findAll('a'): + img = Tag(soup,'img') + srcpre, sep, srcpost = atag.img['src'].partition('?') + img['src'] = srcpre + pdesc = Tag(soup,'p') + pdesc.insert(0,atag.img['alt']) + pdesc['class']='photocaption' + div = Tag(soup,'div') + div.insert(0,pdesc) + div.insert(0,img) + allpics.append(div) + pgall.replaceWith(allpics) + + for pg in soup.findAll('div',attrs={'id':'storyphoto'}): + pg.extract() return self.strip_anchors(soup) - + def parse_index(self): - soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html') articles = {} - key = 'News' - ans = ['News'] + ans = [] - # Find each instance of class="sectiontitle", class="featurecontent" - for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}): - #self.log(" div class = %s" % divtag['class']) - if divtag['class'].startswith('section_title'): - # div contains section title - if not divtag.h3: - continue - key = self.tag_to_string(divtag.h3,False) - ans.append(key) - self.log("Section name %s" % key) - continue - # div contains article data - h1tag = 
divtag.find('h1') - if not h1tag: - continue - atag = h1tag.find('a',href=True) - if not atag: - continue - url = self.url_prefix+'/news/todays-paper/'+atag['href'] - #self.log("Section %s" % key) - #self.log("url %s" % url) - title = self.tag_to_string(atag,False) - #self.log("title %s" % title) - pubdate = '' - description = '' - ptag = divtag.find('p'); - if ptag: - description = self.tag_to_string(ptag,False) - #self.log("description %s" % description) - author = '' - autag = divtag.find('h4') - if autag: - author = self.tag_to_string(autag,False) - #self.log("author %s" % author) - if not articles.has_key(key): - articles[key] = [] - articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content='')) + def handle_article(adiv,key): + h1tag = adiv.h1 + if h1tag is not None: + atag = h1tag.a + if atag is not None: + url = atag['href'] + if atag['href'].startswith('http'): + return + elif atag['href'].startswith('/'): + url = self.url_prefix+atag['href'] + else: + url = self.url_prefix+'/'+atag['href'] + if url in self.url_list: + return + self.url_list.append(url) + title = self.tag_to_string(atag,False) + if 'VIDEO' in title.upper(): + return + if 'GALLERY' in title.upper(): + return + if 'PHOTOS' in title.upper(): + return + dtag = adiv.find('div','content') + description='' + print("URL "+url) + print("TITLE "+title) + if dtag is not None: + stag = dtag.span + if stag is not None: + if stag['class'] != 'timestamp': + description = self.tag_to_string(stag,False) + else: + description = self.tag_to_string(dtag,False) + print("DESCRIPTION: "+description) + if not articles.has_key(key): + articles[key] = [] + articles[key].append(dict(title=title,url=url,date='',description=description,author='',content='')) + + def parse_web_index(key, keyurl): + try: + soup = self.index_to_soup(self.url_prefix+keyurl) + except: + return + ans.append(key) + mainsoup = soup.find('div','bodywrapper') + footer = 
mainsoup.find(attrs={'id':'footerfeature'}) + if footer is not None: + footer.extract() + print("Section: "+key) + for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('^HorizontalFeatureSlider_1_Story')}): + handle_article(wdiv,key) + wdiv.extract() + for wdiv in mainsoup.findAll(attrs={'id':['featurewidget','textfeature','textlinks_timestamp']}): + for adiv in wdiv.findAll('div','featurecontent'): + handle_article(adiv,key) + + for (k,url) in self.postmedia_index_pages: + parse_web_index(k,url) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans +