#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'

'''
www.canada.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


class TimesColonist(BasicNewsRecipe):

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly this recipe will not include images because the
    # resulting file is too large. If you have one of these and want
    # images you can set kindle_omit_images = False
    # and remove sections (typically the e-ink Kindles will
    # work with about a dozen of these, but your mileage may vary).

    kindle_omit_images = True

    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
        ('news/b-c/', 'BC News'),
        ('news/national/', 'National News'),
        ('news/world/', 'World News'),
        ('opinion/', 'Opinion'),
        ('opinion/letters/', 'Letters'),
        ('business/', 'Business'),
        ('business/money/', 'Money'),
        ('business/technology/', 'Technology'),
        ('business/working/', 'Working'),
        ('sports/', 'Sports'),
        ('sports/hockey/', 'Hockey'),
        ('sports/football/', 'Football'),
        ('sports/basketball/', 'Basketball'),
        ('sports/golf/', 'Golf'),
        ('entertainment/', 'entertainment'),
        ('entertainment/go/', 'Go!'),
        ('entertainment/music/', 'Music'),
        ('entertainment/books/', 'Books'),
        ('entertainment/Movies/', 'Movies'),
        ('entertainment/television/', 'Television'),
        ('life/', 'Life'),
        ('life/health/', 'Health'),
        ('life/travel/', 'Travel'),
        ('life/driving/', 'Driving'),
        ('life/homes/', 'Homes'),
        ('life/food-drink/', 'Food & Drink'),
    ]

    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
                .byline { font-size:xx-small; font-weight: bold;}
                h3 { margin-bottom: 6px; }
                .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
                '''
    keep_only_tags = [dict(name='div', attrs={'class': re.compile('main.content')})]

    def __init__(self, options, log, progress_reporter):
        self.remove_tags = [
            {'class': 'comments'},
            {'id': 'photocredit'},
            dict(name='div', attrs={'class': re.compile('top.controls')}),
            dict(name='div', attrs={'class': re.compile('^comments')}),
            dict(name='div', attrs={'class': re.compile('social')}),
            dict(name='div', attrs={'class': re.compile('tools')}),
            dict(name='div', attrs={'class': re.compile('bottom.tools')}),
            dict(name='div', attrs={'class': re.compile('window')}),
            dict(name='div', attrs={'class': re.compile('related.news.element')}),
        ]
        print("PROFILE NAME = " + options.output_profile.short_name)
        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
            self.remove_tags.append(dict(name='div', attrs={'class': re.compile('image-container')}))
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str((date.today() - timedelta(days=daysback)).day) + '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
            if daysback == 7:
self.log("\nCover unavailable") cover = None return cover def prepare_masthead_image(self, path_to_image, out_path): if self.Kindle_Fire: from calibre.utils.magick import Image, create_canvas img = Image() img.open(path_to_image) width, height = img.size img2 = create_canvas(width, height) img2.compose(img) img2.save(out_path) else: BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path) def fixChars(self,string): # Replace lsquo (\x91) fixed = re.sub("\x91","‘",string) # Replace rsquo (\x92) fixed = re.sub("\x92","’",fixed) # Replace ldquo (\x93) fixed = re.sub("\x93","“",fixed) # Replace rdquo (\x94) fixed = re.sub("\x94","”",fixed) # Replace ndash (\x96) fixed = re.sub("\x96","–",fixed) # Replace mdash (\x97) fixed = re.sub("\x97","—",fixed) fixed = re.sub("’","’",fixed) return fixed def massageNCXText(self, description): # Kindle TOC descriptions won't render certain characters if description: massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) # Replace '&' with '&' massaged = re.sub("&","&", massaged) return self.fixChars(massaged) else: return description def populate_article_metadata(self, article, soup, first): if first: picdiv = soup.find('body').find('img') if picdiv is not None: self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src'])) xtitle = article.text_summary.strip() if len(xtitle) == 0: desc = soup.find('meta',attrs={'property':'og:description'}) if desc is not None: article.summary = article.text_summary = desc['content'] def strip_anchors(self,soup): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: a.replaceWith(a.renderContents().decode('cp1252','replace')) return soup def preprocess_html(self,soup): byline = soup.find('p',attrs={'class':re.compile('ancillary')}) if byline is not None: authstr = self.tag_to_string(byline,False) authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE) authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE) newdiv = Tag(soup,'div') newdiv.insert(0,authstr) newdiv['class']='byline' byline.replaceWith(newdiv) for caption in soup.findAll('p',attrs={'class':re.compile('caption')}): capstr = self.tag_to_string(caption,False) capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE) newdiv = Tag(soup,'div') newdiv.insert(0,capstr) newdiv['class']='caption' caption.replaceWith(newdiv) for ptag in soup.findAll('p'): ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True) ptext = re.sub(r'\s+','', ptext) if (ptext=='') or (ptext==' '): ptag.extract() return self.strip_anchors(soup) raeside = False def handle_articles(self,htag,article_list,sectitle): atag = htag.a if atag is not None: url = atag['href'] url = url.strip() # print("Checking >>"+url+'<<\n\r') if url.startswith('/'): url = self.url_prefix+url if url in self.url_list: return self.url_list.append(url) title = self.tag_to_string(atag,False) if 'VIDEO' in title.upper(): return if 'GALLERY' in title.upper(): return if 'PHOTOS' in title.upper(): return if 'RAESIDE' in title.upper(): if self.raeside: return self.raeside = True dtag = htag.findNext('p') description='' if dtag is not None: description = self.tag_to_string(dtag,False) article_list.append(dict(title=title,url=url,date='',description=description,author='',content='')) print(sectitle+title+": description = "+description+" URL="+url+'\n\r') def add_section_index(self,ans,securl,sectitle): print("Add section url="+self.url_prefix+'/'+securl+'\n\r') try: 
            soup = self.index_to_soup(self.url_prefix + '/' + securl)
        except:
            return ans
        mainsoup = soup.find('div', attrs={'class': re.compile('main.content')})
        article_list = []
        for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
            for htag in wdiv.findAll('h3'):
                self.handle_articles(htag, article_list, sectitle)
        for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
            # Search for article rows inside this leading-articles block rather
            # than rescanning the whole page for every block.
            for wdiv in ladiv.findAll('div', attrs={'class': re.compile('article.row')}):
                for htag in wdiv.findAll('h2'):
                    self.handle_articles(htag, article_list, sectitle)
        ans.append((sectitle, article_list))
        return ans

    def parse_index(self):
        ans = []
        for (url, title) in self.section_list:
            ans = self.add_section_index(ans, url, title)
        return ans
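
# A quick way to test changes to this recipe (for example, after trimming
# section_list or flipping kindle_omit_images) is calibre's command-line
# converter. The file name below is an assumption -- use whatever name you
# saved this recipe under:
#
#   ebook-convert victoria_times_colonist.recipe .epub --test -vv
#
# --test limits the download to a small subset of articles, and -vv prints the
# "PROFILE NAME = ..." and "Add section url=..." diagnostics emitted above.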