#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function __license__ = 'GPL v3' ''' www.canada.com ''' import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag def new_tag(soup, name, attrs=()): impl = getattr(soup, 'new_tag', None) if impl is not None: return impl(name, attrs=dict(attrs)) return Tag(soup, name, attrs=attrs or None) class TimesColonist(BasicNewsRecipe): # Customization -- remove sections you don't want. # If your e-reader is an e-ink Kindle and your output profile is # set properly this recipe will not include images because the # resulting file is too large. If you have one of these and want # images you can set kindle_omit_images = False # and remove sections (typically the e-ink Kindles will # work with about a dozen of these, but your mileage may vary). kindle_omit_images = True section_list = [ ('', 'Web Front Page'), ('news/', 'News Headlines'), ('news/b-c/', 'BC News'), ('news/national/', 'National News'), ('news/world/', 'World News'), ('opinion/', 'Opinion'), ('opinion/letters/', 'Letters'), ('business/', 'Business'), ('business/money/', 'Money'), ('business/technology/', 'Technology'), ('business/working/', 'Working'), ('sports/', 'Sports'), ('sports/hockey/', 'Hockey'), ('sports/football/', 'Football'), ('sports/basketball/', 'Basketball'), ('sports/golf/', 'Golf'), ('entertainment/', 'entertainment'), ('entertainment/go/', 'Go!'), ('entertainment/music/', 'Music'), ('entertainment/books/', 'Books'), ('entertainment/Movies/', 'Movies'), ('entertainment/television/', 'Television'), ('life/', 'Life'), ('life/health/', 'Health'), ('life/travel/', 'Travel'), ('life/driving/', 'Driving'), ('life/homes/', 'Homes'), ('life/food-drink/', 'Food & Drink') ] title = u'Victoria Times Colonist' url_prefix = 'http://www.timescolonist.com' description = u'News from Victoria, BC' fp_tag = 'CAN_TC' masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png' url_list = [] language = 'en_CA' __author__ = 'Nick Redding' no_stylesheets = True timefmt = ' [%b %d]' encoding = 'utf-8' extra_css = ''' .byline { font-size:xx-small; font-weight: bold;} h3 { margin-bottom: 6px; } .caption { font-size: xx-small; font-style: italic; font-weight: normal; } ''' keep_only_tags = [ dict(name='div', attrs={'class': re.compile('main.content')})] def __init__(self, options, log, progress_reporter): self.remove_tags = [{'class': 'comments'}, {'id': 'photocredit'}, dict(name='div', attrs={ 'class': re.compile('top.controls')}), dict(name='div', attrs={ 'class': re.compile('^comments')}), dict(name='div', attrs={ 'class': re.compile('social')}), dict(name='div', attrs={ 'class': re.compile('tools')}), dict(name='div', attrs={ 'class': re.compile('bottom.tools')}), dict(name='div', attrs={ 'class': re.compile('window')}), dict(name='div', attrs={'class': re.compile('related.news.element')})] print("PROFILE NAME = " + options.output_profile.short_name) if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']: self.remove_tags.append( dict(name='div', attrs={'class': re.compile('image-container')})) BasicNewsRecipe.__init__(self, options, log, progress_reporter) def get_cover_url(self): from datetime import timedelta, date cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \ str(date.today().day) + '/lg/' + self.fp_tag + '.jpg' br = BasicNewsRecipe.get_browser(self) daysback = 1 try: br.open(cover) except: while daysback < 7: cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \ str((date.today() - timedelta(days=daysback)).day) + \ '/lg/' + self.fp_tag + '.jpg' br = BasicNewsRecipe.get_browser(self) try: br.open(cover) except: daysback = daysback + 1 continue break if daysback == 7: self.log("\nCover unavailable") cover = None return cover def fixChars(self, string): # Replace lsquo (\x91) fixed = re.sub("\x91", "‘", string) # Replace rsquo (\x92) fixed = re.sub("\x92", "’", fixed) # Replace ldquo (\x93) fixed = re.sub("\x93", "“", fixed) # Replace rdquo (\x94) fixed = re.sub("\x94", "”", fixed) # Replace ndash (\x96) fixed = re.sub("\x96", "–", fixed) # Replace mdash (\x97) fixed = re.sub("\x97", "—", fixed) fixed = re.sub("’", "’", fixed) return fixed def massageNCXText(self, description): return description def populate_article_metadata(self, article, soup, first): if first: picdiv = soup.find('body').find('img') if picdiv is not None: self.add_toc_thumbnail(article, re.sub( r'links\\link\d+\\', '', picdiv['src'])) xtitle = article.text_summary.strip() if len(xtitle) == 0: desc = soup.find('meta', attrs={'property': 'og:description'}) if desc is not None: article.summary = article.text_summary = desc['content'] def strip_anchors(self, soup): paras = soup.findAll(True) for para in paras: aTags = para.findAll('a') for a in aTags: if a.img is None: a.replaceWith(a.renderContents().decode( 'cp1252', 'replace')) return soup def preprocess_html(self, soup): byline = soup.find('p', attrs={'class': re.compile('ancillary')}) if byline is not None: authstr = self.tag_to_string(byline, False) authstr = re.sub('/ *Times Colonist', '/', authstr, flags=re.IGNORECASE) authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE) newdiv = new_tag(soup, 'div') newdiv.insert(0, authstr) newdiv['class'] = 'byline' byline.replaceWith(newdiv) for caption in soup.findAll('p', attrs={'class': re.compile('caption')}): capstr = self.tag_to_string(caption, False) capstr = re.sub('Photograph by.*$', '', capstr, flags=re.IGNORECASE) newdiv = new_tag(soup, 'div') newdiv.insert(0, capstr) newdiv['class'] = 'caption' caption.replaceWith(newdiv) for ptag in soup.findAll('p'): ptext = self.tag_to_string( ptag, use_alt=False, normalize_whitespace=True) ptext = re.sub(r'\s+', '', ptext) if (ptext == '') or (ptext == ' '): ptag.extract() return self.strip_anchors(soup) raeside = False def handle_articles(self, htag, article_list, sectitle): atag = htag.a if atag is not None: url = atag['href'] url = url.strip() # print("Checking >>"+url+'<<\n\r') if url.startswith('/'): url = self.url_prefix + url if url in self.url_list: return self.url_list.append(url) title = self.tag_to_string(atag, False) if 'VIDEO' in title.upper(): return if 'GALLERY' in title.upper(): return if 'PHOTOS' in title.upper(): return if 'RAESIDE' in title.upper(): if self.raeside: return self.raeside = True dtag = htag.findNext('p') description = '' if dtag is not None: description = self.tag_to_string(dtag, False) article_list.append(dict( title=title, url=url, date='', description=description, author='', content='')) print(sectitle + title + ": description = " + description + " URL=" + url + '\n\r') def add_section_index(self, ans, securl, sectitle): print("Add section url=" + self.url_prefix + '/' + securl + '\n\r') try: soup = self.index_to_soup(self.url_prefix + '/' + securl) except: return ans mainsoup = soup.find( 'div', attrs={'class': re.compile('main.content')}) article_list = [] for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}): for htag in wdiv.findAll('h3'): self.handle_articles(htag, article_list, sectitle) for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}): for wdiv in mainsoup.findAll('div', attrs={'class': re.compile('article.row')}): for htag in wdiv.findAll('h2'): self.handle_articles(htag, article_list, sectitle) ans.append((sectitle, article_list)) return ans def parse_index(self): ans = [] for (url, title) in self.section_list: ans = self.add_section_index(ans, url, title) return ans