#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'

'''
www.timescolonist.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup

class TimesColonist(BasicNewsRecipe):

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly, this recipe will not include images, because the
    # resulting file would be too large. If you have one of these
    # devices and want images, set kindle_omit_images = False and
    # remove sections (typically the e-ink Kindles will work with
    # about a dozen of these, but your mileage may vary); see the
    # example after section_list below.

    kindle_omit_images = True

    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
        ('news/b-c/', 'BC News'),
        ('news/national/', 'National News'),
        ('news/world/', 'World News'),
        ('opinion/', 'Opinion'),
        ('opinion/letters/', 'Letters'),
        ('business/', 'Business'),
        ('business/money/', 'Money'),
        ('business/technology/', 'Technology'),
        ('business/working/', 'Working'),
        ('sports/', 'Sports'),
        ('sports/hockey/', 'Hockey'),
        ('sports/football/', 'Football'),
        ('sports/basketball/', 'Basketball'),
        ('sports/golf/', 'Golf'),
        ('entertainment/', 'Entertainment'),
        ('entertainment/go/', 'Go!'),
        ('entertainment/music/', 'Music'),
        ('entertainment/books/', 'Books'),
        ('entertainment/Movies/', 'Movies'),
        ('entertainment/television/', 'Television'),
        ('life/', 'Life'),
        ('life/health/', 'Health'),
        ('life/travel/', 'Travel'),
        ('life/driving/', 'Driving'),
        ('life/homes/', 'Homes'),
        ('life/food-drink/', 'Food & Drink'),
    ]

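    # For example, a trimmed-down build for an e-ink Kindle might keep
    # only a handful of sections, e.g.:
    #     section_list = [('', 'Web Front Page'), ('news/b-c/', 'BC News')]
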
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
        .byline { font-size:xx-small; font-weight: bold;}
        h3 { margin-bottom: 6px; }
        .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    '''
    keep_only_tags = [dict(name='div', attrs={'class': re.compile('main.content')})]

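    # remove_tags is assembled at run time rather than at class level so
    # that image stripping can depend on the selected output profile.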
    def __init__(self, options, log, progress_reporter):
        self.remove_tags = [
            {'class': 'comments'},
            {'id': 'photocredit'},
            dict(name='div', attrs={'class': re.compile('top.controls')}),
            dict(name='div', attrs={'class': re.compile('^comments')}),
            dict(name='div', attrs={'class': re.compile('social')}),
            dict(name='div', attrs={'class': re.compile('tools')}),
            dict(name='div', attrs={'class': re.compile('bottom.tools')}),
            dict(name='div', attrs={'class': re.compile('window')}),
            dict(name='div', attrs={'class': re.compile('related.news.element')}),
        ]
        print("PROFILE NAME = " + options.output_profile.short_name)
        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
            self.remove_tags.append(dict(name='div', attrs={'class': re.compile('image-container')}))
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

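    # Take the cover from the Newseum front-page image archive; if
    # today's image is missing, look back up to six days for one.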
    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + str((date.today() - timedelta(days=daysback)).day) + '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

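    # On the Kindle Fire the masthead is composited onto a fresh canvas,
    # which flattens any transparency; other profiles use the default.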
    def prepare_masthead_image(self, path_to_image, out_path):
        # Kindle_Fire is never set by this recipe, so treat it as False
        # when absent instead of raising AttributeError.
        if getattr(self, 'Kindle_Fire', False):
            from calibre.utils.magick import Image, create_canvas
            img = Image()
            img.open(path_to_image)
            width, height = img.size
            img2 = create_canvas(width, height)
            img2.compose(img)
            img2.save(out_path)
        else:
            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)

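    # The site serves cp1252 "smart" punctuation in the \x91-\x97 control
    # range; map those bytes to their proper Unicode equivalents.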
    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        # Replace rsquo mojibake ("’" is "’" in UTF-8 misread as cp1252)
        fixed = re.sub("’", "’", fixed)
        return fixed

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&amp;'
            massaged = re.sub("&", "&amp;", massaged)
            return self.fixChars(massaged)
        else:
            return description

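    # Use the article's first image as the TOC thumbnail; if the text
    # summary is empty, fall back to the og:description meta tag.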
    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(r'links\\link\d+\\', '', picdiv['src']))
            xtitle = article.text_summary.strip()
            if len(xtitle) == 0:
                desc = soup.find('meta', attrs={'property': 'og:description'})
                if desc is not None:
                    article.summary = article.text_summary = desc['content']

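    # Replace text-only links with their plain text; links that wrap an
    # image are left intact.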
    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252', 'replace'))
        return soup

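    # Normalize bylines and photo captions into simple styled divs and
    # drop empty paragraphs before stripping anchors.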
    def preprocess_html(self, soup):
        byline = soup.find('p', attrs={'class': re.compile('ancillary')})
        if byline is not None:
            authstr = self.tag_to_string(byline, False)
            authstr = re.sub('/ *Times Colonist', '/', authstr, flags=re.IGNORECASE)
            authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
            newdiv = Tag(soup, 'div')
            newdiv.insert(0, authstr)
            newdiv['class'] = 'byline'
            byline.replaceWith(newdiv)
        for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
            capstr = self.tag_to_string(caption, False)
            capstr = re.sub('Photograph by.*$', '', capstr, flags=re.IGNORECASE)
            newdiv = Tag(soup, 'div')
            newdiv.insert(0, capstr)
            newdiv['class'] = 'caption'
            caption.replaceWith(newdiv)
        for ptag in soup.findAll('p'):
            ptext = self.tag_to_string(ptag, use_alt=False, normalize_whitespace=True)
            ptext = re.sub(r'\s+', '', ptext)
            # all whitespace was removed above, so a blank paragraph is ''
            if ptext == '':
                ptag.extract()
        return self.strip_anchors(soup)

    # Only the first Raeside cartoon found is included.
    raeside = False

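    # Collect one article from a headline tag, skipping videos,
    # galleries, photo features and previously seen URLs.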
    def handle_articles(self, htag, article_list, sectitle):
        atag = htag.a
        if atag is not None:
            url = atag['href'].strip()
            # print("Checking >>" + url + '<<\n\r')
            if url.startswith('/'):
                url = self.url_prefix + url
            if url in self.url_list:
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag, False)
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            if 'RAESIDE' in title.upper():
                if self.raeside:
                    return
                self.raeside = True
            dtag = htag.findNext('p')
            description = ''
            if dtag is not None:
                description = self.tag_to_string(dtag, False)
            article_list.append(dict(title=title, url=url, date='', description=description, author='', content=''))
            print(sectitle + title + ": description = " + description + " URL=" + url + '\n\r')

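    # Fetch one section page and append its featured and leading
    # articles to the index under sectitle.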
    def add_section_index(self, ans, securl, sectitle):
        print("Add section url=" + self.url_prefix + '/' + securl + '\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix + '/' + securl)
        except:
            return ans
        mainsoup = soup.find('div', attrs={'class': re.compile('main.content')})
        article_list = []
        for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
            for htag in wdiv.findAll('h3'):
                self.handle_articles(htag, article_list, sectitle)
        for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
            # confine the search to this leading-articles block
            for wdiv in ladiv.findAll('div', attrs={'class': re.compile('article.row')}):
                for htag in wdiv.findAll('h2'):
                    self.handle_articles(htag, article_list, sectitle)
        ans.append((sectitle, article_list))
        return ans

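    # The index is simply every section in section_list, in order.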
    def parse_index(self):
        ans = []
        for (url, title) in self.section_list:
            ans = self.add_section_index(ans, url, title)
        return ans