#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

__license__ = 'GPL v3'

'''
www.canada.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def new_tag(soup, name, attrs=()):
    # BeautifulSoup 4 exposes soup.new_tag(); fall back to constructing a
    # Tag directly for BeautifulSoup 3.
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)
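
# A minimal usage sketch (not part of the recipe): this is how the helper is
# used in preprocess_html below to build a replacement element on either
# BeautifulSoup version:
#
#     div = new_tag(soup, 'div')
#     div['class'] = 'byline'
#     byline.replaceWith(div)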


class TimesColonist(BasicNewsRecipe):

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is set
    # properly, this recipe will not include images, because the resulting
    # file would be too large. If you have one of these and want images,
    # set kindle_omit_images = False and remove sections (typically the
    # e-ink Kindles will work with about a dozen of them, but your mileage
    # may vary).

    kindle_omit_images = True

    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
        ('news/b-c/', 'BC News'),
        ('news/national/', 'National News'),
        ('news/world/', 'World News'),
        ('opinion/', 'Opinion'),
        ('opinion/letters/', 'Letters'),
        ('business/', 'Business'),
        ('business/money/', 'Money'),
        ('business/technology/', 'Technology'),
        ('business/working/', 'Working'),
        ('sports/', 'Sports'),
        ('sports/hockey/', 'Hockey'),
        ('sports/football/', 'Football'),
        ('sports/basketball/', 'Basketball'),
        ('sports/golf/', 'Golf'),
        ('entertainment/', 'Entertainment'),
        ('entertainment/go/', 'Go!'),
        ('entertainment/music/', 'Music'),
        ('entertainment/books/', 'Books'),
        ('entertainment/Movies/', 'Movies'),
        ('entertainment/television/', 'Television'),
        ('life/', 'Life'),
        ('life/health/', 'Health'),
        ('life/travel/', 'Travel'),
        ('life/driving/', 'Driving'),
        ('life/homes/', 'Homes'),
        ('life/food-drink/', 'Food & Drink'),
    ]
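
    # For example (a hypothetical trimmed configuration): to keep the build
    # small enough for an e-ink Kindle you might cut the list above down to
    # something like
    #
    #     section_list = [
    #         ('', 'Web Front Page'),
    #         ('news/', 'News Headlines'),
    #         ('opinion/', 'Opinion'),
    #     ]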

    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    url_list = []
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
        .byline { font-size: xx-small; font-weight: bold; }
        h3 { margin-bottom: 6px; }
        .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    '''
    # The unescaped '.' in the pattern matches any character, so this covers
    # class names such as 'main content' and 'main-content'.
    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile('main.content')})]

    def __init__(self, options, log, progress_reporter):
        # remove_tags is built here rather than at class level so that it
        # can depend on the output profile chosen at run time.
        self.remove_tags = [
            {'class': 'comments'},
            {'id': 'photocredit'},
            dict(name='div', attrs={'class': re.compile('top.controls')}),
            dict(name='div', attrs={'class': re.compile('^comments')}),
            dict(name='div', attrs={'class': re.compile('social')}),
            dict(name='div', attrs={'class': re.compile('tools')}),
            dict(name='div', attrs={'class': re.compile('bottom.tools')}),
            dict(name='div', attrs={'class': re.compile('window')}),
            dict(name='div', attrs={'class': re.compile('related.news.element')}),
        ]
        print("PROFILE NAME = " + options.output_profile.short_name)
        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
            self.remove_tags.append(
                dict(name='div', attrs={'class': re.compile('image-container')}))
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

    def get_cover_url(self):
        from datetime import timedelta, date
        # The URL is keyed by day of month and paper tag; if today's
        # front-page image is missing, fall back up to a week.
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
            str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except Exception:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                    str((date.today() - timedelta(days=daysback)).day) + \
                    '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except Exception:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover
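
    # For reference, the URL built above has this shape (e.g. on the 13th
    # of the month, with fp_tag 'CAN_TC'):
    #
    #     http://webmedia.newseum.org/newseum-multimedia/dfp/jpg13/lg/CAN_TC.jpg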

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        # Replace a UTF-8 rsquo that was mis-decoded as cp1252
        fixed = re.sub("’", "’", fixed)
        return fixed
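
    # Example of the substitutions above (hypothetical input): cp1252 quote
    # and dash bytes become proper Unicode punctuation, e.g.
    #
    #     fixChars("\x93Hello\x94 \x96 world")  ->  "“Hello” – world"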

    def massageNCXText(self, description):
        # Pass NCX text through unchanged.
        return description

    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(
                    r'links\\link\d+\\', '', picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            # Fall back to the og:description meta tag when the article
            # body yields no summary text.
            desc = soup.find('meta', attrs={'property': 'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self, soup):
        # Replace every link that does not wrap an image with its plain
        # text content.
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode(
                        'cp1252', 'replace'))
        return soup
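
    # In effect (hypothetical markup):
    #
    #     <p>See <a href="/news/story.html">the story</a></p>
    #     becomes
    #     <p>See the story</p>
    #
    # while image links such as <a><img src="..."/></a> are left alone.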

    def preprocess_html(self, soup):
        byline = soup.find('p', attrs={'class': re.compile('ancillary')})
        if byline is not None:
            authstr = self.tag_to_string(byline, False)
            authstr = re.sub('/ *Times Colonist', '/',
                             authstr, flags=re.IGNORECASE)
            authstr = re.sub('BY */', '', authstr, flags=re.IGNORECASE)
            newdiv = new_tag(soup, 'div')
            newdiv.insert(0, authstr)
            newdiv['class'] = 'byline'
            byline.replaceWith(newdiv)
        for caption in soup.findAll('p', attrs={'class': re.compile('caption')}):
            capstr = self.tag_to_string(caption, False)
            capstr = re.sub('Photograph by.*$', '',
                            capstr, flags=re.IGNORECASE)
            newdiv = new_tag(soup, 'div')
            newdiv.insert(0, capstr)
            newdiv['class'] = 'caption'
            caption.replaceWith(newdiv)
        # Drop paragraphs that contain nothing but whitespace or a bare
        # non-breaking-space entity.
        for ptag in soup.findAll('p'):
            ptext = self.tag_to_string(
                ptag, use_alt=False, normalize_whitespace=True)
            ptext = re.sub(r'\s+', '', ptext)
            if (ptext == '') or (ptext == '&nbsp;'):
                ptag.extract()
        return self.strip_anchors(soup)
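
    # The byline rewriting above, traced on a hypothetical input paragraph:
    #
    #     <p class="ancillary">BY / JANE DOE / Times Colonist</p>
    #     becomes
    #     <div class="byline"> JANE DOE /</div>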

    # Only one Raeside editorial cartoon is kept per build; this flag
    # records whether one has already been added.
    raeside = False

    def handle_articles(self, htag, article_list, sectitle):
        atag = htag.a
        if atag is not None:
            url = atag['href']
            url = url.strip()
            # print("Checking >>" + url + '<<\n\r')
            if url.startswith('/'):
                url = self.url_prefix + url
            if url in self.url_list:
                # Skip articles already collected under another section.
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag, False)
            # Skip multimedia-only items.
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            if 'RAESIDE' in title.upper():
                if self.raeside:
                    return
                self.raeside = True
            dtag = htag.findNext('p')
            description = ''
            if dtag is not None:
                description = self.tag_to_string(dtag, False)
            article_list.append(dict(
                title=title, url=url, date='', description=description, author='', content=''))
            print(sectitle + title + ": description = " +
                  description + " URL=" + url + '\n\r')

    def add_section_index(self, ans, securl, sectitle):
        print("Add section url=" + self.url_prefix + '/' + securl + '\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix + '/' + securl)
        except Exception:
            return ans
        mainsoup = soup.find(
            'div', attrs={'class': re.compile('main.content')})
        article_list = []
        for wdiv in mainsoup.findAll('div', attrs={'id': re.compile('featured.story')}):
            for htag in wdiv.findAll('h3'):
                self.handle_articles(htag, article_list, sectitle)
        for ladiv in mainsoup.findAll(attrs={'class': re.compile('leading.articles')}):
            # Search within each leading-articles block rather than
            # rescanning the whole page.
            for wdiv in ladiv.findAll('div', attrs={'class': re.compile('article.row')}):
                for htag in wdiv.findAll('h2'):
                    self.handle_articles(htag, article_list, sectitle)
        ans.append((sectitle, article_list))
        return ans

    def parse_index(self):
        ans = []
        for (url, title) in self.section_list:
            ans = self.add_section_index(ans, url, title)
        return ans
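
    # parse_index returns the structure BasicNewsRecipe expects: a list of
    # (section_title, articles) tuples, where each article is a dict, e.g.
    #
    #     [('News Headlines', [
    #         {'title': '...', 'url': 'http://www.timescolonist.com/...',
    #          'date': '', 'description': '...', 'author': '', 'content': ''},
    #     ])]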