# calibre/recipes/saskatoon_star_phoenix.recipe

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'

'''
www.canada.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    """Recipe settings for one newspaper of the CanWest family.

    Only four settings differ between papers: ``title``, ``url_prefix``,
    ``description`` and ``fp_tag``.  The Saskatoon Star-Phoenix group is the
    active one; the groups for the other papers are kept as comments.
    """

    # To build this recipe for another paper, comment out the active group
    # and un-comment the four lines of the wanted group.
    #
    # Victoria Times Colonist:
    #     title = u'Victoria Times Colonist'
    #     url_prefix = 'http://www.timescolonist.com'
    #     description = u'News from Victoria, BC'
    #     fp_tag = 'CAN_TC'
    #
    # Vancouver Province:
    #     title = u'Vancouver Province'
    #     url_prefix = 'http://www.theprovince.com'
    #     description = u'News from Vancouver, BC'
    #     fp_tag = 'CAN_VP'
    #
    # Vancouver Sun:
    #     title = u'Vancouver Sun'
    #     url_prefix = 'http://www.vancouversun.com'
    #     description = u'News from Vancouver, BC'
    #     fp_tag = 'CAN_VS'
    #
    # Edmonton Journal:
    #     title = u'Edmonton Journal'
    #     url_prefix = 'http://www.edmontonjournal.com'
    #     description = u'News from Edmonton, AB'
    #     fp_tag = 'CAN_EJ'
    #
    # Calgary Herald:
    #     title = u'Calgary Herald'
    #     url_prefix = 'http://www.calgaryherald.com'
    #     description = u'News from Calgary, AB'
    #     fp_tag = 'CAN_CH'
    #
    # Regina Leader-Post:
    #     title = u'Regina Leader-Post'
    #     url_prefix = 'http://www.leaderpost.com'
    #     description = u'News from Regina, SK'
    #     fp_tag = ''
    #
    # Windsor Star:
    #     title = u'Windsor Star'
    #     url_prefix = 'http://www.windsorstar.com'
    #     description = u'News from Windsor, ON'
    #     fp_tag = 'CAN_'
    #     NOTE(review): 'CAN_' looks truncated -- confirm before enabling.
    #
    # Ottawa Citizen:
    #     title = u'Ottawa Citizen'
    #     url_prefix = 'http://www.ottawacitizen.com'
    #     description = u'News from Ottawa, ON'
    #     fp_tag = 'CAN_OC'
    #
    # Montreal Gazette:
    #     title = u'Montreal Gazette'
    #     url_prefix = 'http://www.montrealgazette.com'
    #     description = u'News from Montreal, QC'
    #     fp_tag = 'CAN_MG'

    # Active group: Saskatoon Star-Phoenix.
    title = u'Saskatoon Star-Phoenix'
    url_prefix = 'http://www.thestarphoenix.com'
    description = u'News from Saskatoon, SK'
    # An empty tag makes get_cover_url() return None without any lookup.
    fp_tag = ''

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and the story body are kept from each page.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'storyheader'}},
        {'name': 'div', 'attrs': {'id': 'storycontent'}},
    ]

    # Page furniture dropped from the kept content.
    remove_tags = [
        {'class': 'comments'},
        {'name': 'div', 'attrs': {'class': 'navbar'}},
        {'name': 'div', 'attrs': {'class': 'morelinks'}},
        {'name': 'div', 'attrs': {'class': 'viewmore'}},
        {'name': 'li', 'attrs': {'class': 'email'}},
        {'name': 'div', 'attrs': {'class': 'story_tool_hr'}},
        {'name': 'div', 'attrs': {'class': 'clear'}},
        {'name': 'div', 'attrs': {'class': 'story_tool'}},
        {'name': 'div', 'attrs': {'class': 'copyright'}},
        {'name': 'div', 'attrs': {'class': 'rule_grey_solid'}},
        {'name': 'li', 'attrs': {'class': 'print'}},
        {'name': 'li', 'attrs': {'class': 'share'}},
        {'name': 'ul', 'attrs': {'class': 'bullet'}},
    ]

    def get_cover_url(self):
        from datetime import timedelta, date
        if self.fp_tag == '':
            return None
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
            str(date.today().day) + '/lg/' + self.fp_tag + '.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback = 1
        try:
            br.open(cover)
        except:
            while daysback < 7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg' + \
                    str((date.today() - timedelta(days=daysback)).day) + \
                    '/lg/' + self.fp_tag + '.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback + 1
                    continue
                break
        if daysback == 7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "‘", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "’", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "“", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "”", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "–", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "—", fixed)
        fixed = re.sub("&#x2019;", "’", fixed)
        return fixed

    def massageNCXText(self, description):
        """Return *description* unchanged.

        NOTE(review): presumably a hook for cleaning table-of-contents text;
        nothing in this file calls it -- confirm before removing.
        """
        return description

    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, re.sub(
                    r'links\\link\d+\\', '', picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta', attrs={'property': 'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self, soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode(
                        'cp1252', 'replace'))
        return soup

    def preprocess_html(self, soup):
        """Return *soup* after passing it through ``strip_anchors``."""
        return self.strip_anchors(soup)

    def parse_index(self):
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if ''.join(divtag['class']).startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        ans = [(k, articles[k]) for k in ans if k in articles]
        return ans