# calibre/recipes/nuus24.recipe

from __future__ import print_function
import re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


class Nuus24(BasicNewsRecipe):
    """calibre news recipe for Nuus24 (afrikaans.news24.com).

    The recipe defines no RSS feeds; ``parse_index`` scrapes the site's
    index page and returns every article link found there as a single feed.
    """

    # --- Publication metadata -------------------------------------------
    title = 'Nuus24'
    __author__ = 'Nicki de Wet'
    encoding = 'utf-8'
    description = 'Daaglikse Afrikaanse Nuus via Nuus24'
    language = 'af'
    publisher = 'Media24'
    timefmt = ' [%a, %d %b, %Y]'
    masthead_url = 'http://afrikaans.news24.com/images/nuus.jpg'
    max_articles_per_feed = 25

    # --- Article clean-up rules -----------------------------------------
    # Both boundaries point at the same element (id="TheFeed").
    remove_tags_before = dict(id='TheFeed')
    remove_tags_after = dict(id='TheFeed')
    # Elements dropped from the page, selected by class, by id and by tag
    # name respectively.
    remove_tags = [dict(
        attrs={
            'class': [
                'personal-bar row-fluid', 'navbar main-menu-fixed',
                'breaking-news-wrapper', 'row-fluid comments-bg',
                'unstyled actions', 'modal-body', 'modal-header', 'desktop']}),
        dict(id=['weather-forecast', 'topics', 'side-widgets',
                 'footer-container', 'sb-container', 'myModal']),
        dict(name=['script', 'noscript', 'style'])]

    # Elements retained from the page: the main column, article/section
    # tags and the image wrapper.
    keep_only_tags = [dict(attrs={'class': ['span8 border-right']}),
                      dict(name=['article', 'section']),
                      dict(id=['img-wrapper'])]
    extra_css = """ div.carousel-inner{ overflow:hidden;display: block;height:300px;} img{display: block} """
    no_stylesheets = True

    def parse_index(self):
        """Build the article index from the Nuus24 index page.

        Every element on the index page whose ``id`` is ``lnkLink`` is
        treated as a link to one article. The query string is stripped from
        each link, and elements that carry no usable ``href`` are skipped
        instead of aborting the whole download.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where
            ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``date``, ``description`` and ``content``.
        """
        soup = self.index_to_soup('http://afrikaans.news24.com/Index.aspx')

        feed_title = 'Nuus in Afrikaans'
        # The index page exposes no per-article date here, so every entry
        # is stamped with the same formatted date, computed once.
        pubdate = strftime('%a, %d %b')

        articles = []
        for anchor in soup.findAll(True, attrs={'id': ['lnkLink']}):
            href = anchor.get('href')
            if not href:
                # A matched element without a link target cannot be fetched.
                continue
            # Drop the query string so only the bare article URL remains.
            url = re.sub(r'\?.*', '', href)
            if not url:
                continue
            title = self.tag_to_string(anchor, use_alt=True).strip()
            self.log('\tFound article:', title)
            articles.append(
                dict(title=title, url=url, date=pubdate,
                     description='', content=''))

        return [(feed_title, articles)]