from __future__ import print_function import re from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe class Nuus24(BasicNewsRecipe): title = 'Nuus24' __author__ = 'Nicki de Wet' encoding = 'utf-8' description = 'Daaglikse Afrikaanse Nuus via Nuus24' language = 'af' publisher = 'Media24' timefmt = ' [%a, %d %b, %Y]' masthead_url = 'http://afrikaans.news24.com/images/nuus.jpg' max_articles_per_feed = 25 remove_tags_before = dict(id='TheFeed') remove_tags_after = dict(id='TheFeed') remove_tags = [dict( attrs={ 'class': [ 'personal-bar row-fluid', 'navbar main-menu-fixed', 'breaking-news-wrapper', 'row-fluid comments-bg', 'unstyled actions', 'modal-body', 'modal-header', 'desktop']}), dict(id=['weather-forecast', 'topics', 'side-widgets', 'footer-container', 'sb-container', 'myModal']), dict(name=['script', 'noscript', 'style'])] keep_only_tags = [dict(attrs={'class': ['span8 border-right']}), dict(name=['article', 'section']), dict(id=['img-wrapper'])] extra_css = """ div.carousel-inner{ overflow:hidden;display: block;height:300px;} img{display: block} """ no_stylesheets = True def parse_index(self): soup = self.index_to_soup('http://afrikaans.news24.com/Index.aspx') def feed_title(div): return ''.join(div.findAll(text=True, recursive=False)).strip() articles = {} key = None key = 'Nuus in Afrikaans' articles[key] = [] ans = [] for anchor in soup.findAll(True, attrs={'id': ['lnkLink']}): url = re.sub(r'\?.*', '', anchor['href']) title = self.tag_to_string(anchor, use_alt=True).strip() print(title) description = '' pubdate = strftime('%a, %d %b') articles[key].append( dict(title=title, url=url, date=pubdate, description=description, content='')) ans = [(key, articles[key])] return ans