From 7cf205f0c7c138a0d72495c0b0e9c10744edd7a1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 17 Aug 2013 13:25:32 +0530
Subject: [PATCH] News24 and Nuus24 by Nicki de Wet

---
Revised during review. The blob ids on the "index" lines below still refer
to the originally submitted files.
 - nuus24: replace the Python 2 "print title" debug statement with self.log()
 - nuus24: skip matched tags that have no href instead of raising KeyError
 - nuus24: drop the unused feed_title() helper and dead assignments
 - nuus24: import BasicNewsRecipe from calibre.web.feeds.news, as news24 does
 - both: add docstrings/comments and normalise formatting

 recipes/news24.recipe | 63 +++++++++++++++++++++++++++++++++++++++
 recipes/nuus24.recipe | 68 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 recipes/news24.recipe
 create mode 100644 recipes/nuus24.recipe

diff --git a/recipes/news24.recipe b/recipes/news24.recipe
new file mode 100644
index 0000000000..a344a44aec
--- /dev/null
+++ b/recipes/news24.recipe
@@ -0,0 +1,63 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class AdvancedUserRecipe1375900744(BasicNewsRecipe):
+    """Fetch News24 articles from the site's RSS feeds."""
+
+    title = u'News24'
+    description = "News24."
+    __author__ = 'Nicki de Wet'
+    publisher = 'Media24'
+    category = 'news, politics, South Africa'
+    oldest_article = 3
+    max_articles_per_feed = 20
+    no_stylesheets = True
+    encoding = 'utf8'
+    use_embedded_content = False
+    auto_cleanup = False
+    language = 'en_ZA'
+    remove_empty_feeds = True
+    publication_type = 'newsportal'
+    masthead_url = 'http://www.24.com/images/widgethead_news.png'
+    extra_css = """
+        body{font-family: Arial,Helvetica,sans-serif }
+        img{display: block}
+    """
+
+    conversion_options = {
+        'comment': description,
+        'tags': category,
+        'publisher': publisher,
+        'language': language,
+    }
+
+    # Elements stripped from every downloaded article page.
+    remove_tags = [
+        dict(name=['object', 'embed', 'iframe', 'table', 'meta', 'link']),
+        dict(attrs={
+            'class': ['TwitterfacebookLink', 'superSportArticleBlock',
+                      'videoHighlights', 'facebookComments', 'share',
+                      'item_block', 'kalahari_product left', 'block red',
+                      'credit']}),
+        dict(attrs={
+            'id': ['comments_wrap', 'article_toolbox_bot',
+                   'inside_news', 'sponsored-links', 'lnkGalleries',
+                   'relatedlinks_box', 'lnkUserGalleries',
+                   'lnkNewsGalleries', 'relatedlinks',
+                   'divRelatedLinks']}),
+    ]
+
+    # Only elements carrying one of these classes are kept.
+    keep_only_tags = [
+        dict(attrs={
+            'class': ['left col633', 'article col626',
+                      'columnWrapperLeft', 'articlecolumn',
+                      'article_img', 'picture_caption', 'DiveTable']}),
+    ]
+
+    feeds = [
+        (u'Top Stories', u'http://feeds.news24.com/articles/news24/TopStories/rss'),
+        (u'South Africa', u'http://feeds.news24.com/articles/news24/SouthAfrica/rss'),
+        (u'World', u'http://feeds.news24.com/articles/news24/World/rss'),
+        (u'Sport', u'http://feeds.24.com/articles/sport/featured/topstories/rss'),
+    ]
diff --git a/recipes/nuus24.recipe b/recipes/nuus24.recipe
new file mode 100644
index 0000000000..3b80bddb9b
--- /dev/null
+++ b/recipes/nuus24.recipe
@@ -0,0 +1,68 @@
+import re
+
+from calibre import strftime
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class Nuus24(BasicNewsRecipe):
+    """Build a single feed from the links on the Nuus24 index page."""
+
+    title = 'Nuus24'
+    __author__ = 'Nicki de Wet'
+    encoding = 'utf-8'
+    description = 'Daaglikse Afrikaanse Nuus via Nuus24'
+    language = 'af'
+    publisher = 'Media24'
+    timefmt = ' [%a, %d %b, %Y]'
+    masthead_url = 'http://afrikaans.news24.com/images/nuus.jpg'
+    max_articles_per_feed = 25
+    remove_tags_before = dict(id='TheFeed')
+    remove_tags_after = dict(id='TheFeed')
+    remove_tags = [
+        dict(attrs={
+            'class': [
+                'personal-bar row-fluid', 'navbar main-menu-fixed',
+                'breaking-news-wrapper', 'row-fluid comments-bg',
+                'unstyled actions', 'modal-body', 'modal-header', 'desktop']}),
+        dict(id=['weather-forecast', 'topics', 'side-widgets',
+                 'footer-container', 'sb-container', 'myModal']),
+        dict(name=['script', 'noscript', 'style']),
+    ]
+
+    keep_only_tags = [
+        dict(attrs={'class': ['span8 border-right']}),
+        dict(name=['article', 'section']),
+        dict(id=['img-wrapper']),
+    ]
+    extra_css = """ div.carousel-inner{ overflow:hidden;display: block;height:300px;} img{display: block} """
+    no_stylesheets = True
+
+    def parse_index(self):
+        """Scrape the index page and return its article links as one feed.
+
+        Returns a list holding a single ``(feed title, articles)`` tuple,
+        where each article is a dict with ``title``, ``url``, ``date``,
+        ``description`` and ``content`` keys.
+        """
+        soup = self.index_to_soup('http://afrikaans.news24.com/Index.aspx')
+        key = 'Nuus in Afrikaans'
+        # Every entry is stamped with the current date; no date is read
+        # from the page itself.
+        pubdate = strftime('%a, %d %b')
+        articles = []
+
+        for anchor in soup.findAll(True, attrs={'id': ['lnkLink']}):
+            href = anchor.get('href')
+            if not href:
+                # findAll(True, ...) matches any tag; skip ones that are
+                # not links instead of failing on a missing attribute.
+                continue
+            # Drop the query string from the article URL.
+            url = re.sub(r'\?.*', '', href)
+            title = self.tag_to_string(anchor, use_alt=True).strip()
+            self.log('Found article:', title)
+            articles.append(
+                dict(title=title, url=url, date=pubdate,
+                     description='', content=''))
+
+        return [(key, articles)]