#!/usr/bin/env python
# vim:fileencoding=utf-8

__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic '
''' english.aljazeera.net '''

from calibre.web.feeds.news import BasicNewsRecipe, classes


class AlJazeera(BasicNewsRecipe):
    """Calibre news recipe that builds an e-book from the Al Jazeera English RSS feed."""

    # --- Metadata ---------------------------------------------------------
    title = 'Al Jazeera in English'
    __author__ = 'Darko Miletic'
    description = 'News from Middle East'
    language = 'en'
    publisher = 'Al Jazeera'
    category = 'news, politics, middle east'

    # --- Download behaviour -----------------------------------------------
    delay = 1
    oldest_article = 2  # in days; may be overridden per run in __init__
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # --- Output styling / conversion --------------------------------------
    extra_css = ''' body{font-family: Arial,sans-serif} '''

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Article clean-up rules -------------------------------------------
    # NOTE(review): `classes` is a calibre helper; presumably it selects
    # elements carrying any of the listed CSS classes -- confirm against
    # the calibre recipe API documentation.
    keep_only_tags = [
        classes('article-header article-featured-image wysiwyg--all-content'),
    ]

    remove_tags = [
        dict(name=['object', 'link', 'table', 'meta', 'base', 'iframe', 'embed']),
    ]

    # --- User-tunable options ---------------------------------------------
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
    }

    feeds = [('Al Jazeera English', 'http://www.aljazeera.com/xml/rss/all.xml')]

    def __init__(self, *args, **kwargs):
        """Initialise the recipe, then apply the 'days' option if one is set.

        All arguments are forwarded unchanged to ``BasicNewsRecipe.__init__``.
        Afterwards the ``'days'`` entry of ``self.recipe_specific_options`` is
        looked up; when it is a non-empty string it is converted with
        ``float`` and stored as ``self.oldest_article``.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # NOTE(review): presumably the base initialiser has replaced the
        # class-level option descriptions with plain string values by now.
        days_option = self.recipe_specific_options.get('days')
        if isinstance(days_option, str) and days_option:
            self.oldest_article = float(days_option)

    def get_article_url(self, article):
        """Return the feed entry's ``'link'`` value, or ``None`` if it has none."""
        return article.get('link')

    def preprocess_html(self, soup):
        """Strip inline presentation attributes and unwrap plain-text links.

        Removes every ``style`` attribute, then every ``face`` attribute, and
        finally replaces each ``<a>`` element whose ``.string`` is not
        ``None`` with that string. Returns the same ``soup`` object.
        """
        # Same order as before: all `style` attributes first, then `face`.
        for attribute in ('style', 'face'):
            for tag in soup.findAll(**{attribute: True}):
                del tag[attribute]

        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                # Anchor without a usable `.string` is left untouched.
                continue
            anchor.replaceWith(link_text)

        return soup