__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic '
'''
english.aljazeera.net
'''
from calibre.web.feeds.news import BasicNewsRecipe


def has_cls(x):
    """Build a tag-matching spec for elements carrying the CSS class *x*.

    Returns a dict with a single ``attrs`` key whose ``class`` matcher is a
    callable: it is truthy when the class attribute value is non-empty and
    *x* is one of its whitespace-separated tokens.
    """
    return dict(attrs={'class': lambda cls: cls and x in cls.split()})


class AlJazeera(BasicNewsRecipe):
    """Recipe for the Al Jazeera English RSS feed."""

    # Recipe metadata.
    title = 'Al Jazeera in English'
    __author__ = 'Darko Miletic'
    description = 'News from Middle East'
    language = 'en'
    publisher = 'Al Jazeera'
    category = 'news, politics, middle east'

    # Download settings; semantics are defined by BasicNewsRecipe
    # (NOTE(review): see the calibre recipe API for units and defaults).
    delay = 1
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # Adjacent literals keep the stylesheet value identical to the original
    # string while avoiding one very long source line.
    extra_css = (
        ' body{font-family: Arial,sans-serif}'
        ' #ctl00_cphBody_dvSummary{font-weight: bold}'
        ' #dvArticleDate{font-size: small; color: #999999} '
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only the element with id "main-story" is kept from each page.
    keep_only_tags = [
        dict(id='main-story'),
    ]

    # Elements dropped from the kept content.
    remove_tags = [
        has_cls('MoreOnTheStory'),
        has_cls('ArticleBottomToolbar'),
        dict(smtitle="ShowMore"),
        dict(name=['object', 'link', 'table', 'meta', 'base', 'iframe', 'embed']),
    ]

    feeds = [
        (u'Al Jazeera English',
         u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989'),
    ]

    def get_article_url(self, article):
        """Return the article link with the doubled slash after the host fixed.

        *article* is a mapping-like feed entry (it is only accessed through
        ``.get('link')``). When the entry has no ``link`` value, ``None`` is
        returned instead of raising ``AttributeError``.
        """
        link = article.get('link', None)
        if link is None:
            # No link in the feed entry: nothing to normalize.
            return None
        return link.replace(
            'http://english.aljazeera.net//',
            'http://english.aljazeera.net/',
        )

    def preprocess_html(self, soup):
        """Clean up the parsed article page and return the same *soup*.

        Steps, in order:
        - drop every inline ``style`` attribute and every ``face`` attribute;
        - rename the first ``td.DetailedSummary`` to ``div``;
        - rename the first ``span#DetailedTitle`` to ``h1``;
        - rename the article-date spans to ``div``;
        - replace each ``<a>`` whose ``.string`` is set with that string,
          i.e. unlink text-only anchors.
        """
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(face=True):
            del item['face']

        td = soup.find('td', attrs={'class': 'DetailedSummary'})
        if td:
            td.name = 'div'

        spn = soup.find('span', attrs={'id': 'DetailedTitle'})
        if spn:
            spn.name = 'h1'

        date_ids = ['dvArticleDate', 'ctl00_cphBody_lblDate']
        for itm in soup.findAll('span', attrs={'id': date_ids}):
            itm.name = 'div'

        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup