From d2b427c3f93ac7ee7a8194c498a7b309b596972b Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Nov 2010 08:24:12 -0700 Subject: [PATCH] Fix #7583 (Updated recipe for Al Jazeera in English) --- resources/recipes/al_jazeera.recipe | 58 ++++++++++++++++++----------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/resources/recipes/al_jazeera.recipe b/resources/recipes/al_jazeera.recipe index fd5f07973d..133a793191 100644 --- a/resources/recipes/al_jazeera.recipe +++ b/resources/recipes/al_jazeera.recipe @@ -1,10 +1,8 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2010, Darko Miletic ' ''' -aljazeera.net +english.aljazeera.net ''' from calibre.web.feeds.news import BasicNewsRecipe @@ -12,41 +10,59 @@ class AlJazeera(BasicNewsRecipe): title = 'Al Jazeera in English' __author__ = 'Darko Miletic' description = 'News from Middle East' - language = 'en' - + language = 'en' publisher = 'Al Jazeera' category = 'news, politics, middle east' - simultaneous_downloads = 1 - delay = 4 - oldest_article = 1 + delay = 1 + oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True encoding = 'iso-8859-1' - remove_javascript = True use_embedded_content = False + extra_css = """ + body{font-family: Arial,sans-serif} + #ctl00_cphBody_dvSummary{font-weight: bold} + #dvArticleDate{font-size: small; color: #999999} + """ + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + } - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - , '--ignore-tables' - ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_table=True' - - keep_only_tags = [dict(name='div', attrs={'id':'ctl00_divContent'})] + keep_only_tags = [ + dict(attrs={'id':['DetailedTitle','ctl00_cphBody_dvSummary','dvArticleDate']}) + ,dict(name='td',attrs={'class':'DetailedSummary'}) + ] remove_tags = [ - dict(name=['object','link']) + dict(name=['object','link','table','meta','base','iframe','embed']) ,dict(name='td', attrs={'class':['MostActiveDescHeader','MostActiveDescBody']}) ] feeds = [(u'AL JAZEERA ENGLISH (AJE)', u'http://english.aljazeera.net/Services/Rss/?PostingId=2007731105943979989' )] + def get_article_url(self, article): + artlurl = article.get('link', None) + return artlurl.replace('http://english.aljazeera.net//','http://english.aljazeera.net/') + def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for item in soup.findAll(face=True): del item['face'] + td = soup.find('td',attrs={'class':'DetailedSummary'}) + if td: + td.name = 'div' + spn = soup.find('span',attrs={'id':'DetailedTitle'}) + if spn: + spn.name='h1' + for itm in soup.findAll('span', attrs={'id':['dvArticleDate','ctl00_cphBody_lblDate']}): + itm.name = 'div' + for alink in soup.findAll('a'): + if alink.string is not None: + tstr = alink.string + alink.replaceWith(tstr) return soup