From 7a4f106f3d46ae349fe3d2b322576ae054c22489 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 25 Mar 2014 16:47:51 +0530 Subject: [PATCH] Update Courrier International --- recipes/courrierinternational.recipe | 58 ++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/recipes/courrierinternational.recipe b/recipes/courrierinternational.recipe index 0ad471b9ba..aaea5c8995 100644 --- a/recipes/courrierinternational.recipe +++ b/recipes/courrierinternational.recipe @@ -19,23 +19,57 @@ class CourrierInternational(BasicNewsRecipe): max_articles_per_feed = 50 no_stylesheets = True + ignore_duplicate_articles = {'title', 'url'} + html2lrf_options = ['--base-font-size', '10'] + keep_only_tags = [ + dict(name='div', attrs={'class':'dessin'}), + dict(name='div', attrs={'class':'story-content'}), + ] + remove_tags = [ + dict(name='div', attrs={'class':re.compile('story-share storylinks|pager|event-expand')}), + dict(name='li', attrs={'class':'event-partage_outils'}), + dict(name='li', attrs={'class':'story-comment-link'}), + ] + + needs_subscription = "optional" + login_url = 'http://www.courrierinternational.com/login' + + def get_browser(self): + br = BasicNewsRecipe.get_browser(self) + if self.username: + br.open(self.login_url) + br.select_form(nr=1) + br['name'] = self.username + br['pass'] = self.password + br.submit() + return br + + def preprocess_html(self, soup): + for link in soup.findAll("a",href=re.compile('^/(notule|sources|comment)')): + link["href"]='http://www.courrierinternational.com' + link["href"] + return soup + feeds = [ # Some articles requiring subscription fails on download. ('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'), + ('France', 'http://courrierint.com/rss/rp/14/0/rss.xml'), + ('Europe', 'http://courrierint.com/rss/rp/15/0/rss.xml'), + ('Amerique', 'http://courrierint.com/rss/rp/16/0/rss.xml'), + ('Asie', 'http://courrierint.com/rss/rp/17/0/rss.xml'), + ('Afrique', 'http://courrierint.com/rss/rp/18/0/rss.xml'), + ('Moyen-Orient', 'http://courrierint.com/rss/rp/19/0/rss.xml'), + ('Economie', 'http://courrierint.com/rss/rp/20/0/rss.xml'), + ('Multimedia', 'http://courrierint.com/rss/rp/23/0/rss.xml'), + ('Sciences', 'http://courrierint.com/rss/rp/22/0/rss.xml'), + ('Culture', 'http://courrierint.com/rss/rp/24/0/rss.xml'), + ('Insolites', 'http://courrierint.com/rss/rp/26/0/rss.xml'), + ('Cartoons', 'http://cs.courrierint.com/rss/all/rss.xml'), + ('Environnement', 'http://vt.courrierint.com/rss/all/rss.xml'), + ('Cinema', 'http://ca.courrierint.com/rss/all/rss.xml'), + ('Sport', 'http://st.courrierint.com/rss/all/rss.xml'), ] - preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in - [ - #Handle Depeches - (r'.*]*>([0-9][0-9]/.*

).*', lambda match : '
'+match.group(1)+'
'), - #Handle Articles - (r'.*]*>(Courrier international.*?) .*', lambda match : '
'+match.group(1)+''), - ] - ] - - def print_version(self, url): - return re.sub('/[a-zA-Z]+\.asp','/imprimer.asp' ,url) - + return url + '?page=all'