diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe
index 4540879f72..a5bc4e96f9 100644
--- a/recipes/mediapart.recipe
+++ b/recipes/mediapart.recipe
@@ -1,16 +1,17 @@
 __license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski ; 2010, Louis Gesbert '
+__copyright__ = '2009, Mathieu Godlewski ; 2010, 2011, Louis Gesbert '
 '''
 Mediapart
 '''
 
-from calibre.ebooks.BeautifulSoup import Tag
+import re
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski'
-    description = 'Global news in french from online newspapers'
+    __author__ = 'Mathieu Godlewski, Louis Gesbert'
+    description = 'Global news in french from news site Mediapart'
     oldest_article = 7
     language = 'fr'
     needs_subscription = True
@@ -18,52 +19,30 @@ class Mediapart(BasicNewsRecipe):
     max_articles_per_feed = 50
     no_stylesheets = True
 
-    cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'
+    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
 
     feeds = [
         ('Les articles', 'http://www.mediapart.fr/articles/feed'),
     ]
 
-# -- print-version has poor quality on this website, better do the conversion ourselves
-#
-#    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
-#        [
-#            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
-#            (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
-#             lambda match : '<i>'+match.group(1)+'</i>'),
-#            (r'\'', lambda match: '&rsquo;'),
-#        ]
-#    ]
-#
-#    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}),
-#                    dict(name='div', attrs={'class':'print-links'}),
-#                    dict(name='img', attrs={'src':'entete_article.png'}),
-#                    dict(name='br') ]
-#
-#    def print_version(self, url):
-#        raw = self.browser.open(url).read()
-#        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-#        div = soup.find('div', {'id':re.compile('node-\d+')})
-#        if div is None:
-#            return None
-#        article_id = string.replace(div['id'], 'node-', '')
-#        if article_id is None:
-#            return None
-#        return 'http://www.mediapart.fr/print/'+article_id
+# -- print-version
 
-# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
-
-    keep_only_tags = [
-        dict(name='h1', attrs={'class':'title'}),
-        dict(name='div', attrs={'class':'page_papier_detail'}),
-    ]
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
+        [
+            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
+            (r'\'', lambda match: '&rsquo;') ]
+    ]
 
-    def preprocess_html(self,soup):
-        for title in soup.findAll('div', {'class':'titre'}):
-            tag = Tag(soup, 'h3')
-            title.replaceWith(tag)
-            tag.insert(0,title)
-        return soup
+    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]
+
+    def print_version(self, url):
+        raw = self.browser.open(url).read()
+        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
+        link = soup.find('a', {'title':'Imprimer'})
+        if link is None:
+            return None
+        return link['href']
 
 # -- Handle login
 
@@ -76,4 +55,3 @@ class Mediapart(BasicNewsRecipe):
             br['pass'] = self.password
             br.submit()
         return br
-