From 32a66a5e2899dffd65b1cda2be4938c22e0711f8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 7 Mar 2009 09:30:14 -0800 Subject: [PATCH] New recipe for Mediapart.fr by Mathieu Godlewski --- src/calibre/web/feeds/recipes/__init__.py | 2 +- .../web/feeds/recipes/recipe_mediapart.py | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 src/calibre/web/feeds/recipes/recipe_mediapart.py diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 6eb24e162b..a513f34728 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -33,7 +33,7 @@ recipe_modules = ['recipe_' + r for r in ( 'la_republica', 'physics_today', 'chicago_tribune', 'e_novine', 'al_jazeera', 'winsupersite', 'borba', 'courrierinternational', 'lamujerdemivida', 'soldiers', 'theonion', 'news_times', - 'el_universal', + 'el_universal', 'mediapart', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_mediapart.py b/src/calibre/web/feeds/recipes/recipe_mediapart.py new file mode 100644 index 0000000000..60dc893834 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_mediapart.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = '2009, Mathieu Godlewski ' +''' +Mediapart +''' + +import re, string +from datetime import date +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.news import BasicNewsRecipe + +class Mediapart(BasicNewsRecipe): + title = 'Mediapart' + __author__ = 'Mathieu Godlewski ' + description = 'Global news in french from online newspapers' + oldest_article = 7 + language = _('French') + max_articles_per_feed = 50 + no_stylesheets = True + + html2lrf_options = ['--base-font-size', '10'] + + feeds = [ + ('Les articles', 'http://www.mediapart.fr/articles/feed'), + ] + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in + [ + (r'', lambda match : '

'+match.group(1)+'

'), + (r'

Mediapart\.fr

', lambda match : ''), + (r']*>[\s]*

', lambda match : ''), + (r'

[^>]*

', lambda match : ''), + ] + ] + + remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}), + dict(name='div', attrs={'class':'print-links'}), + dict(name='img', attrs={'src':'entete_article.png'}), + ] + + + def print_version(self, url): + raw = self.browser.open(url).read() + soup = BeautifulSoup(raw.decode('utf8', 'replace')) + div = soup.find('div', {'class':'node node-type-article'}) + if div is None: + return None + article_id = string.replace(div['id'], 'node-', '') + if article_id is None: + return None + return 'http://www.mediapart.fr/print/'+article_id