diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe index 305bfb3460..094103de6b 100644 --- a/recipes/mediapart.recipe +++ b/recipes/mediapart.recipe @@ -14,10 +14,16 @@ from calibre.web.feeds import feeds_from_index from datetime import date, timedelta +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + class Mediapart(BasicNewsRecipe): title = 'Mediapart' __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert' - description = 'Global news in french from news site Mediapart' + description = 'Global news in French from news site Mediapart' publication_type = 'newspaper' language = 'fr' needs_subscription = True @@ -26,6 +32,15 @@ class Mediapart(BasicNewsRecipe): use_embedded_content = False no_stylesheets = True + keep_only_tags = [ + dict(name='h1'), + dict(name='div', **classes('author')), + classes('introduction content-article') + ] + remove_tags = [ + classes('login-subscribe print-source_url') + ] + cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png' # -- @@ -116,8 +131,6 @@ class Mediapart(BasicNewsRecipe): conversion_options = {'smarten_punctuation': True} - remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})] - # non-locale specific date parse (strptime("%d %b %Y",s) would work with # french locale) def parse_french_date(self, date_str): @@ -127,21 +140,6 @@ class Mediapart(BasicNewsRecipe): month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1])) - def print_version(self, url): - soup = self.index_to_soup(url) - # Filter old articles - # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date'))) - - # if article_date < self.oldest_article_date: - # return None - - tools = soup.find('li', {'class': 'print'}) - link = tools.find('a', {'href': re.compile(r'\/print\/.*')}) - # if link is None: - # print 'Error: print link not found' - # return None - return 'https://mediapart.fr' + link['href'] - # -- Handle login def get_browser(self): def is_form_login(form): @@ -154,12 +152,3 @@ class Mediapart(BasicNewsRecipe): br['password'] = self.password br.submit() return br - - # This is a workaround articles with scribd content that include - # tags _within_ the body - preprocess_regexps = [ - (re.compile(r'()(.*)', re.IGNORECASE | re.DOTALL), - lambda match: - match.group(1) + re.sub( - re.compile(r'', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '') - ]