From fd251bb00a8a42c776cad1a8cc1a69152f97d7f7 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 4 Mar 2016 09:09:48 +0530
Subject: [PATCH] Update Mediapart

---
 recipes/mediapart.recipe | 160 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 recipes/mediapart.recipe

diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe
new file mode 100644
index 0000000000..086dbd4c99
--- /dev/null
+++ b/recipes/mediapart.recipe
@@ -0,0 +1,160 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert'
+'''
+Mediapart
+'''
+
+import re
+from datetime import date, timedelta
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.web.feeds import feeds_from_index
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class Mediapart(BasicNewsRecipe):
+    title = 'Mediapart'
+    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
+    description = 'Global news in French from the news site Mediapart'
+    publication_type = 'newspaper'
+    language = 'fr'
+    needs_subscription = True
+    oldest_article = 2
+
+    use_embedded_content = False
+    no_stylesheets = True
+
+    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
+
+    # --
+
+    oldest_article_date = date.today() - timedelta(days=oldest_article)
+
+    # -- Get the index ourselves: the feed at
+    # 'http://www.mediapart.fr/articles/feed' only carries the 10 most
+    # recent articles.
+
+    feeds = [
+        ('La Une', 'http://www.mediapart.fr/articles/feed'),
+    ]
+
+    def parse_feeds(self):
+        feeds = super(Mediapart, self).parse_feeds()
+        feeds += feeds_from_index(self.my_parse_index(feeds))
+        return feeds
+
+    def my_parse_index(self, la_une):
+        articles = []
+
+        breves = []
+        liens = []
+        confidentiels = []
+
+        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
+        page = soup.find('div', {'class': 'page-content bust'})
+        fils = page.find('ul', {'class': 'post-list universe-journal'})
+
+        for article in fils.findAll('li'):
+            try:
+                title = article.find('h3', recursive=False)
+
+                if title is None or title['class'] == 'title-specific':
+                    continue
+
+                # print "found fil ", title
+                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
+                # print "kind: ", article_type
+
+                for s in title('span'):
+                    s.replaceWith(s.renderContents() + "\n")
+                url = title.find('a', href=True)['href']
+
+                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
+                # if article_date < self.oldest_article_date:
+                #     print "too old"
+                #     continue
+
+                authors = article.findAll('a', {'class': re.compile(r'\bjournalist\b')})
+                authors = [self.tag_to_string(a) for a in authors]
+
+                # description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
+                # print "fil ", title, " by ", authors, " : ", description
+
+                summary = {
+                    'title': self.tag_to_string(title).strip(),
+                    'author': ', '.join(authors),
+                    'url': url,
+                }
+                {
+                    "Brève": breves,
+                    "Lien": liens,
+                    "Confidentiel": confidentiels,
+                }.get(article_type).append(summary)
+            except Exception:
+                pass
+
+        # print 'La Une: ', len(la_une), ' articles'
+        # for a in la_une: print a["title"]
+        # print 'Brèves: ', len(breves), ' articles'
+        # print 'Revue web: ', len(liens), ' articles'
+        # print 'Confidentiel: ', len(confidentiels), ' articles'
+
+        articles += [('Brèves', breves)] if breves else []
+        articles += [('Revue du Web', liens)] if liens else []
+        articles += [('Confidentiel', confidentiels)] if confidentiels else []
+        return articles
+
+    # -- print-version
+
+    conversion_options = {'smarten_punctuation': True}
+
+    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]
+
+    # Non-locale-specific date parse (strptime("%d %b %Y", s) would work with
+    # a French locale)
+    def parse_french_date(self, date_str):
+        date_arr = date_str.lower().split()
+        return date(day=int(date_arr[0]),
+                    year=int(date_arr[2]),
+                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
+                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
+
+    def print_version(self, url):
+        raw = self.browser.open(url).read()
+        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
+
+        # Filter old articles
+        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
+        # if article_date < self.oldest_article_date:
+        #     return None
+
+        tools = soup.find('li', {'class': 'print'})
+        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
+        if link is None:
+            print('Error: print link not found')
+            return None
+        return 'https://mediapart.fr' + link['href']
+
+    # -- Handle login
+    def get_browser(self):
+        br = BasicNewsRecipe.get_browser(self)
+        if self.username is not None and self.password is not None:
+            br.open('https://www.mediapart.fr/login')
+            br.select_form(nr=1)
+            br['name'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
+
+    # This is a workaround for articles with scribd content that include
+    # <body> tags _within_ the body
+    preprocess_regexps = [
+        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
+         lambda match: match.group(1) + re.sub(
+             re.compile(r'</?body.*?>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
+    ]
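
Note on parse_french_date: it maps a French month name to its 1-based month
number via the name's position in a list (index 0 is a placeholder). A minimal
standalone sketch of that logic, runnable outside calibre; the test string
'4 mars 2016' is a hypothetical example, not taken from the site:

    # -*- coding: utf-8 -*-
    from datetime import date

    def parse_french_date(date_str):
        # index 0 is a placeholder so that 'janvier' maps to month 1
        months = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                  'août', 'septembre', 'octobre', 'novembre', 'décembre']
        day, month, year = date_str.lower().split()
        return date(day=int(day), month=months.index(month), year=int(year))

    print(parse_french_date('4 mars 2016'))  # -> 2016-03-04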