diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe
index 094103de6b..5bc840e056 100644
--- a/recipes/mediapart.recipe
+++ b/recipes/mediapart.recipe
@@ -3,26 +3,28 @@ from __future__ import unicode_literals
 
 __license__ = 'GPL v3'
-__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert '  # noqa
+__copyright__ = '2021, Loïc Houpert. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert'  # noqa
 
 '''
 Mediapart
 '''
 
 import re
-from calibre.web.feeds.news import BasicNewsRecipe
+from datetime import date, datetime, timedelta
+
 from calibre.web.feeds import feeds_from_index
-from datetime import date, timedelta
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
 def classes(classes):
     q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+    return dict(
+        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
+    )
 
 
 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Loïc Houpert'
     description = 'Global news in French from news site Mediapart'
     publication_type = 'newspaper'
     language = 'fr'
@@ -37,113 +39,191 @@ class Mediapart(BasicNewsRecipe):
         dict(name='div', **classes('author')),
         classes('introduction content-article')
     ]
-    remove_tags = [
-        classes('login-subscribe print-source_url')
-    ]
+    remove_tags = [classes('login-subscribe print-source_url')]
+    conversion_options = {'smarten_punctuation': True}
 
     cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
 
-# --
+    # --
 
-    oldest_article_date = date.today() - timedelta(days=oldest_article)
-
-# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
-# the 10 last elements :/)
+    oldest_article_date = datetime.today() - timedelta(days=oldest_article)
 
     feeds = [
         ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]
 
+    # The feed at 'http://www.mediapart.fr/articles/feed' only provides the 10
+    # most recent entries, so the articles are indexed from the section pages
+    # in the function my_parse_index.
+    # In this function the articles are parsed using the function get_articles
+    # and the dict values in dict_article_sources.
+
     def parse_feeds(self):
         feeds = super(Mediapart, self).parse_feeds()
         feeds += feeds_from_index(self.my_parse_index(feeds))
         return feeds
 
     def my_parse_index(self, la_une):
+
+        dict_article_sources = [
+            {
+                'type': 'Brèves',
+                'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
+                'separador': {
+                    'page': 'ul',
+                    'thread': 'li'
+                }
+            },
+            {
+                'type': 'International',
+                'webpage': 'https://www.mediapart.fr/journal/international',
+                'separador': {
+                    'page': 'div',
+                    'thread': 'div'
+                }
+            },
+            {
+                'type': 'France',
+                'webpage': 'https://www.mediapart.fr/journal/france',
+                'separador': {
+                    'page': 'div',
+                    'thread': 'div'
+                }
+            },
+            {
+                'type': 'Économie',
+                'webpage': 'https://www.mediapart.fr/journal/economie',
+                'separador': {
+                    'page': 'div',
+                    'thread': 'div'
+                }
+            },
+            {
+                'type': 'Culture',
+                'webpage': 'https://www.mediapart.fr/journal/culture-idees',
+                'separador': {
+                    'page': 'div',
+                    'thread': 'div'
+                }
+            },
+        ]
+
+        def get_articles(
+            type_of_article, webpage, separador_page='ul', separador_thread='li'
+        ):
+
+            specific_articles = []
+
+            webpage_article = []
+            soup = self.index_to_soup(webpage)
+            page = soup.find('main', {'class': 'global-wrapper'})
+            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
+
+            all_articles = fils.findAll(separador_thread)
+            for article in all_articles:
+                try:
+                    title = article.find('h3', recursive=False)
+                    if title is None or ''.join(title['class']) == 'title-specific':
+                        # print(f"[BAD title entry] Print value of title:\n {title}")
+                        continue
+                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")
+
+                    try:
+                        article_mot_cle = article.find(
+                            'a', {
+                                'href': re.compile(r'.*\/mot-cle\/.*')
+                            }
+                        ).renderContents().decode('utf-8')
+                    except Exception:
+                        article_mot_cle = ''
+
+                    try:
+                        article_type = article.find(
+                            'a', {
+                                'href': re.compile(r'.*\/type-darticles\/.*')
+                            }
+                        ).renderContents().decode('utf-8')
+                    except Exception:
+                        article_type = ''
+
+                    for s in title('span'):
+                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
+                    url = title.find('a', href=True)['href']
+
+                    date_str = article.find('time', datetime=True)['datetime']
+                    article_date = datetime.strptime(date_str, '%Y-%m-%d')
+                    if article_date < self.oldest_article_date:
+                        print("article_date < self.oldest_article_date\n")
+                        continue
+
+                    # print("-------- Recent article added to the list ------- \n")
+                    all_authors = article.findAll(
+                        'a', {'class': re.compile(r'\bjournalist\b')}
+                    )
+                    authors = [self.tag_to_string(a) for a in all_authors]
+                    # print(f"Authors in <a> tag : {authors}")
+
+                    # If no link to the author profile is available, the
+                    # html separador is a span tag
+                    if not all_authors:
+                        try:
+                            all_authors = article.findAll(
+                                'span', {'class': re.compile(r'\bjournalist\b')}
+                            )
+                            authors = [self.tag_to_string(a) for a in all_authors]
+                            # print(f"Authors in <span> tag : {authors}")
+                        except Exception:
+                            authors = ['unknown']
+
+                    description = article.find('p').renderContents().decode('utf-8')
+                    # print(f"<p> in article : {self.tag_to_string(description).strip()} ")
+
+                    summary = {
+                        'title': self.tag_to_string(title).strip(),
+                        'description': description,
+                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
+                        'author': ', '.join(authors),
+                        'article_type': article_type,
+                        'mot_cle': article_mot_cle.capitalize(),
+                        'url': 'https://www.mediapart.fr' + url,
+                    }
+
+                    webpage_article.append(summary)
+                except Exception:
+                    pass
+
+            specific_articles += [(type_of_article,
+                                   webpage_article)] if webpage_article else []
+            return specific_articles
+
         articles = []
-        breves = []
-        liens = []
-        confidentiels = []
+        for category in dict_article_sources:
+            articles += get_articles(
+                category['type'], category['webpage'], category['separador']['page'],
+                category['separador']['thread']
+            )
 
-        soup = self.index_to_soup(
-            'https://www.mediapart.fr/journal/fil-dactualites')
-        page = soup.find('main', {'class': 'global-wrapper'})
-        fils = page.find('ul', {'class': 'post-list universe-journal'})
-
-        for article in fils.findAll('li'):
-            try:
-                title = article.find('h3', recursive=False)
-
-                if title is None or ''.join(title['class']) == 'title-specific':
-                    continue
-
-                # print "found fil ",title
-                article_type = article.find('a', {'href': re.compile(
-                    r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
-                # print "kind: ",article_type
-
-                for s in title('span'):
-                    s.replaceWith(s.renderContents().decode('utf-8') + "\n")
-                url = title.find('a', href=True)['href']
-
-                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
-                # print("################################# 9")
-                # print(article_date)
-
-                # if article_date < self.oldest_article_date:
-                #     print "too old"
-                #     continue
-
-                authors = article.findAll(
-                    'a', {'class': re.compile(r'\bjournalist\b')})
-                authors = [self.tag_to_string(a) for a in authors]
-
-                # description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
-
-                # print "fil ",title," by ",authors," : ",description
-
-                summary = {
-                    'title': self.tag_to_string(title).strip(),
-                    'author': ', '.join(authors),
-                    'url': 'https://www.mediapart.fr' + url
-                }
-                if article_type == 'Lien':
-                    liens.append(summary)
-                if article_type == 'Confidentiel':
-                    confidentiels.append(summary)
-                if article_type not in ['Lien', 'Confidentiel']:
-                    breves.append(summary)
-            except:
-                pass
-
-        # print 'La Une: ', len(la_une), ' articles'
-        # for a in la_une: print a["title"]
-        # print 'Brèves: ', len(breves), ' articles'
-        # print 'Revue web: ', len(liens), ' articles'
-        # print 'Confidentiel: ', len(confidentiels), ' articles'
-
-        articles += [('Brèves', breves)] if breves else []
-        articles += [('Revue du Web', liens)] if liens else []
-        articles += [('Confidentiel', confidentiels)] if confidentiels else []
         return articles
 
-# -- print-version
-
-    conversion_options = {'smarten_punctuation': True}
-
     # non-locale specific date parse (strptime("%d %b %Y",s) would work with
     # french locale)
     def parse_french_date(self, date_str):
         date_arr = date_str.lower().split()
-        return date(day=int(date_arr[0]),
-                    year=int(date_arr[2]),
-                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
-                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
+        return date(
+            day=int(date_arr[0]),
+            year=int(date_arr[2]),
+            month=[
+                None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
+                'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
+            ].index(date_arr[1])
+        )
 
-# -- Handle login
 
     def get_browser(self):
+        # -- Handle login
+
         def is_form_login(form):
             return "id" in form.attrs and form.attrs['id'] == "logFormEl"
+
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
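
Reviewer note: a quick sketch (not part of the patch) of the matcher that the
reformatted classes() helper builds. The lambda receives a tag's class string
and is truthy when any requested class is present:

    # standalone check of the intersection logic used by classes()
    q = frozenset('login-subscribe print-source_url'.split(' '))
    match = lambda x: x and frozenset(x.split()).intersection(q)
    print(bool(match('print-source_url footer')))  # True: one class matches
    print(bool(match('article-body')))             # False: empty intersection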
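Reviewer note: the switch from date.today() to datetime.today() for
oldest_article_date matters because get_articles compares it against the
result of datetime.strptime(), and Python 3 raises TypeError when ordering a
datetime against a date. A minimal demonstration:

    from datetime import date, datetime, timedelta

    article_date = datetime.strptime('2021-01-15', '%Y-%m-%d')
    # article_date < date.today() - timedelta(days=30)    # TypeError in Python 3
    article_date < datetime.today() - timedelta(days=30)  # both datetimes: OK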
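Reviewer note: parse_french_date() stays locale-independent by using the list
position as the month number; the leading None shifts 'janvier' to index 1:

    months = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
              'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']
    months.index('juillet')  # -> 7, so '14 juillet 2021' yields date(2021, 7, 14)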