diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe
index 094103de6b..dbfed75476 100644
--- a/recipes/mediapart.recipe
+++ b/recipes/mediapart.recipe
@@ -3,15 +3,17 @@ from __future__ import unicode_literals
 __license__ = 'GPL v3'
-__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert ' # noqa
+__copyright__ = '2021, Loïc Houpert . Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
 '''
 Mediapart
 '''
+import sys
+
 import re

 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.web.feeds import feeds_from_index
-from datetime import date, timedelta
+from datetime import datetime, timedelta


 def classes(classes):
@@ -22,7 +24,7 @@ def classes(classes):

 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Loïc Houpert (adapted from a version by Daniel Bonnery, Mathieu Godlewski and Louis Gesbert)'
     description = 'Global news in French from news site Mediapart'
     publication_type = 'newspaper'
     language = 'fr'
@@ -45,89 +47,157 @@ class Mediapart(BasicNewsRecipe):

     # --

-    oldest_article_date = date.today() - timedelta(days=oldest_article)
+    oldest_article_date = datetime.today() - timedelta(days=oldest_article)

-# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
-# the 10 last elements :/)
     feeds = [
         ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]

+    # The feed at 'http://www.mediapart.fr/articles/feed' only has the 10 most
+    # recent elements, so the articles are indexed from the section pages in
+    # the function my_parse_index. There the articles are parsed by the
+    # function get_articles, driven by the entries of dict_article_sources.
+
+
     def parse_feeds(self):
         feeds = super(Mediapart, self).parse_feeds()
         feeds += feeds_from_index(self.my_parse_index(feeds))
+        print("\n======================================================" +
+              "======================================================\n")
+        print("======================================================" +
+              "======================================================\n")
+        print(f" List of feeds: {feeds}")
+        # sys.exit("sys.exit for debug")
         return feeds

     def my_parse_index(self, la_une):
+
+        dict_article_sources = [
+            {'type': 'Brèves',
+             'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
+             'separador': {'page': 'ul', 'thread': 'li'}
+             },
+            {'type': 'International',
+             'webpage': 'https://www.mediapart.fr/journal/international',
+             'separador': {'page': 'div', 'thread': 'div'}
+             },
+            {'type': 'France',
+             'webpage': 'https://www.mediapart.fr/journal/france',
+             'separador': {'page': 'div', 'thread': 'div'}
+             },
+            {'type': 'Économie',
+             'webpage': 'https://www.mediapart.fr/journal/economie',
+             'separador': {'page': 'div', 'thread': 'div'}
+             },
+            {'type': 'Culture',
+             'webpage': 'https://www.mediapart.fr/journal/culture-idees',
+             'separador': {'page': 'div', 'thread': 'div'}
+             },
+        ]
+
+        def get_articles(type_of_article, webpage,
+                         separador_page='ul', separador_thread='li'):
+
+            print("\n======================================================" +
+                  "======================================================")
+            print(f"[Type of Article]: {type_of_article}")
+            print(f"[Webpage]: {webpage}")
+            print("\n======================================================" +
+                  "======================================================\n")
+
+            specific_articles = []
+
+            webpage_article = []
+            soup = self.index_to_soup(webpage)
+
+            page = soup.find('main', {'class': 'global-wrapper'})
+            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
+
+            # print(f"Print value of fils.findAll('li'):\n {fils.findAll('li')} ")
+            all_articles = fils.findAll(separador_thread)
+            # print(soup.prettify())
+            for article in all_articles:
+                try:
+                    title = article.find('h3', recursive=False)
+                    if title is None or ''.join(title['class']) == 'title-specific':
+                        # print(f"[BAD title entry] Print value of title:\n {title}")
+                        continue
+                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")
+
+                    try:
+                        article_mot_cle = article.find('a', {'href': re.compile(
+                            r'.*\/mot-cle\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_mot_cle = ''
+
+                    try:
+                        article_type = article.find('a', {'href': re.compile(
+                            r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_type = ''
+
+                    # print(f"Article Type:\n {article_type}\n")
+
+                    for s in title('span'):
+                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
+                    url = title.find('a', href=True)['href']
+
+                    date = article.find('time', datetime=True)['datetime']
+                    article_date = datetime.strptime(date, '%Y-%m-%d')
+                    if article_date < self.oldest_article_date:
+                        print("article_date < self.oldest_article_date\n")
+                        continue
+
+                    # print("-------- Recent article added to the list ------- \n")
+                    all_authors = article.findAll(
+                        'a', {'class': re.compile(r'\bjournalist\b')})
+                    authors = [self.tag_to_string(a) for a in all_authors]
+                    # print(f"Authors in 'a' tag : {authors}")
+
+                    # If no link to the author profile is available, the
+                    # html separador is a span tag
+                    if not all_authors:
+                        try:
+                            all_authors = article.findAll(
+                                'span', {'class': re.compile(r'\bjournalist\b')})
+                            authors = [self.tag_to_string(a) for a in all_authors]
+                            # print(f"Authors in 'span' tag : {authors}")
+                        except:
+                            authors = ['unknown']
+
+                    description = article.find('p').renderContents().decode('utf-8')
+                    # print(f"Description in article : {self.tag_to_string(description).strip()} ")
+
+                    summary = {
+                        'title': self.tag_to_string(title).strip(),
+                        'description': description,
+                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
+                        'author': ', '.join(authors),
+                        'article_type': article_type,
+                        'mot_cle': article_mot_cle.capitalize(),
+                        'url': 'https://www.mediapart.fr' + url,
+                    }
+
+                    # print(f"\nSummary: {summary}")
+
+                    webpage_article.append(summary)
+                except:
+                    pass
+
+            specific_articles += [(type_of_article, webpage_article)] if webpage_article else []
+            return specific_articles
+
         articles = []
-        breves = []
-        liens = []
-        confidentiels = []
+        for category in dict_article_sources:
+            articles += get_articles(category['type'], category['webpage'],
+                                     category['separador']['page'],
+                                     category['separador']['thread']
+                                     )

-        soup = self.index_to_soup(
-            'https://www.mediapart.fr/journal/fil-dactualites')
-        page = soup.find('main', {'class': 'global-wrapper'})
-        fils = page.find('ul', {'class': 'post-list universe-journal'})
-
-        for article in fils.findAll('li'):
-            try:
-                title = article.find('h3', recursive=False)
-
-                if title is None or ''.join(title['class']) == 'title-specific':
-                    continue
-
-                # print "found fil ",title
-                article_type = article.find('a', {'href': re.compile(
-                    r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
-                # print "kind: ",article_type
-
-                for s in title('span'):
-                    s.replaceWith(s.renderContents().decode('utf-8') + "\n")
-                url = title.find('a', href=True)['href']
-
-                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
-                # print("################################# 9")
-                # print(article_date)
-
-                # if article_date < self.oldest_article_date:
-                #     print "too old"
-                #     continue
-
-                authors = article.findAll(
-                    'a', {'class': re.compile(r'\bjournalist\b')})
-                authors = [self.tag_to_string(a) for a in authors]
-
-                # description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
-
-                # print "fil ",title," by ",authors," : ",description
-
-                summary = {
-                    'title': self.tag_to_string(title).strip(),
-                    'author': ', '.join(authors),
-                    'url': 'https://www.mediapart.fr' + url
-                }
-                if article_type == 'Lien':
-                    liens.append(summary)
-                if article_type == 'Confidentiel':
-                    confidentiels.append(summary)
-                if article_type not in ['Lien', 'Confidentiel']:
-                    breves.append(summary)
-            except:
-                pass
-
-        # print 'La Une: ', len(la_une), ' articles'
-        # for a in la_une: print a["title"]
-        # print 'Brèves: ', len(breves), ' articles'
-        # print 'Revue web: ', len(liens), ' articles'
-        # print 'Confidentiel: ', len(confidentiels), ' articles'
-
-        articles += [('Brèves', breves)] if breves else []
-        articles += [('Revue du Web', liens)] if liens else []
-        articles += [('Confidentiel', confidentiels)] if confidentiels else []
+        print(articles)
         return articles

-# -- print-version
     conversion_options = {'smarten_punctuation': True}
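
Note: my_parse_index returns the list-of-tuples index structure that calibre's feeds_from_index helper consumes, the same shape BasicNewsRecipe.parse_index is documented to return: a list of (section_title, list_of_articles) pairs whose articles are plain dicts. A minimal sketch of that shape; every field value below is a hypothetical placeholder, not data from the site:

    # Shape of the value returned by my_parse_index / get_articles above.
    # All values are hypothetical placeholders.
    index = [
        ('Brèves', [
            {
                'title': 'Un titre',
                'description': "Le chapeau de l'article",
                'date': 'Fri, 15 Jan, 2021 09:00',  # article_date.strftime('%a, %d %b, %Y %H:%M')
                'author': 'Une Journaliste',
                'article_type': 'Brève',
                'mot_cle': 'Politique',
                'url': 'https://www.mediapart.fr/journal/france/150121-un-titre',
            },
        ]),
    ]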
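The scraping in get_articles hinges on two class-based lookups per section page: the 'page' container ('ul' or 'li' pages use 'ul', the others 'div') inside main.global-wrapper, and one 'thread' element per article. A self-contained sketch of that traversal, using BeautifulSoup directly in place of calibre's index_to_soup, over a trimmed, hypothetical HTML snippet (the real pages carry far more markup):

    from bs4 import BeautifulSoup

    # Hypothetical, heavily trimmed markup mirroring the selectors above.
    html = '''
    <main class="global-wrapper">
      <ul class="post-list universe-journal">
        <li>
          <h3><a href="/journal/france/150121-un-titre">Un titre</a></h3>
          <time datetime="2021-01-15"></time>
        </li>
      </ul>
    </main>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('main', {'class': 'global-wrapper'})
    fils = page.find('ul', {'class': 'post-list universe-journal'})
    for article in fils.findAll('li'):
        title = article.find('h3')
        url = title.find('a', href=True)['href']
        date = article.find('time', datetime=True)['datetime']
        print(url, date)  # /journal/france/150121-un-titre 2021-01-15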
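The new age filter replaces the old parse_french_date logic: the ISO date carried by each article's time[datetime] attribute is parsed with datetime.strptime and compared against oldest_article_date (hence the switch from date.today() to datetime.today(), so the comparison with strptime's datetime result is well-typed). The check in isolation; both the cutoff and the date value are hypothetical:

    from datetime import datetime, timedelta

    oldest_article = 2  # days; hypothetical value of the recipe's setting
    oldest_article_date = datetime.today() - timedelta(days=oldest_article)

    date_attr = '2021-01-15'  # hypothetical value of a time[datetime] attribute
    article_date = datetime.strptime(date_attr, '%Y-%m-%d')

    if article_date < oldest_article_date:
        print('skipped: older than the cutoff')
    else:
        print('kept:', article_date.strftime('%a, %d %b, %Y %H:%M'))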