Mirror of https://github.com/kovidgoyal/calibre.git

rewrite my_parse_index so it handles more sections

This commit is contained in:
parent 34da60c843
commit c7174bc8d6
@@ -3,15 +3,17 @@
 from __future__ import unicode_literals

 __license__ = 'GPL v3'
-__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' # noqa
+__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
 '''
 Mediapart
 '''

+import sys
+
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.web.feeds import feeds_from_index
-from datetime import date, timedelta
+from datetime import datetime, timedelta


 def classes(classes):
@@ -22,7 +24,7 @@ def classes(classes):

 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Loïc Houpert (adapted from a version by Daniel Bonnery, Mathieu Godlewski and Louis Gesbert)'
     description = 'Global news in French from news site Mediapart'
     publication_type = 'newspaper'
     language = 'fr'
@@ -45,89 +47,157 @@ class Mediapart(BasicNewsRecipe):

     # --

-    oldest_article_date = date.today() - timedelta(days=oldest_article)
+    oldest_article_date = datetime.today() - timedelta(days=oldest_article)

-    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
-    # the 10 last elements :/)

     feeds = [
         ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]

+    # The feed at 'http://www.mediapart.fr/articles/feed' only carries the 10
+    # most recent items, so the articles are indexed from dedicated section
+    # pages in the function my_parse_index. In that function the articles are
+    # parsed by the function get_articles using the entries of dict_article_sources.

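The new indexing is data-driven: each entry of dict_article_sources (added below) names a section, the page to scrape, and the HTML tags that delimit the article list, so supporting a further section only takes one more entry. A minimal sketch, assuming a hypothetical 'Écologie' section whose URL follows the same pattern (not part of this commit):

    # Hypothetical example only: the 'Écologie' entry and its URL are assumptions.
    dict_article_sources.append({
        'type': 'Écologie',                                      # section title used for the feed
        'webpage': 'https://www.mediapart.fr/journal/ecologie',  # assumed section index page
        'separador': {'page': 'div', 'thread': 'div'},           # container and per-article tags
    })
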
     def parse_feeds(self):
         feeds = super(Mediapart, self).parse_feeds()
         feeds += feeds_from_index(self.my_parse_index(feeds))
+        print("\n======================================================" +
+              "======================================================\n")
+        print("======================================================" +
+              "======================================================\n")
+        print(f" List of feeds: {feeds}")
+        # sys.exit("sys.exit for debug")
         return feeds

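feeds_from_index() consumes the same structure a calibre recipe returns from parse_index(): a list of (section title, list of article dicts) tuples, which is exactly what my_parse_index() builds below. A minimal sketch with placeholder values:

    # Placeholder index; each article dict needs at least 'title' and 'url'.
    index = [
        ('Brèves', [
            {'title': 'Some headline',
             'url': 'https://www.mediapart.fr/journal/...',
             'date': 'Mon, 01 Feb, 2021 10:00',
             'description': 'Teaser text',
             'author': 'Some Author'},
        ]),
    ]
    feeds += feeds_from_index(index)
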
     def my_parse_index(self, la_une):

+        dict_article_sources = [
+            {'type':'Brèves',
+             'webpage':'https://www.mediapart.fr/journal/fil-dactualites',
+             'separador':{'page':'ul','thread':'li'}
+             },
+            {'type':'International',
+             'webpage':'https://www.mediapart.fr/journal/international',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'France',
+             'webpage':'https://www.mediapart.fr/journal/france',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'Économie',
+             'webpage':'https://www.mediapart.fr/journal/economie',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'Culture',
+             'webpage':'https://www.mediapart.fr/journal/culture-idees',
+             'separador':{'page':'div','thread':'div'}
+             },
+        ]

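In each entry, 'separador' names the tag of the list container ('page') and of the individual teasers ('thread') that get_articles() below hands to page.find() and fils.findAll(). The markup this implies (inferred from the selectors, not shown in the commit) is roughly:

    # For {'page': 'ul', 'thread': 'li'} (the 'Brèves' entry):
    #
    #   <main class="global-wrapper">
    #     <ul class="post-list universe-journal">
    #       <li>... one article teaser ...</li>
    #       <li>... another teaser ...</li>
    #     </ul>
    #   </main>
    #
    # The other sections use <div> for both the container and the items.
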
+        def get_articles(type_of_article,webpage,
+                         separador_page='ul', separador_thread='li'):
+
+            print("\n======================================================" +
+                  "======================================================")
+            print(f"[Type of Article]:{type_of_article}")
+            print(f"[Webpage]:{webpage}")
+            print("\n======================================================" +
+                  "======================================================\n")
+
+            specific_articles = []
+
+            webpage_article = []
+            soup = self.index_to_soup(webpage)
+            page = soup.find('main', {'class': 'global-wrapper'})
+            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
+
+            # print(f"Print value of fils.findAll('li'):\n {fils.findAll('li')} ")
+            all_articles = fils.findAll(separador_thread)
+            # print(soup.prettify())
+            for article in all_articles:
+                try:
+                    title = article.find('h3', recursive=False)
+                    if title is None or ''.join(title['class']) == 'title-specific':
+                        # print(f"[BAD title entry] Print value of title:\n {title}")
+                        continue
+                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")
+
+                    try:
+                        article_mot_cle = article.find('a', {'href': re.compile(
+                            r'.*\/mot-cle\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_mot_cle = ''
+
+                    try:
+                        article_type = article.find('a', {'href': re.compile(
+                            r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_type = ''
+
+                    # print(f"Article Type:\n {article_type}\n")
+
+                    for s in title('span'):
+                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
+                    url = title.find('a', href=True)['href']
+
+                    date = article.find('time', datetime=True)['datetime']
+                    article_date = datetime.strptime(date,'%Y-%m-%d')
+                    if article_date < self.oldest_article_date:
+                        print("article_date < self.oldest_article_date\n")
+                        continue
+
+                    # print("-------- Recent article added to the list ------- \n")
+                    all_authors = article.findAll(
+                        'a', {'class': re.compile(r'\bjournalist\b')})
+                    authors = [self.tag_to_string(a) for a in all_authors]
+                    # print(f"Authors in tag <a>: {authors}")
+
+                    # If no link to the author profile is available the
+                    # html separador is a span tag
+                    if not all_authors:
+                        try:
+                            all_authors = article.findAll(
+                                'span', {'class': re.compile(r'\bjournalist\b')})
+                            authors = [self.tag_to_string(a) for a in all_authors]
+                            # print(f"Authors in tag <span>: {authors}")
+                        except:
+                            authors = 'unknown'
+
+                    description = article.find('p').renderContents().decode('utf-8')
+                    # print(f" <p> in article : {self.tag_to_string(description).strip()} ")
+
+                    summary = {
+                        'title': self.tag_to_string(title).strip(),
+                        'description': description,
+                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
+                        'author': ', '.join(authors),
+                        'article_type': article_type,
+                        'mot_cle': article_mot_cle.capitalize(),
+                        'url': 'https://www.mediapart.fr' + url,
+                    }
+
+                    # print(f"\nSummary: {summary}")
+
+                    webpage_article.append(summary)
+                except:
+                    pass
+
+            specific_articles += [(type_of_article, webpage_article)] if webpage_article else []
+            return specific_articles

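This age check is the reason the import at the top of the diff changed from date to datetime: strptime() returns a datetime, and Python 3 raises a TypeError when comparing a datetime against a date, so the old date.today() cutoff could not be compared with article_date. A quick illustration:

    from datetime import date, datetime, timedelta

    article_date = datetime.strptime('2021-02-01', '%Y-%m-%d')
    article_date < date.today() - timedelta(days=30)      # TypeError in Python 3
    article_date < datetime.today() - timedelta(days=30)  # fine: both are datetimes
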
         articles = []

-        breves = []
-        liens = []
-        confidentiels = []
-
-        soup = self.index_to_soup(
-            'https://www.mediapart.fr/journal/fil-dactualites')
-        page = soup.find('main', {'class': 'global-wrapper'})
-        fils = page.find('ul', {'class': 'post-list universe-journal'})
-
-        for article in fils.findAll('li'):
-            try:
-                title = article.find('h3', recursive=False)
-
-                if title is None or ''.join(title['class']) == 'title-specific':
-                    continue
-
-                # print "found fil ",title
-                article_type = article.find('a', {'href': re.compile(
-                    r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
-                # print "kind: ",article_type
-
-                for s in title('span'):
-                    s.replaceWith(s.renderContents().decode('utf-8') + "\n")
-                url = title.find('a', href=True)['href']
-
-                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
-                # print("################################# 9")
-                # print(article_date)
-
-                # if article_date < self.oldest_article_date:
-                #     print "too old"
-                #     continue
-
-                authors = article.findAll(
-                    'a', {'class': re.compile(r'\bjournalist\b')})
-                authors = [self.tag_to_string(a) for a in authors]
-
-                # description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
-
-                # print "fil ",title," by ",authors," : ",description
-
-                summary = {
-                    'title': self.tag_to_string(title).strip(),
-                    'author': ', '.join(authors),
-                    'url': 'https://www.mediapart.fr' + url
-                }
-                if article_type == 'Lien':
-                    liens.append(summary)
-                if article_type == 'Confidentiel':
-                    confidentiels.append(summary)
-                if article_type not in ['Lien', 'Confidentiel']:
-                    breves.append(summary)
-            except:
-                pass
-
-        # print 'La Une: ', len(la_une), ' articles'
-        # for a in la_une: print a["title"]
-        # print 'Brèves: ', len(breves), ' articles'
-        # print 'Revue web: ', len(liens), ' articles'
-        # print 'Confidentiel: ', len(confidentiels), ' articles'
-
-        articles += [('Brèves', breves)] if breves else []
-        articles += [('Revue du Web', liens)] if liens else []
-        articles += [('Confidentiel', confidentiels)] if confidentiels else []
-
+        for category in dict_article_sources:
+            articles += get_articles(category['type'],category['webpage'],
+                                     category['separador']['page'],
+                                     category['separador']['thread']
+                                     )
+
+        print(articles)
         return articles

-    # -- print-version

     conversion_options = {'smarten_punctuation': True}
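A recipe change like this one is usually verified from the command line; calibre's recipe tutorial suggests a test build along these lines (file and output names are arbitrary):

    ebook-convert mediapart.recipe .epub --test -vv --debug-pipeline debug

With --test, only a few articles from at most two feeds are downloaded, which keeps the debug cycle for my_parse_index short.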