Fix Mediapart site changes

The page structure differs depending on the source type: add workarounds that perform a second search to find the articles.
2025-07-09 03:04:10 -04:00 · 2022-08-01 19:00:38 +02:00 · 2022-08-01 19:00:38 +02:00 · 357c955b44
commit 357c955b44
parent 401c92737f
1 changed files with 17 additions and 4 deletions
--- a/recipes/mediapart.recipe
+++ b/recipes/mediapart.recipe
@@ -130,12 +130,17 @@ class Mediapart(BasicNewsRecipe):
            webpage_article = []
            soup = self.index_to_soup(webpage)
            page = soup.find('main', {'class': 'global-wrapper'})
            if page is None:
                page = soup.find('section', {'class': 'news__body-wrapper mb-800'})
            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
            if fils is None:
                fils = page.find(separador_page, {'class': 'news__list__content _hasNewsletter'})
            all_articles = fils.findAll(separador_thread)
            for article in all_articles:
                try:
-                    title = article.find('h3', recursive=False)
+                    # title = article.find('h3', recursive=False)
                    title = article.find('h3', recursive=True)
                    if title is None or ''.join(title['class']) == 'title-specific':
                        # print(f"[BAD title entry] Print value of title:\n {title}")
                        continue
@@ -173,8 +178,13 @@ class Mediapart(BasicNewsRecipe):
                    # print("-------- Recent article added to the list ------- \n")
                    all_authors = article.findAll(
-                        'a', {'class': re.compile(r'\bjournalist\b')}
+                        # 'a', {'class': re.compile(r'\bjournalist\b')}
                        'div', {'class': 'teaser__signature'}
                    )
                    if not all_authors:
                        all_authors = article.findAll(
                            'a', {'class': re.compile(r'\bjournalist\b')}
                        )
                    authors = [self.tag_to_string(a) for a in all_authors]
                    # print(f"Authors in tag <a>: {authors}")
@@ -202,8 +212,11 @@ class Mediapart(BasicNewsRecipe):
                        'mot_cle': article_mot_cle.capitalize(),
                        'url': 'https://www.mediapart.fr' + url,
                    }
-
+                    if webpage_article:
-                    webpage_article.append(summary)
+                        if summary['url'] != webpage_article[-1]['url']:
                            webpage_article.append(summary)
                    else:
                        webpage_article.append(summary)
                except Exception:
                    pass