Fix Mediapart recipe after site changes

The page structure differs depending on the source type: add fallback lookups that perform a second search to find the articles when the first one returns nothing.
This commit is contained in:
Hervé M 2022-08-01 19:00:38 +02:00 committed by GitHub
parent 401c92737f
commit 357c955b44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -130,12 +130,17 @@ class Mediapart(BasicNewsRecipe):
webpage_article = [] webpage_article = []
soup = self.index_to_soup(webpage) soup = self.index_to_soup(webpage)
page = soup.find('main', {'class': 'global-wrapper'}) page = soup.find('main', {'class': 'global-wrapper'})
if page is None:
page = soup.find('section', {'class': 'news__body-wrapper mb-800'})
fils = page.find(separador_page, {'class': 'post-list universe-journal'}) fils = page.find(separador_page, {'class': 'post-list universe-journal'})
if fils is None:
fils = page.find(separador_page, {'class': 'news__list__content _hasNewsletter'})
all_articles = fils.findAll(separador_thread) all_articles = fils.findAll(separador_thread)
for article in all_articles: for article in all_articles:
try: try:
title = article.find('h3', recursive=False) # title = article.find('h3', recursive=False)
title = article.find('h3', recursive=True)
if title is None or ''.join(title['class']) == 'title-specific': if title is None or ''.join(title['class']) == 'title-specific':
# print(f"[BAD title entry] Print value of title:\n {title}") # print(f"[BAD title entry] Print value of title:\n {title}")
continue continue
@@ -173,8 +178,13 @@ class Mediapart(BasicNewsRecipe):
# print("-------- Recent article added to the list ------- \n") # print("-------- Recent article added to the list ------- \n")
all_authors = article.findAll( all_authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')} # 'a', {'class': re.compile(r'\bjournalist\b')}
'div', {'class': 'teaser__signature'}
) )
if not all_authors:
all_authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')}
)
authors = [self.tag_to_string(a) for a in all_authors] authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <a>: {authors}") # print(f"Authors in tag <a>: {authors}")
@@ -202,8 +212,11 @@ class Mediapart(BasicNewsRecipe):
'mot_cle': article_mot_cle.capitalize(), 'mot_cle': article_mot_cle.capitalize(),
'url': 'https://www.mediapart.fr' + url, 'url': 'https://www.mediapart.fr' + url,
} }
if webpage_article:
webpage_article.append(summary) if summary['url'] != webpage_article[-1]['url']:
webpage_article.append(summary)
else:
webpage_article.append(summary)
except Exception: except Exception:
pass pass