rewrite my_parse_index so it handles more sections

Loic Houpert 2021-01-11 21:43:50 +01:00
parent 34da60c843
commit c7174bc8d6

@@ -3,15 +3,17 @@
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' # noqa
__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
'''
Mediapart
'''
import sys
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date, timedelta
from datetime import datetime, timedelta
def classes(classes):
@@ -22,7 +24,7 @@ def classes(classes):
class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
__author__ = 'Loïc Houpert (adapted from a version by Daniel Bonnery, Mathieu Godlewski and Louis Gesbert)'
description = 'Global news in French from news site Mediapart'
publication_type = 'newspaper'
language = 'fr'
@@ -45,89 +47,157 @@ class Mediapart(BasicNewsRecipe):
# --
oldest_article_date = date.today() - timedelta(days=oldest_article)
oldest_article_date = datetime.today() - timedelta(days=oldest_article)
# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
# the 10 last elements :/)
feeds = [
('La Une', 'http://www.mediapart.fr/articles/feed'),
]
# The feed at 'http://www.mediapart.fr/articles/feed' only displays the 10
# most recent elements, so the articles are indexed from specific section pages
# in the function my_parse_index. In that function the articles are parsed
# with the function get_articles and the entries of dict_article_sources
# (see the sketch after parse_feeds below).
def parse_feeds(self):
feeds = super(Mediapart, self).parse_feeds()
feeds += feeds_from_index(self.my_parse_index(feeds))
print("\n======================================================" +
"======================================================\n")
print("======================================================" +
"======================================================\n")
print(f" List of feeds: {feeds}")
#sys.exit("sys.exit for debug")
return feeds
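A minimal sketch (not part of the commit) of the index structure that my_parse_index builds and that feeds_from_index consumes: a list of (section_title, article_dicts) tuples, where each dict uses the same keys as the summary dicts assembled below. The section name, URL and field values here are placeholders.
from calibre.web.feeds import feeds_from_index
example_index = [
    ('International', [{
        'title': 'Example headline',
        'url': 'https://www.mediapart.fr/journal/international/example',
        'date': 'Mon, 11 Jan, 2021 21:43',
        'description': 'Example teaser',
        'author': 'Example Author',
    }]),
]
# feeds_from_index returns one Feed object per (section, articles) tuple
example_feeds = feeds_from_index(example_index)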
def my_parse_index(self, la_une):
dict_article_sources = [
{'type':'Brèves',
'webpage':'https://www.mediapart.fr/journal/fil-dactualites',
'separador':{'page':'ul','thread':'li'}
},
{'type':'International',
'webpage':'https://www.mediapart.fr/journal/international',
'separador':{'page':'div','thread':'div'}
},
{'type':'France',
'webpage':'https://www.mediapart.fr/journal/france',
'separador':{'page':'div','thread':'div'}
},
{'type':'Économie',
'webpage':'https://www.mediapart.fr/journal/economie',
'separador':{'page':'div','thread':'div'}
},
{'type':'Culture',
'webpage':'https://www.mediapart.fr/journal/culture-idees',
'separador':{'page':'div','thread':'div'}
},
]
def get_articles(type_of_article,webpage,
separador_page='ul', separador_thread='li'):
print("\n======================================================" +
"======================================================")
print(f"[Type of Article]:{type_of_article}")
print(f"[Webpage]:{webpage}")
print("\n======================================================" +
"======================================================\n")
specific_articles = []
webpage_article = []
soup = self.index_to_soup(webpage)
page = soup.find('main', {'class': 'global-wrapper'})
fils = page.find(separador_page, {'class': 'post-list universe-journal'})
# print(f"Print value of fils.findAll('li'):\n {fils.findAll('li')} ")
all_articles = fils.findAll(separador_thread)
# print(soup.prettify())
for article in all_articles:
try:
title = article.find('h3', recursive=False)
if title is None or ''.join(title['class']) == 'title-specific':
# print(f"[BAD title entry] Print value of title:\n {title}")
continue
# print(f"\n[OK title entry] Print value of title:\n {title}\n")
try:
article_mot_cle = article.find('a', {'href': re.compile(
r'.*\/mot-cle\/.*')}).renderContents().decode('utf-8')
except:
article_mot_cle = ''
try:
article_type = article.find('a', {'href': re.compile(
r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
except:
article_type = ''
# print(f"Article Type:\n {article_type}\n")
for s in title('span'):
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href']
date = article.find('time', datetime=True)['datetime']
article_date = datetime.strptime(date,'%Y-%m-%d')
if article_date < self.oldest_article_date:
print("article_date < self.oldest_article_date\n")
continue
# print("-------- Recent article added to the list ------- \n")
all_authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')})
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <a>: {authors}")
# If no link to the author profile is available, the
# html separator is a span tag
if not all_authors:
try:
all_authors = article.findAll(
'span', {'class': re.compile(r'\bjournalist\b')})
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <span>: {authors}")
except:
authors = 'unknown'
description = article.find('p').renderContents().decode('utf-8')
# print(f" <p> in article : {self.tag_to_string(description).strip()} ")
summary = {
'title': self.tag_to_string(title).strip(),
'description': description,
'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
'author': ', '.join(authors),
'article_type': article_type,
'mot_cle': article_mot_cle.capitalize(),
'url': 'https://www.mediapart.fr' + url,
}
# print(f"\nSummary: {summary}")
webpage_article.append(summary)
except:
pass
specific_articles += [(type_of_article, webpage_article)] if webpage_article else []
return specific_articles
articles = []
breves = []
liens = []
confidentiels = []
for category in dict_article_sources:
articles += get_articles(category['type'],category['webpage'],
category['separador']['page'],
category['separador']['thread']
)
soup = self.index_to_soup(
'https://www.mediapart.fr/journal/fil-dactualites')
page = soup.find('main', {'class': 'global-wrapper'})
fils = page.find('ul', {'class': 'post-list universe-journal'})
for article in fils.findAll('li'):
try:
title = article.find('h3', recursive=False)
if title is None or ''.join(title['class']) == 'title-specific':
continue
# print "found fil ",title
article_type = article.find('a', {'href': re.compile(
r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
# print "kind: ",article_type
for s in title('span'):
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href']
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
# print("################################# 9")
# print(article_date)
# if article_date < self.oldest_article_date:
# print "too old"
# continue
authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')})
authors = [self.tag_to_string(a) for a in authors]
# description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
# print "fil ",title," by ",authors," : ",description
summary = {
'title': self.tag_to_string(title).strip(),
'author': ', '.join(authors),
'url': 'https://www.mediapart.fr' + url
}
if article_type == 'Lien':
liens.append(summary)
if article_type == 'Confidentiel':
confidentiels.append(summary)
if article_type not in ['Lien', 'Confidentiel']:
breves.append(summary)
except:
pass
# print 'La Une: ', len(la_une), ' articles'
# for a in la_une: print a["title"]
# print 'Brèves: ', len(breves), ' articles'
# print 'Revue web: ', len(liens), ' articles'
# print 'Confidentiel: ', len(confidentiels), ' articles'
articles += [('Brèves', breves)] if breves else []
articles += [('Revue du Web', liens)] if liens else []
articles += [('Confidentiel', confidentiels)] if confidentiels else []
print(articles)
return articles
# -- print-version
conversion_options = {'smarten_punctuation': True}