Mirror of https://github.com/kovidgoyal/calibre.git

rewrite my_parse_index so it handles more sections

This commit is contained in:
parent 34da60c843
commit c7174bc8d6
@@ -3,15 +3,17 @@
 from __future__ import unicode_literals

 __license__ = 'GPL v3'
-__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' # noqa
+__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
 '''
 Mediapart
 '''

+import sys
+
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.web.feeds import feeds_from_index
-from datetime import date, timedelta
+from datetime import datetime, timedelta


 def classes(classes):
@@ -22,7 +24,7 @@ def classes(classes):

 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
+    __author__ = 'Loïc Houpert (adapted from a version by Daniel Bonnery, Mathieu Godlewski and Louis Gesbert)'
     description = 'Global news in French from news site Mediapart'
     publication_type = 'newspaper'
     language = 'fr'
@@ -45,89 +47,157 @@ class Mediapart(BasicNewsRecipe):

     # --

-    oldest_article_date = date.today() - timedelta(days=oldest_article)
+    oldest_article_date = datetime.today() - timedelta(days=oldest_article)

-    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
-    # the 10 last elements :/)

     feeds = [
         ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]

+    # The feed at 'http://www.mediapart.fr/articles/feed' only carries the 10
+    # most recent items, so the articles are indexed from dedicated section
+    # pages in the function my_parse_index. In that function the articles are
+    # parsed by the function get_articles using the entries of dict_article_sources.

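The new indexing is data-driven: each entry of dict_article_sources (added below) names a section, the page to scrape, and the HTML tags that delimit the article list, so supporting a further section only takes one more entry. A minimal sketch, assuming a hypothetical 'Écologie' section whose URL follows the same pattern (not part of this commit):

    # Hypothetical example only: the 'Écologie' entry and its URL are assumptions.
    dict_article_sources.append({
        'type': 'Écologie',                                      # section title used for the feed
        'webpage': 'https://www.mediapart.fr/journal/ecologie',  # assumed section index page
        'separador': {'page': 'div', 'thread': 'div'},           # container and per-article tags
    })
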
     def parse_feeds(self):
         feeds = super(Mediapart, self).parse_feeds()
         feeds += feeds_from_index(self.my_parse_index(feeds))
+        print("\n======================================================" +
+              "======================================================\n")
+        print("======================================================" +
+              "======================================================\n")
+        print(f" List of feeds: {feeds}")
+        # sys.exit("sys.exit for debug")
         return feeds

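feeds_from_index() consumes the same structure a calibre recipe returns from parse_index(): a list of (section title, list of article dicts) tuples, which is exactly what my_parse_index() builds below. A minimal sketch with placeholder values:

    # Placeholder index; each article dict needs at least 'title' and 'url'.
    index = [
        ('Brèves', [
            {'title': 'Some headline',
             'url': 'https://www.mediapart.fr/journal/...',
             'date': 'Mon, 01 Feb, 2021 10:00',
             'description': 'Teaser text',
             'author': 'Some Author'},
        ]),
    ]
    feeds += feeds_from_index(index)
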
     def my_parse_index(self, la_une):

+        dict_article_sources = [
+            {'type':'Brèves',
+             'webpage':'https://www.mediapart.fr/journal/fil-dactualites',
+             'separador':{'page':'ul','thread':'li'}
+             },
+            {'type':'International',
+             'webpage':'https://www.mediapart.fr/journal/international',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'France',
+             'webpage':'https://www.mediapart.fr/journal/france',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'Économie',
+             'webpage':'https://www.mediapart.fr/journal/economie',
+             'separador':{'page':'div','thread':'div'}
+             },
+            {'type':'Culture',
+             'webpage':'https://www.mediapart.fr/journal/culture-idees',
+             'separador':{'page':'div','thread':'div'}
+             },
+        ]

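In each entry, 'separador' names the tag of the list container ('page') and of the individual teasers ('thread') that get_articles() below hands to page.find() and fils.findAll(). The markup this implies (inferred from the selectors, not shown in the commit) is roughly:

    # For {'page': 'ul', 'thread': 'li'} (the 'Brèves' entry):
    #
    #   <main class="global-wrapper">
    #     <ul class="post-list universe-journal">
    #       <li>... one article teaser ...</li>
    #       <li>... another teaser ...</li>
    #     </ul>
    #   </main>
    #
    # The other sections use <div> for both the container and the items.
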
+        def get_articles(type_of_article,webpage,
+                         separador_page='ul', separador_thread='li'):
+
+            print("\n======================================================" +
+                  "======================================================")
+            print(f"[Type of Article]:{type_of_article}")
+            print(f"[Webpage]:{webpage}")
+            print("\n======================================================" +
+                  "======================================================\n")
+
+            specific_articles = []
+
+            webpage_article = []
+            soup = self.index_to_soup(webpage)
+            page = soup.find('main', {'class': 'global-wrapper'})
+            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
+
+            # print(f"Print value of fils.findAll('li'):\n {fils.findAll('li')} ")
+            all_articles = fils.findAll(separador_thread)
+            # print(soup.prettify())
+            for article in all_articles:
+                try:
+                    title = article.find('h3', recursive=False)
+                    if title is None or ''.join(title['class']) == 'title-specific':
+                        # print(f"[BAD title entry] Print value of title:\n {title}")
+                        continue
+                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")
+
+                    try:
+                        article_mot_cle = article.find('a', {'href': re.compile(
+                            r'.*\/mot-cle\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_mot_cle = ''
+
+                    try:
+                        article_type = article.find('a', {'href': re.compile(
+                            r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
+                    except:
+                        article_type = ''
+
+                    # print(f"Article Type:\n {article_type}\n")
+
+                    for s in title('span'):
+                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
+                    url = title.find('a', href=True)['href']
+
+                    date = article.find('time', datetime=True)['datetime']
+                    article_date = datetime.strptime(date,'%Y-%m-%d')
+                    if article_date < self.oldest_article_date:
+                        print("article_date < self.oldest_article_date\n")
+                        continue
+
+                    # print("-------- Recent article added to the list ------- \n")
+                    all_authors = article.findAll(
+                        'a', {'class': re.compile(r'\bjournalist\b')})
+                    authors = [self.tag_to_string(a) for a in all_authors]
+                    # print(f"Authors in tag <a>: {authors}")
+
+                    # If no link to the author profile is available the
+                    # html separador is a span tag
+                    if not all_authors:
+                        try:
+                            all_authors = article.findAll(
+                                'span', {'class': re.compile(r'\bjournalist\b')})
+                            authors = [self.tag_to_string(a) for a in all_authors]
+                            # print(f"Authors in tag <span>: {authors}")
+                        except:
+                            authors = 'unknown'
+
+                    description = article.find('p').renderContents().decode('utf-8')
+                    # print(f" <p> in article : {self.tag_to_string(description).strip()} ")
+
+                    summary = {
+                        'title': self.tag_to_string(title).strip(),
+                        'description': description,
+                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
+                        'author': ', '.join(authors),
+                        'article_type': article_type,
+                        'mot_cle': article_mot_cle.capitalize(),
+                        'url': 'https://www.mediapart.fr' + url,
+                    }
+
+                    # print(f"\nSummary: {summary}")
+
+                    webpage_article.append(summary)
+                except:
+                    pass
+
+            specific_articles += [(type_of_article, webpage_article)] if webpage_article else []
+            return specific_articles

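This age check is the reason the import at the top of the diff changed from date to datetime: strptime() returns a datetime, and Python 3 raises a TypeError when comparing a datetime against a date, so the old date.today() cutoff could not be compared with article_date. A quick illustration:

    from datetime import date, datetime, timedelta

    article_date = datetime.strptime('2021-02-01', '%Y-%m-%d')
    article_date < date.today() - timedelta(days=30)      # TypeError in Python 3
    article_date < datetime.today() - timedelta(days=30)  # fine: both are datetimes
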
         articles = []

-        breves = []
-        liens = []
-        confidentiels = []
-
-        soup = self.index_to_soup(
-            'https://www.mediapart.fr/journal/fil-dactualites')
-        page = soup.find('main', {'class': 'global-wrapper'})
-        fils = page.find('ul', {'class': 'post-list universe-journal'})
-
-        for article in fils.findAll('li'):
-            try:
-                title = article.find('h3', recursive=False)
-
-                if title is None or ''.join(title['class']) == 'title-specific':
-                    continue
-
-                # print "found fil ",title
-                article_type = article.find('a', {'href': re.compile(
-                    r'.*\/type-darticles\/.*')}).renderContents().decode('utf-8')
-                # print "kind: ",article_type
-
-                for s in title('span'):
-                    s.replaceWith(s.renderContents().decode('utf-8') + "\n")
-                url = title.find('a', href=True)['href']
-
-                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents().decode('utf-8'))
-                # print("################################# 9")
-                # print(article_date)
-
-                # if article_date < self.oldest_article_date:
-                #     print "too old"
-                #     continue
-
-                authors = article.findAll(
-                    'a', {'class': re.compile(r'\bjournalist\b')})
-                authors = [self.tag_to_string(a) for a in authors]
-
-                # description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
-
-                # print "fil ",title," by ",authors," : ",description
-
-                summary = {
-                    'title': self.tag_to_string(title).strip(),
-                    'author': ', '.join(authors),
-                    'url': 'https://www.mediapart.fr' + url
-                }
-                if article_type == 'Lien':
-                    liens.append(summary)
-                if article_type == 'Confidentiel':
-                    confidentiels.append(summary)
-                if article_type not in ['Lien', 'Confidentiel']:
-                    breves.append(summary)
-            except:
-                pass
-
-        # print 'La Une: ', len(la_une), ' articles'
-        # for a in la_une: print a["title"]
-        # print 'Brèves: ', len(breves), ' articles'
-        # print 'Revue web: ', len(liens), ' articles'
-        # print 'Confidentiel: ', len(confidentiels), ' articles'
-
-        articles += [('Brèves', breves)] if breves else []
-        articles += [('Revue du Web', liens)] if liens else []
-        articles += [('Confidentiel', confidentiels)] if confidentiels else []
-
+        for category in dict_article_sources:
+            articles += get_articles(category['type'],category['webpage'],
+                                     category['separador']['page'],
+                                     category['separador']['thread']
+                                     )
+
+        print(articles)
         return articles

-    # -- print-version

     conversion_options = {'smarten_punctuation': True}
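A recipe change like this one is usually verified from the command line; calibre's recipe tutorial suggests a test build along these lines (file and output names are arbitrary):

    ebook-convert mediapart.recipe .epub --test -vv --debug-pipeline debug

With --test, only a few articles from at most two feeds are downloaded, which keeps the debug cycle for my_parse_index short.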