calibre/recipes/le_monde_diplomatique_fr.recipe
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013'
'''
monde-diplomatique.fr
'''
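
# Usage note: the recipe can be test-built from the command line with
# calibre's ebook-convert tool, for example:
#   ebook-convert le_monde_diplomatique_fr.recipe output.epub --test
# where --test restricts the download to a couple of articles per feed.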

import re

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
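
# Article listings on the site use root-relative hrefs; make them absolute
# so calibre can download them.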
def absurl(url):
    if url.startswith('/'):
        url = 'http://www.monde-diplomatique.fr' + url
    return url


class LeMondeDiplomatiqueSiteWeb(BasicNewsRecipe):
    title = u'Le Monde diplomatique.fr'
    __author__ = 'Gaëtan Lehmann'
    description = "Le Monde diplomatique est un mensuel français d'information et d'opinion à la ligne éditoriale nettement engagée en faveur d'une gauche de rupture avec le capitalisme. Il aborde de nombreux sujets — géopolitique, relations internationales, économie, questions sociales, écologie, culture, médias, …" # noqa: E501
    oldest_article = 30
    max_articles_per_feed = 100
    auto_cleanup = True
    publisher = 'monde-diplomatique.fr'
    category = 'news, France, world'
    language = 'fr'
    masthead_url = 'http://www.monde-diplomatique.fr/squelettes/images/logotyfa.png'
    timefmt = ' [%d %b %Y]'
    no_stylesheets = True

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5 gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }
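
    # Honour the 'days' option declared above so users can override
    # oldest_article when scheduling the recipe.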
    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)
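
    # Two RSS sources: the Diplo blogs, and the main site's archive feed.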
    feeds = [(u'Blogs', u'http://blog.mondediplo.net/spip.php?page=backend'),
             (u'Archives', u'http://www.monde-diplomatique.fr/rss/')]
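
    # Strip the site-wide suffixes the site appends to page and article
    # titles, and drop the 'Grand format' heading.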
    preprocess_regexps = [
        (re.compile(r'<title>(.*) - Les blogs du Diplo</title>'),
         lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) - Les blogs du Diplo</h2>'),
         lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<title>(.*) \(Le Monde diplomatique\)</title>'),
         lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) \(Le Monde diplomatique\)</h2>'),
         lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<h3>Grand format</h3>'), lambda m: '')]
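
    # Site chrome (navigation, logos, breadcrumbs) that should not end up
    # in the e-book.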
    remove_tags = [dict(name='div', attrs={'class': 'voiraussi liste'}),
                   dict(name='ul', attrs={
                       'class': 'hermetique carto hombre_demi_inverse'}),
                   dict(name='a', attrs={'class': 'tousles'}),
                   dict(name='h3', attrs={'class': 'cat'}),
                   dict(name='div', attrs={'class': 'logodiplo'}),
                   dict(name='img', attrs={'class': 'spip_logos'}),
                   dict(name='p', attrs={'id': 'hierarchie'}),
                   dict(name='div', attrs={'class': 'espace'})]

    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
        'linearize_tables': True
    }

    remove_empty_feeds = True
    filterDuplicates = True

    # Deliberately no parse_index() override: BasicNewsRecipe falls back to
    # the feeds above when parse_index() raises NotImplementedError, which
    # lets parse_feeds() below mix feed and scraped-index results.
    def parse_index_valise(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/carnet/')
        cnt = soup.find('ul', attrs={'class': 'liste double'})
        for item in cnt.findAll('li'):
            description = ''
            feed_link = item.find('a', href=True)
            title = self.tag_to_string(item.find('h3'))
            desc = item.find('div', attrs={'class': 'intro'})
            date = item.find('div', attrs={'class': 'dates_auteurs'})
            if desc:
                # tag_to_string() handles nested markup; desc.string would be
                # None whenever the intro div contains child tags
                description = self.tag_to_string(desc)
            if feed_link:
                articles.append({
                    'title': title,
                    'date': self.tag_to_string(date),
                    'url': absurl(feed_link['href']),
                    'description': description
                })
        return [('La valise diplomatique', articles)]
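
    # 'Cartes' entries carry 'Author, date' in a single div; split it so the
    # author can double as the description.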
    def parse_index_cartes(self):
        articles = []
        soup = self.index_to_soup('http://www.monde-diplomatique.fr/cartes/')
        cnt = soup.find('ul', attrs={'class': 'liste_vignettes hautcartes'})
        for li in cnt.findAll('li'):
            feed_link = li.find('a', href=True)
            h3 = li.find('h3')
            author_and_date = li.find('div', attrs={'class': 'dates_auteurs'})
            author_date = self.tag_to_string(author_and_date).split(', ')
            author = author_date[0]
            date = author_date[-1]
            if feed_link:
                title = self.tag_to_string(h3)
                articles.append({
                    'title': title,
                    'date': date,
                    'url': absurl(feed_link['href']),
                    'description': author
                })
        return [('Cartes', articles)]
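
    # Merge the scraped sections with the regular RSS feeds: 'La valise
    # diplomatique' first, then the feeds, then 'Cartes'.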
    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        valise = feeds_from_index(self.parse_index_valise(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        cartes = feeds_from_index(self.parse_index_cartes(), oldest_article=self.oldest_article,
                                  max_articles_per_feed=self.max_articles_per_feed,
                                  log=self.log)
        return valise + feeds + cartes