calibre/recipes/le_monde_diplomatique_fr.recipe
Kovid Goyal 567040ee1e Perform PEP8 compliance checks on the entire codebase
Some bits of PEP 8 are turned off via setup.cfg
2016-07-29 21:25:17 +05:30

124 lines
5.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013'
'''
monde-diplomatique.fr
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
def absurl(url):
    '''
    Return *url* as an absolute URL for a link found on monde-diplomatique.fr.

    * ``//host/path`` (protocol-relative) links already name their host, so
      only the ``http:`` scheme is prepended.
    * ``/path`` (site-relative) links are prefixed with the site root.
    * Anything else is returned unchanged.
    '''
    if url.startswith('//'):
        # Must be tested before the single-slash case, which it also matches;
        # otherwise the foreign host would end up inside the path.
        return 'http:' + url
    if url.startswith('/'):
        return 'http://www.monde-diplomatique.fr' + url
    return url
class LeMondeDiplomatiqueSiteWeb(BasicNewsRecipe):
    '''
    Recipe for monde-diplomatique.fr.

    Combines the RSS feeds listed in ``feeds`` with two sections scraped from
    index pages of the site: "La valise diplomatique" and "Cartes".
    '''

    title = u'Le Monde diplomatique.fr'
    __author__ = 'Gaëtan Lehmann'
    description = "Le Monde diplomatique est un mensuel français d'information et d'opinion à la ligne éditoriale nettement engagée en faveur d'une gauche de rupture avec le capitalisme. Il aborde de nombreux sujets — géopolitique, relations internationales, économie, questions sociales, écologie, culture, médias, …"  # noqa
    oldest_article = 30
    max_articles_per_feed = 100
    auto_cleanup = True
    publisher = 'monde-diplomatique.fr'
    category = 'news, France, world'
    language = 'fr'
    masthead_url = 'http://www.monde-diplomatique.fr/squelettes/images/logotyfa.png'
    timefmt = ' [%d %b %Y]'
    no_stylesheets = True

    feeds = [
        (u'Blogs', u'http://blog.mondediplo.net/spip.php?page=backend'),
        (u'Archives', u'http://www.monde-diplomatique.fr/rss/'),
    ]

    # Strip the site-name suffixes from <title>/<h2> headings and drop the
    # "Grand format" heading altogether.
    preprocess_regexps = [
        (re.compile(r'<title>(.*) - Les blogs du Diplo</title>'),
         lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) - Les blogs du Diplo</h2>'),
         lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<title>(.*) \(Le Monde diplomatique\)</title>'),
         lambda m: '<title>' + m.group(1) + '</title>'),
        (re.compile(r'<h2>(.*) \(Le Monde diplomatique\)</h2>'),
         lambda m: '<h2>' + m.group(1) + '</h2>'),
        (re.compile(r'<h3>Grand format</h3>'), lambda m: ''),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'voiraussi liste'}),
        dict(name='ul', attrs={
            'class': 'hermetique carto hombre_demi_inverse'}),
        dict(name='a', attrs={'class': 'tousles'}),
        dict(name='h3', attrs={'class': 'cat'}),
        dict(name='div', attrs={'class': 'logodiplo'}),
        dict(name='img', attrs={'class': 'spip_logos'}),
        dict(name='p', attrs={'id': 'hierarchie'}),
        dict(name='div', attrs={'class': 'espace'}),
    ]

    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
        'linearize_tables': True
    }

    remove_empty_feeds = True
    filterDuplicates = True

    # parse_index() is deliberately NOT overridden: the inherited one has to
    # keep signalling (by raising) that there is no custom index, so that
    # parse_feeds() below is used and can mix the RSS feeds with the results
    # of the parse_index_*() scrapers.

    def _list_items(self, url, list_class):
        '''
        Fetch *url* and return the ``<li>`` tags of its ``<ul class=list_class>``.

        Returns an empty list (after logging a warning) when the page has no
        such list, so that a layout change on the site empties one section
        instead of raising ``AttributeError`` on ``None``.
        '''
        soup = self.index_to_soup(url)
        container = soup.find('ul', attrs={'class': list_class})
        if container is None:
            self.log.warn('No <ul class="%s"> found at %s, skipping section'
                          % (list_class, url))
            return []
        return container.findAll('li')

    def _index_to_feeds(self, index):
        '''Turn a ``[(section title, articles)]`` index into feeds using this recipe's limits.'''
        return feeds_from_index(
            index,
            oldest_article=self.oldest_article,
            max_articles_per_feed=self.max_articles_per_feed,
            log=self.log)

    def parse_index_valise(self):
        '''
        Scrape the /carnet/ page.

        Returns ``[("La valise diplomatique", articles)]`` where each article
        is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. List items without a link are skipped.
        '''
        articles = []
        items = self._list_items(
            'http://www.monde-diplomatique.fr/carnet/', 'liste double')
        for item in items:
            feed_link = item.find('a', href=True)
            if not feed_link:
                continue
            desc = item.find('div', attrs={'class': 'intro'})
            date = item.find('div', attrs={'class': 'dates_auteurs'})
            articles.append({
                'title': self.tag_to_string(item.find('h3')),
                'date': self.tag_to_string(date),
                'url': absurl(feed_link['href']),
                # tag_to_string() rather than .string: .string is None as
                # soon as the intro contains nested markup.
                'description': self.tag_to_string(desc) if desc else '',
            })
        return [("La valise diplomatique", articles)]

    def parse_index_cartes(self):
        '''
        Scrape the /cartes/ page.

        Returns ``[("Cartes", articles)]`` where each article is a dict with
        the keys ``title``, ``date``, ``url`` and ``description``. The
        "dates_auteurs" text is split on ``', '``: its first part is used as
        the description (author) and its last part as the date. List items
        without a link are skipped.
        '''
        articles = []
        items = self._list_items(
            'http://www.monde-diplomatique.fr/cartes/',
            'liste_vignettes hautcartes')
        for item in items:
            feed_link = item.find('a', href=True)
            if not feed_link:
                continue
            author_and_date = item.find(
                'div', attrs={'class': 'dates_auteurs'})
            parts = self.tag_to_string(author_and_date).split(', ')
            articles.append({
                'title': self.tag_to_string(item.find('h3')),
                'date': parts[-1],
                'url': absurl(feed_link['href']),
                'description': parts[0],
            })
        return [("Cartes", articles)]

    def parse_feeds(self):
        '''
        Return the feeds of the issue: the scraped "La valise diplomatique"
        section, then the RSS feeds from ``feeds``, then the scraped "Cartes"
        section.
        '''
        feeds = BasicNewsRecipe.parse_feeds(self)
        valise = self._index_to_feeds(self.parse_index_valise())
        cartes = self._index_to_feeds(self.parse_index_cartes())
        return valise + feeds + cartes