Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Alternatives Economiques by Kabonix

This commit is contained in:
parent ebcfa3dd17
commit d001a4e786
206  recipes/alternatives_economiques.recipe  Normal file
@@ -0,0 +1,206 @@
#!/usr/bin/env python

import re

from calibre.web.feeds.news import BasicNewsRecipe


class AlternativesEconomiques(BasicNewsRecipe):
    title = 'Alternatives Économiques'
    __author__ = 'Kabonix'
    description = 'Articles de toutes les rubriques (10 articles max par rubrique)'
    publisher = 'Alternatives Économiques'
    language = 'fr'

    oldest_article = 90
    max_articles_per_feed = 10
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    auto_cleanup = False
    remove_empty_feeds = True
    remove_images = False
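
    # The cover image is scraped from the MLP press-distributor catalogue page
    # for the magazine; if that lookup fails, the publisher's logo is used as
    # a fallback.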
    def get_cover_url(self):
        """Dynamically retrieve the URL of the latest cover from MLP"""
        br = self.get_browser()
        try:
            # Open the magazine's page on the MLP catalogue
            soup = self.index_to_soup(br.open('https://catalogueproduits.mlp.fr/produit.aspx?tit_code=1BMwusijpg0%3D').read())

            # Look for the div that contains the cover images
            gallery = soup.find('div', id='gallery')
            if gallery:
                img = gallery.find('img', id='couverture_1')
                if img and img.get('src'):
                    cover_url = img['src']
                    if not cover_url.startswith('http'):
                        cover_url = 'https://catalogueproduits.mlp.fr/' + cover_url
                    self.log('Cover URL found:', cover_url)
                    return cover_url

            self.log('No cover found, falling back to the default image')
            return 'https://www.alternatives-economiques.fr/sites/all/themes/alternatives-economiques-main/assets/logo-alternatives-economiques.svg'

        except Exception as e:
            self.log.error('Error while fetching the cover:', str(e))
            return 'https://www.alternatives-economiques.fr/sites/all/themes/alternatives-economiques-main/assets/logo-alternatives-economiques.svg'
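
    # Article pages on alternatives-economiques.fr end with a numeric id of
    # the form 00XXXXXX (e.g. /some-article-slug/00123456); any other link on
    # a section page (menus, pagination, other sections) is ignored.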
    def is_article_url(self, url):
        article_pattern = re.compile(r'/[^/]+/00\d{6}$')
        return bool(article_pattern.search(url))
    def parse_index(self):
        articles = []
        base_url = 'https://www.alternatives-economiques.fr'
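
        # Hard-coded list of the site's thematic section slugs; each slug is
        # turned into a listing page URL under /thematiques/ below.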
        thematiques = [
            'biodiversite', 'ideesdebats', 'entreprise', 'europe', 'direct-de-recherche',
            'asie', 'a-la-carte', 'chine', 'culture', 'des-idees-pour-sortir-de-la-crise',
            'amerique-du-sud', 'idees-0', 'transport', 'services-publics', 'allemagne',
            'face-a-face', 'politique-monetaire', 'logement', 'climat', 'innovation',
            'agir', 'economie-sociale-et-solidaire', 'societe', 'theorie', 'revenus',
            'tribune', 'conditions-de-travail', 'familles', 'politique', 'numerique',
            'histoire', 'sociorama', 'mondialisation', 'opinions', 'consommation',
            'dette', 'assurance-chomage', 'finance', 'temps-de-travail', 'emploi',
            'protection-sociale', 'refugies', 'royaume-uni', 'conjoncture', 'population',
            'libertes', 'jeunes', 'droit-du-travail', 'responsabilite-sociale', 'industrie',
            'economie', 'ukraine', 'immigration', 'travail', 'etats-unis',
            'inflation', 'relations-sociales', 'inegalites', 'management', 'energie',
            'environnement', 'fiscalite', 'social', 'elections-europeennes', 'retraites',
            'international', 'commerce-exterieur', 'sante', 'gestion', 'salaires',
            'education', 'lanceurs-dalerte', 'japon', 'geopolitique', 'afrique',
            'commerce', 'politiques-publiques', 'budget', 'grece', 'genre',
            'services', 'pollution', 'agriculture', 'legislatives', 'chomage',
            'graphorama', 'formation', 'budget-2025', 'territoires', 'espagne'
        ]

        special_sections = {
            'grands-formats': '/grands-formats',
            'dessin': '/dessin'
        }

        sections = {t: f'/thematiques/{t}' for t in thematiques}
        sections.update(special_sections)
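
        # Visit every section page; a failure on one section is logged and
        # skipped so the rest of the index can still be built.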
        for section_name, section_path in sections.items():
            url = f'{base_url}{section_path}'
            self.log('Analyzing section:', url)
            try:
                soup = self.index_to_soup(url)
                feed_articles = self.extract_articles(soup, base_url)
                if feed_articles:
                    display_name = section_name.replace('-', ' ').title()
                    articles.append((display_name, feed_articles[:self.max_articles_per_feed]))
            except Exception as e:
                self.log.error(f'Error processing {section_name}: {str(e)}')
                continue

        return articles
    def extract_articles(self, soup, base_url):
        feed_articles = []
        processed_urls = set()

        for link in soup.find_all('a', href=True):
            article_url = link['href']
            if self.is_article_url(article_url):
                if not article_url.startswith('http'):
                    article_url = base_url + article_url

                if article_url in processed_urls:
                    continue
                processed_urls.add(article_url)
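
                # Fetch the article page itself to read its real <h1> title;
                # slower than using the listing page, but avoids truncated
                # link text. Falls back to the link text or the URL slug.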
                try:
                    article_soup = self.index_to_soup(article_url)
                    h1_title = article_soup.find('h1', class_='o-head__title')

                    if h1_title:
                        title = h1_title.get_text().strip()
                    else:
                        title_elem = link.find('h2')
                        if title_elem:
                            title = title_elem.get_text().strip()
                        else:
                            title = link.get_text().strip()
                        if not title:
                            title = article_url.split('/')[-2].replace('-', ' ').title()

                    if title:
                        feed_articles.append({
                            'title': title,
                            'url': article_url,
                            'description': ''
                        })
                except Exception as e:
                    self.log.error(f'Error getting H1 title for {article_url}: {str(e)}')
                    continue

        return feed_articles
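
    # Keep only the article title, standfirst (chapo), date, byline and body.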
    keep_only_tags = [
        dict(name='h1', class_='o-head__title'),
        dict(name='div', class_='chapo'),
        dict(name='time', class_='o-infos__date-full'),
        dict(name='div', class_='o-page__content__who'),
        dict(name='div', class_='field-item even'),
        dict(name='div', attrs={'property': 'content:encoded'})
    ]
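
    # Strip scripts, social-sharing widgets, embeds, newsletter sign-ups and
    # kiosk/subscription promos from the article body.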
    remove_tags = [
        dict(name=['script', 'style', 'iframe', 'svg', 'audio', 'video', 'button', 'form', 'input']),
        dict(name='div', class_=[
            'c-article__social', 'social-buttons', 'social-sharing', 'social-media',
            'share-buttons', 'share-links', 'social-links', 'social-icons',
            'embedded-content', 'embed-container', 'embed-wrapper', 'media-embed',
            'twitter-embed', 'facebook-embed', 'social-embed',
            'c-kiosk--single', 'c-comments', 'c-article__toolbar',
            'c-article__related', 'c-epigraph', 'newsletter-signup',
            'twitter-tweet', 'twitter-timeline', 'twitter-follow-button',
            'c-footer__promo', 'o-page__block--offset--invert',
            'newsletter-form', 'newsletter-block', 'newsletter',
            'c-kiosk--single__content', 'c-kiosk--single__figure',
            'c-kiosk--single__body', 'c-kiosk--single__cta',
            'field-name-field-issue-cover'
        ])
    ]
    extra_css = '''
        body { line-height: 1.6; margin: 1em; }
        h1 { font-size: 1.8em; margin-bottom: 0.5em; font-weight: bold; }
        .chapo { font-style: italic; margin: 1em 0; font-size: 1.2em; }
        .o-infos__date-full { color: #666; margin: 0.5em 0; font-size: 0.9em; }
        .o-page__content__who { color: #333; margin: 0.5em 0; font-weight: bold; }
        p { margin: 0.8em 0; }

        a {
            text-decoration: none !important;
            color: inherit !important;
        }

        .o-page__figure-full {
            break-inside: avoid;
            margin: 1em 0;
            page-break-inside: avoid;
        }
        .o-page__figure-full figcaption {
            font-style: italic;
            text-align: center;
            margin-top: 0.5em;
            font-size: 0.9em;
            color: #666;
        }
    '''
    def preprocess_html(self, soup):
        # Remove unwanted tags
        for tag in soup.find_all(['script', 'style', 'iframe', 'svg', 'audio', 'video']):
            tag.decompose()

        # Clean attributes: keep only src/href/alt/title on every tag except
        # links and images, which are left untouched
        for tag in soup.find_all(True):
            if tag.name not in ['a', 'img']:
                allowed_attrs = {'src', 'href', 'alt', 'title'}
                tag.attrs = {k: v for k, v in tag.attrs.items() if k in allowed_attrs}

        return soup