#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe


class LeCanardEnchaine(BasicNewsRecipe):
    title = 'Le Canard Enchaîné'
    __author__ = 'Kabonix'
    description = 'Articles from Le Canard Enchaîné'
    language = 'fr'
    no_stylesheets = True
    remove_javascript = True

    # Credentials are supplied through calibre's recipe preferences
    needs_subscription = True

    # Fallback cover used whenever the shop page cannot be scraped
    DEFAULT_COVER = 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_robots(False)
        if self.username and self.password:
            br.open('https://www.lecanardenchaine.fr/coin/identification?u=/')
            # The form index is specific to the current login page markup
            # and may need adjusting if the site changes
            br.select_form(nr=13)
            br['_username'] = self.username
            br['_password'] = self.password
            br.submit()
        else:
            raise Exception(
                'Login credentials are required. '
                'Please configure them in the recipe preferences.')
        return br

    keep_only_tags = [
        dict(name='div', attrs={'class': ['editorial', 'article__core']}),
        dict(name='div', attrs={'class': ['non-paywall', 'paywall']}),
    ]

    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer']),
        dict(name='div', attrs={'class': [
            'social-share', 'comments', 'share-mobile',
            'article__author', 'article__tags']}),
    ]

    extra_css = '''
        body, p, div, h1, h2, h3,
        .article__subtitle, .article__chapeau, .chapeau {
            font-size: 1em !important;
            line-height: 1.5 !important;
        }
    '''

    def get_cover_url(self):
        '''Dynamically fetch the URL of the latest front page.'''
        br = self.get_browser()
        try:
            soup = self.index_to_soup(
                br.open('https://boutique.lecanardenchaine.fr/acheter-au-numero/').read())
            list_item = soup.find('li', {'class': 'list-item'})
            if list_item:
                img = list_item.find('img')
                if img and img.get('srcset'):
                    # srcset holds "url width" pairs; take the first URL
                    return 'https://boutique.lecanardenchaine.fr' + img['srcset'].split()[0]
                elif img and img.get('src'):
                    return 'https://boutique.lecanardenchaine.fr' + img['src']
            self.log.info('No cover found, falling back to the default image')
            return self.DEFAULT_COVER
        except Exception:
            self.log.exception('Error while fetching the cover')
            return self.DEFAULT_COVER

    SECTIONS = {
        'Politique': '/politique/',
        'Économie': '/economie/',
        'International': '/international/',
        'Défense': '/defense/',
        'Société': '/societe/',
        'Police-Justice': '/police-justice/',
        'Santé': '/sante/',
        'Éducation': '/education/',
        'Environnement': '/environnement/',
        'Technologie-Sciences': '/technologie-sciences/',
        'Culture-Idées': '/culture-idees/',
        'Médias': '/medias/',
        'Sport': '/sport/',
        'Social': '/social/',
        'Brèves': '/breves/',
    }

    def parse_index(self):
        br = self.get_browser()
        feeds = []
        for section_title, section_url in self.SECTIONS.items():
            self.log('Exploring section:', section_title)
            articles = []
            try:
                url = 'https://www.lecanardenchaine.fr' + section_url
                raw = br.open(url).read()
                soup = self.index_to_soup(raw)
                for link in soup.findAll('a', href=True):
                    href = link.get('href', '')
                    # Keep only links of the form /<section>/<article-slug>
                    if section_url[1:-1] in href and href.count('/') == 2:
                        title = link.get_text().strip()
                        if title:
                            if not href.startswith('http'):
                                href = 'https://www.lecanardenchaine.fr' + href
                            articles.append({
                                'title': title,
                                'url': href,
                                'description': ''
                            })
                # Drop duplicate URLs while preserving order
                seen_urls = set()
                unique_articles = []
                for article in articles:
                    if article['url'] not in seen_urls:
                        seen_urls.add(article['url'])
                        unique_articles.append(article)
                if unique_articles:
                    feeds.append((section_title, unique_articles))
                    self.log(f'  {len(unique_articles)} articles found')
            except Exception as e:
                self.log.error(f'Error in section {section_title}: {e}')
        return feeds
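    # Debugging aid, a minimal sketch using BasicNewsRecipe's standard
    # 'test' attribute: uncomment to limit a trial run to two articles
    # from each of two sections instead of fetching the full issue.
    # test = (2, 2)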
    def preprocess_html(self, soup):
        # Clear the paywall markers so the unlocked content renders normally
        for div in soup.findAll('div', attrs={'class': ['unlocked', 'paywall']}):
            div['class'] = ''
        return soup

    def postprocess_html(self, soup, first_fetch):
        # Strip every attribute except href, src and class
        for tag in soup.findAll(True):
            for attr in list(tag.attrs):
                if attr not in ['href', 'src', 'class']:
                    del tag[attr]
        return soup
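
# A minimal usage sketch, assuming calibre is installed and this file is
# saved as 'le_canard_enchaine.recipe' (hypothetical filename). ebook-convert
# forwards --username/--password to recipes that set needs_subscription:
#
#   ebook-convert le_canard_enchaine.recipe canard.epub \
#       --username you@example.com --password 'secret'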