# Mirror of https://github.com/kovidgoyal/calibre.git
# Synced 2025-09-29 15:31:08 -04:00
# 140 lines, 5.1 KiB, Python
#!/usr/bin/env python
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class LeCanardEnchaine(BasicNewsRecipe):
    '''
    Calibre recipe for the French satirical weekly "Le Canard Enchaîné".

    Requires a valid subscription: calibre prompts for the username/password
    (``needs_subscription``) and the recipe logs in before fetching articles.
    '''

    title = 'Le Canard Enchaîné'
    author = 'Kabonix'
    description = 'Articles du Canard Enchaîné'
    language = 'fr'
    no_stylesheets = True
    remove_javascript = True

    # Tell calibre to ask the user for credentials (used in get_browser()).
    needs_subscription = True

    def get_browser(self):
        '''Return a browser logged in to lecanardenchaine.fr.

        Raises an Exception if no credentials were configured, since the
        site content is behind a paywall.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_robots(False)

        if self.username and self.password:
            br.open('https://www.lecanardenchaine.fr/coin/identification?u=/')
            # NOTE(review): selecting the form by index is fragile — this
            # breaks silently if the login page adds/removes forms. Confirm
            # index 13 is still the login form when the site changes.
            br.select_form(nr=13)
            br['_username'] = self.username
            br['_password'] = self.password
            br.submit()
        else:
            raise Exception('Les identifiants de connexion sont requis. Veuillez les configurer dans les préférences de la recette.')

        return br

    # Keep only the article body containers; drop everything else.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['editorial', 'article__core']}),
        dict(name='div', attrs={'class': ['non-paywall', 'paywall']})
    ]

    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer']),
        dict(name='div', attrs={'class': ['social-share', 'comments', 'share-mobile', 'article__author', 'article__tags']})
    ]

    # Normalize font sizing so the e-book reader controls text scale.
    extra_css = '''
    body, p, div, h1, h2, h3,
    .article__subtitle, .article__chapeau, .chapeau {
        font-size: 1em !important;
        line-height: 1.5 !important;
    }
    '''

    def get_cover_url(self):
        '''Récupère dynamiquement l'URL de la dernière une.

        Scrapes the shop's per-issue page for the latest cover image,
        preferring the first ``srcset`` candidate over ``src``. Falls back
        to a static default image on any failure.
        '''
        br = self.get_browser()
        try:
            soup = self.index_to_soup(br.open('https://boutique.lecanardenchaine.fr/acheter-au-numero/').read())

            list_item = soup.find('li', {'class': 'list-item'})
            if list_item:
                img = list_item.find('img')
                if img and img.get('srcset'):
                    # srcset is "url width, url width, ..." — take the first URL.
                    return 'https://boutique.lecanardenchaine.fr' + img['srcset'].split()[0]
                elif img and img.get('src'):
                    return 'https://boutique.lecanardenchaine.fr' + img['src']

            self.log.info("Aucune couverture trouvée, utilisation de l'image par défaut")
            return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'
        except Exception:
            self.log.exception('Erreur lors de la récupération de la couverture')
            return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'

    # Section name -> site path, used by parse_index() below.
    SECTIONS = {
        'Politique': '/politique/',
        'Économie': '/economie/',
        'International': '/international/',
        'Défense': '/defense/',
        'Société': '/societe/',
        'Police-Justice': '/police-justice/',
        'Santé': '/sante/',
        'Éducation': '/education/',
        'Environnement': '/environnement/',
        'Technologie-Sciences': '/technologie-sciences/',
        'Culture-Idées': '/culture-idees/',
        'Médias': '/medias/',
        'Sport': '/sport/',
        'Social': '/social/',
        'Brèves': '/breves/'
    }

    def parse_index(self):
        '''Build the feed list by scraping article links from each section page.

        Returns a list of ``(section_title, [article_dict, ...])`` tuples,
        keeping the first occurrence of each article URL. Sections that
        fail to load are logged and skipped.
        '''
        br = self.get_browser()
        feeds = []

        for section_title, section_url in self.SECTIONS.items():
            # Use calibre's logger (consistent with get_cover_url), not print().
            self.log.info(f'Exploration de la rubrique : {section_title}')
            articles = []
            seen_urls = set()
            try:
                url = 'https://www.lecanardenchaine.fr' + section_url
                raw = br.open(url).read()
                soup = self.index_to_soup(raw)

                for link in soup.findAll('a', href=True):
                    href = link.get('href', '')
                    # Article links look like /<section>/<slug> — exactly two
                    # slashes — and must mention the section slug.
                    if section_url[1:-1] in href and href.count('/') == 2:
                        title = link.get_text().strip()
                        if title:
                            if not href.startswith('http'):
                                href = 'https://www.lecanardenchaine.fr' + href
                            # Deduplicate inline, keeping the first title seen
                            # for each URL (same result as a second pass).
                            if href not in seen_urls:
                                seen_urls.add(href)
                                articles.append({
                                    'title': title,
                                    'url': href,
                                    'description': ''
                                })

                if articles:
                    feeds.append((section_title, articles))
                    self.log.info(f' {len(articles)} articles trouvés')

            except Exception as e:
                self.log.error(f'Erreur sur {section_title}: {e}')

        return feeds

    def preprocess_html(self, soup):
        '''Strip paywall class markers so their content is not hidden by CSS.'''
        for div in soup.findAll('div', attrs={'class': ['unlocked', 'paywall']}):
            div['class'] = ''
        return soup

    def postprocess_html(self, soup, first_fetch):
        '''Drop all tag attributes except href/src/class to clean the output.'''
        for tag in soup.findAll(True):
            # list() copy: we delete from tag.attrs while iterating.
            for attr in list(tag.attrs):
                if attr not in ['href', 'src', 'class']:
                    del tag[attr]
        return soup