	Frieze Magazine by Kabonix
This commit is contained in:
parent 6155d07b1d
commit 27c5a12ed4

recipes/frieze.recipe (new file, 234 additions)
@@ -0,0 +1,234 @@
#!/usr/bin/env python
'''
frieze.com - Contemporary art magazine
'''
import re
from datetime import datetime
from zoneinfo import ZoneInfo

from calibre.web.feeds.news import BasicNewsRecipe


class FriezeMagazineRecipe(BasicNewsRecipe):
    title = 'Frieze Magazine'
    __author__ = 'Kabonix'
    description = 'International contemporary art magazine'
    language = 'en'
    oldest_article = 60
    max_articles_per_feed = 50
    auto_cleanup = False
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    scale_news_images_to_device = True

    base_url = 'https://www.frieze.com'

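    # Note: these bare attribute dicts appear to be handed straight to
    # BeautifulSoup as filters; if matching ever misbehaves, the more common
    # calibre idiom is dict(attrs={'class': '...'}) for each entry.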
    keep_only_tags = [
        {'class': 'article-header-container'},
        {'class': 'article-header-title'},
        {'class': 'article-header-headline'},
        {'class': 'article-header-author'},
        {'class': 'body-text'},
        {'class': 'body-field'}
    ]

    remove_tags = [
        {'class': 'social-share-container'},
        {'class': 'share-buttons'},
        {'class': 'article-header-social-responsive'},
        {'class': 'newsletter-subscribe-container'},
        {'class': 'suggested-articles-container'},
        {'class': 'article-footer-container'},
        {'class': 'ad-surround'},
        {'id': re.compile(r'ad-.*')},
        {'class': 'hidden'}
    ]

    extra_css = '''
        img { max-width: 100%; height: auto; display: block; margin: 1em auto; }
        h1 { font-size: 2em; margin: 1em 0; font-weight: bold; }
        h2 { font-size: 1.5em; margin: 1em 0; font-weight: bold; }
        p { font-size: 1.1em; line-height: 1.6; margin-bottom: 1em; }
        figcaption { font-style: italic; font-size: 0.9em; color: #555; text-align: center; }
        em { font-style: italic; }
        .article-header-title { margin-bottom: 0.5em; }
        .article-header-headline { margin-bottom: 1.5em; }
        .article-header-author { font-size: 0.9em; margin-bottom: 2em; color: #555; }
    '''

    def parse_index(self):
        magazine_url = self.base_url + '/magazines/frieze-magazine'
        soup = self.index_to_soup(magazine_url)

        # Look for the latest issue as robustly as possible:
        # try several plausible selectors for the issue listing
        issue_containers = soup.find_all('div', {'class': re.compile(r'teaser-search-col')})

        # If nothing matched the specific class, widen the search
        if not issue_containers:
            self.log.warning('Widening the search for magazine issues')
            issue_containers = soup.find_all('div', {'class': re.compile(r'teaser')})

        # Pick the first (most recent) magazine issue among the containers
        latest_issue = None
        for container in issue_containers:
            # Check that this container links to an issue, not an article
            if container.find('a', href=re.compile(r'/magazines/|/issues/')):
                latest_issue = container
                break

        if not latest_issue:
            self.log.warning('No specific issue found, using the main magazine page')
            issue_url = magazine_url
        else:
            issue_link = latest_issue.find('a')
            if issue_link and issue_link.get('href'):
                issue_url = self.base_url + issue_link['href'] if issue_link['href'].startswith('/') else issue_link['href']
                self.log.info(f'Latest issue found: {issue_url}')
            else:
                issue_url = magazine_url
                self.log.warning('Issue link not found')

        self.log.info(f'Opening issue: {issue_url}')
        issue_soup = self.index_to_soup(issue_url)

        # Extract the cover image, trying several selectors in turn
        cover_image = None

        # Method 1: look in the magazine header section
        cover_div = issue_soup.find('div', class_=re.compile(r'magazine-header-image|issue-header-image'))
        if cover_div:
            img_tag = cover_div.find('img')
            if img_tag and img_tag.get('src'):
                cover_image = img_tag['src']

        # Method 2: look in the main section
        if not cover_image:
            main_section = issue_soup.find('section', class_=re.compile(r'main|content'))
            if main_section:
                img_tag = main_section.find('img')
                if img_tag and img_tag.get('src'):
                    cover_image = img_tag['src']

        # Method 3: generic search for a large image near the top of the page
        if not cover_image:
            for img in issue_soup.find_all('img', src=True)[:5]:  # only the first 5 images
                if 'cover' in img.get('src', '').lower() or 'header' in img.get('src', '').lower():
                    cover_image = img['src']
                    break

        if cover_image:
            self.cover_url = cover_image if cover_image.startswith('http') else self.base_url + cover_image
            self.log.info(f'Cover found: {self.cover_url}')
        else:
            self.log.warning('No cover image found')

        # Extract the issue title
        issue_title = None
        for heading in issue_soup.find_all(['h1', 'h2']):
            if 'issue' in heading.text.lower() or 'magazine' in heading.text.lower():
                issue_title = heading.text.strip()
                break

        if not issue_title:
            # Fall back to a more general search
            for heading in issue_soup.find_all(['h1', 'h2'])[:3]:  # only the first 3 headings
                issue_title = heading.text.strip()
                if issue_title:
                    break

        if not issue_title:
            issue_title = 'Frieze Magazine - Latest Issue'

        # Extract the articles
        articles = []

        # Try the various article-teaser markup patterns
        article_containers = issue_soup.find_all('div', {'class': re.compile(r'teaser-content|article-teaser|article-item')})

        if not article_containers:
            # Search more broadly if the specific containers are not found
            article_containers = issue_soup.find_all('div', {'class': re.compile(r'teaser|article')})

        for container in article_containers:
            article_link = container.find('a')
            if not article_link or not article_link.get('href'):
                continue

            article_url = article_link['href']
            if article_url.startswith('/'):
                article_url = self.base_url + article_url

            # Extract the title, allowing for several possible classes
            title_element = container.find(['div', 'h2', 'h3', 'h4'], {'class': re.compile(r'title|heading')})
            title = title_element.text.strip() if title_element else 'Untitled'

            # Extract the description
            desc_element = container.find(['div', 'p'], {'class': re.compile(r'deck|description|summary|excerpt')})
            description = desc_element.text.strip() if desc_element else ''

            # Extract the author
            author_element = container.find(['div', 'span'], {'class': re.compile(r'author|byline')})
            if author_element:
                author_links = author_element.find_all('a')
                if author_links:
                    author = ', '.join([a.text.strip() for a in author_links])
                else:
                    author = author_element.text.strip()
            else:
                author = ''

            # Make sure this is a real article URL, not an internal link
            if '/article/' in article_url or '/feature/' in article_url or '/review/' in article_url:
                articles.append({
                    'title': title,
                    'url': article_url,
                    'description': description,
                    'author': author,
                    'date': datetime.now(ZoneInfo('Europe/Paris')).strftime('%Y-%m-%d')
                })
                self.log.debug(f'Article found: {title} - {article_url}')

        self.log.info(f'Number of articles found: {len(articles)}')
        return [(issue_title, articles)]

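    # By the time calibre asks for the cover, parse_index has normally already
    # run, so self.cover_url is either set there or absent (hence the getattr).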
    def get_cover_url(self):
        return getattr(self, 'cover_url', None)

    def preprocess_html(self, soup):
        # Remove leftover share widgets and ads
        for element in soup.find_all(class_=lambda c: c and ('share' in c or 'ad-' in c)):
            element.decompose()

        for tag in soup.find_all(['script', 'style']):
            tag.decompose()

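        # Note on the filter above: Beautiful Soup applies a function passed as
        # a class filter to each class token separately, so multi-class tags
        # (e.g. 'share-buttons sticky') are still caught.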
        # Optimise the images
        for img in soup.find_all('img'):
            # Keep only the attributes that matter
            src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
            alt = img.get('alt', '')

            # Reset all attributes
            img.attrs = {}

            # Re-apply only src and alt
            if src:
                img['src'] = src
            if alt:
                img['alt'] = alt

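        # (Resetting the attributes above also drops srcset/data-* lazy-load
        # leftovers, which some e-book readers render as broken images.)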
        # Inline the header metadata styles
        for class_name, style in [
            ('article-header-title', 'font-size: 2em; font-weight: bold;'),
            ('article-header-headline', 'font-style: italic;'),
            ('article-header-author', 'color: #555;')
        ]:
            element = soup.find('div', class_=class_name)
            if element:
                element.attrs.clear()
                element['style'] = style

        return soup
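To try the recipe locally (the standard calibre recipe-development workflow, not part of this commit), run it through ebook-convert in test mode, which downloads only a couple of articles:

    ebook-convert frieze.recipe frieze.epub --test -vv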