	Update Mediapart
This commit is contained in:
parent b55dd98bce
commit d64423a95f
@@ -9,6 +9,8 @@
# ( cover image format is changed to .jpeg)
# 14 Jan 2021 - Add Mediapart Logo url as masthead_url and change cover
#   by overlaying the date on top of the Mediapart cover
# 22 Mar 2023 - Switch to Google feeds

from __future__ import unicode_literals

__license__ = 'GPL v3'
@@ -17,235 +19,74 @@ __copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from:
Mediapart
'''

import re
from datetime import date, datetime, timezone, timedelta
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe, classes

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Loïc Houpert'
    __author__ = 'Loïc Houpert, unkn0wn'
    description = 'Global news in French from news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', **classes('author')),
        classes('news__heading__top__intro news__body__center__article')
        classes(
            'news__heading__top news__heading__center news__body__center__article'
        )
    ]

    remove_tags = [
        classes('login-subscribe print-source_url'),
        classes('action-links media--rich read-also login-subscribe print-source_url'),
        dict(name='svg'),
    ]

    conversion_options = {'smarten_punctuation': True}

    masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
    # cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'

    # --
    ignore_duplicate_articles = {'title'}
    resolve_internal_links  = True
    remove_empty_feeds = True

    articles_are_obfuscated = True

    # Get date in french time zone format
    today = datetime.now(timezone.utc) + timedelta(hours=1)
    oldest_article_date = today - timedelta(days=oldest_article)
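
A quick worked example of the cutoff above (standalone sketch; only the two-day window from oldest_article = 2 is taken from the recipe):

from datetime import datetime, timedelta, timezone

today = datetime.now(timezone.utc) + timedelta(hours=1)      # rough French local time
oldest_article_date = today - timedelta(days=2)              # oldest_article = 2
three_days_ago = today - timedelta(days=3)
print(three_days_ago < oldest_article_date)                  # True, so such an article is skipped
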
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        link = soup.find('a', href=True)
        skip_sections =[ # add sections you want to skip
            '/video/', '/videos/', '/media/'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.log('Aborting Article ', link['href'])
            self.abort_article('skipping video links')

    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
        self.log('Downloading ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name
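
Because the split view interleaves lines removed from the old recipe (the feeds = [...] block above) with the newly added method, the added get_obfuscated_article reads roughly as the sketch below once the removed lines are set aside; the ordering is reconstructed from the hunk, not copied from the final file:

def get_obfuscated_article(self, url):
    br = self.get_browser()
    try:
        br.open(url)                        # the Google News entry for the article
    except Exception as e:
        url = e.hdrs.get('location')        # follow the redirect target instead
    soup = self.index_to_soup(url)
    link = soup.find('a', href=True)        # first outgoing link on that page
    skip_sections = ['/video/', '/videos/', '/media/']
    if any(x in link['href'] for x in skip_sections):
        self.abort_article('skipping video links')
    self.log('Downloading ', link['href'])
    html = br.open(link['href']).read()     # fetch the real mediapart.fr article
    pt = PersistentTemporaryFile('.html')   # hand calibre a local copy to parse
    pt.write(html)
    pt.close()
    return pt.name
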

    feeds = []

    sections = [
        'france', 'international', 'economie', 'culture-idees', 'politique', 'ecologie', 'fil-dactualites'
    ]

    # The feed at 'http://www.mediapart.fr/articles/feed' only displayed the 10
    # most recent entries, so the articles were indexed from dedicated section pages
    # in the function my_parse_index. There the articles are parsed by the function
    # get_articles, driven by the entries of dict_article_sources.

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):

        dict_article_sources = [
            {
                'type': 'Brèves',
                'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
                'separador': {
                    'page': 'ul',
                    'thread': 'li'
                }
            },
            {
                'type': 'International',
                'webpage': 'https://www.mediapart.fr/journal/international',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'France',
                'webpage': 'https://www.mediapart.fr/journal/france',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Économie',
                'webpage': 'https://www.mediapart.fr/journal/economie',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Culture',
                'webpage': 'https://www.mediapart.fr/journal/culture-idees',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
        ]

        def get_articles(
            type_of_article, webpage, separador_page='ul', separador_thread='li'
        ):

            specific_articles = []

            webpage_article = []
            soup = self.index_to_soup(webpage)
            page = soup.find('main', {'class': 'global-wrapper'})
            if page is None:
                page = soup.find('section', {'class': 'news__body-wrapper mb-800'})
            fils = page.find(separador_page, {'class': 'post-list universe-journal'})
            if fils is None:
                fils = page.find(separador_page, {'class': 'news__list__content _hasNewsletter'})

            all_articles = fils.findAll(separador_thread)
            for article in all_articles:
                try:
                    # title = article.find('h3', recursive=False)
                    title = article.find('h3', recursive=True)
                    if title is None or ''.join(title['class']) == 'title-specific':
                        # print(f"[BAD title entry] Print value of title:\n {title}")
                        continue
                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")

                    try:
                        article_mot_cle = article.find(
                            'a', {
                                'href': re.compile(r'.*\/mot-cle\/.*')
                            }
                        ).renderContents().decode('utf-8')
                    except Exception:
                        article_mot_cle = ''

                    try:
                        article_type = article.find(
                            'a', {
                                'href': re.compile(r'.*\/type-darticles\/.*')
                            }
                        ).renderContents().decode('utf-8')
                    except Exception:
                        article_type = ''

                    for s in title('span'):
                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
                    url = title.find('a', href=True)['href']

                    date = article.find('time', datetime=True)['datetime']
                    article_date = datetime.strptime(date, '%Y-%m-%d')
                    # Add French timezone to date of the article for date check
                    article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
                    if article_date < self.oldest_article_date:
                        print("article_date < self.oldest_article_date\n")
                        continue

                    # print("-------- Recent article added to the list ------- \n")
                    all_authors = article.findAll(
                        # 'a', {'class': re.compile(r'\bjournalist\b')}
                        'div', {'class': 'teaser__signature'}
                    )
                    if not all_authors:
                        all_authors = article.findAll(
                            'a', {'class': re.compile(r'\bjournalist\b')}
                        )
                    authors = [self.tag_to_string(a) for a in all_authors]
                    # print(f"Authors in tag <a>: {authors}")

                    # If no link to the author profile is available, the
                    # HTML separator is a span tag
                    if not all_authors:
                        try:
                            all_authors = article.findAll(
                                'span', {'class': re.compile(r'\bjournalist\b')}
                            )
                            authors = [self.tag_to_string(a) for a in all_authors]
                            # print(f"Authors in tag <span>: {authors}")
                        except:
                            authors = 'unknown'

                    description = article.find('p').renderContents().decode('utf-8')
                    # print(f" <p> in article : {self.tag_to_string(description).strip()} ")

                    summary = {
                        'title': self.tag_to_string(title).strip(),
                        'description': description,
                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
                        'author': ', '.join(authors),
                        'article_type': article_type,
                        'mot_cle': article_mot_cle.capitalize(),
                        'url': 'https://www.mediapart.fr' + url,
                    }
                    if webpage_article:
                        if summary['url'] != webpage_article[-1]['url']:
                            webpage_article.append(summary)
                    else:
                        webpage_article.append(summary)
                except Exception:
                    pass

            specific_articles += [(type_of_article,
                                   webpage_article)] if webpage_article else []
            return specific_articles

        articles = []

        for category in dict_article_sources:
            articles += get_articles(
                category['type'], category['webpage'], category['separador']['page'],
                category['separador']['thread']
            )

        return articles

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with
    # french locale)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(
            day=int(date_arr[0]),
            year=int(date_arr[2]),
            month=[
                None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
                'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
            ].index(date_arr[1])
        )
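
The (removed) helper above maps a French date string straight to a datetime.date; a minimal standalone sketch of the same lookup:

from datetime import date

months = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
          'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']
parts = '14 janvier 2021'.lower().split()   # ['14', 'janvier', '2021']
print(date(day=int(parts[0]), month=months.index(parts[1]), year=int(parts[2])))  # 2021-01-14
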
    for sec in sections:
        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:mediapart.fr%2Fjournal{}&hl=fr-FR&gl=FR&ceid=FR:fr'
        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
    feeds.append(('Autres', a.format('')))
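
For instance, with sec = 'france' the loop above expands the template as follows (plain string formatting, nothing assumed beyond the lines shown); the trailing feeds.append(('Autres', a.format(''))) keeps a catch-all query with no section filter:

a = 'https://news.google.com/rss/search?q=when:27h+allinurl:mediapart.fr%2Fjournal{}&hl=fr-FR&gl=FR&ceid=FR:fr'
print(a.format('%2F' + 'france' + '%2F'))
# https://news.google.com/rss/search?q=when:27h+allinurl:mediapart.fr%2Fjournal%2Ffrance%2F&hl=fr-FR&gl=FR&ceid=FR:fr
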

    def get_browser(self):
        # -- Handle login
@@ -298,7 +139,7 @@ class Mediapart(BasicNewsRecipe):
            p.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            font.setPointSize(78)
            font.setPointSize(72)
            p.setFont(font)
            r = QRect(0, 600, 744,100)
            p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, date)
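
The changed lines only drop the date font from 78 to 72 points; for context, a minimal sketch of the overlay technique they belong to, assuming calibre's qt.core wrapper and a hypothetical local cover file (illustrative only, not the recipe's actual cover code):

from qt.core import QColor, QFont, QImage, QPainter, QPen, QRect, Qt

img = QImage('mediapart_cover.jpeg')        # hypothetical cover image on disk
p = QPainter(img)
p.setPen(QPen(QColor('white')))
font = QFont()
font.setFamily('Times')
font.setPointSize(72)                       # the new, smaller size
p.setFont(font)
r = QRect(0, 600, 744, 100)                 # band where the date is stamped
p.drawText(r, Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, '22 mars 2023')
p.end()
img.save('mediapart_cover_dated.jpeg')
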
@@ -329,4 +170,4 @@ class Mediapart(BasicNewsRecipe):
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True
        return True