#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe:
# 1) Summaries of the articles are now available
# 2) Additional sections International, France, Economie and Culture have
# been added through custom entries in the function my_parse_index.
# 3) Fix the cover image so it doesn't disappear from the Kindle menu
# (cover image format is changed to .jpeg)
# 14 Jan 2021 - Add Mediapart logo URL as masthead_url and change the cover
# by overlaying the date on top of the Mediapart cover
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2021, Loïc Houpert. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert'  # noqa

'''
Mediapart
'''

import re
from datetime import date, datetime, timezone, timedelta

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    # Build a BeautifulSoup attrs matcher selecting tags that carry any of
    # the given space-separated CSS classes.
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Loïc Houpert'
    description = 'Global news in French from the news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', **classes('author')),
        classes('introduction content-article')
    ]
    remove_tags = [classes('login-subscribe print-source_url')]
    conversion_options = {'smarten_punctuation': True}

    masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
    # cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'

    # --

    # Get the current date in the French time zone (fixed UTC+1 offset)
    today = datetime.now(timezone.utc) + timedelta(hours=1)
    oldest_article_date = today - timedelta(days=oldest_article)

    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    # The feed at 'http://www.mediapart.fr/articles/feed' only exposes the 10
    # most recent elements, so additional articles are indexed from their own
    # section pages in the function my_parse_index. In that function the
    # articles are parsed by the function get_articles, driven by the entries
    # of dict_article_sources.
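    # For reference, my_parse_index returns the structure documented for
    # BasicNewsRecipe.parse_index and consumed by feeds_from_index: a list of
    # (section title, list of article dicts) tuples. A sketch of the shape,
    # with purely illustrative values:
    #
    #   [
    #       ('International', [
    #           {'title': '...',
    #            'url': 'https://www.mediapart.fr/journal/international/...',
    #            'date': '...',
    #            'description': '...',
    #            'author': '...'},
    #       ]),
    #       ('France', [...]),
    #   ]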

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        dict_article_sources = [
            {
                'type': 'Brèves',
                'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
                'separador': {
                    'page': 'ul',
                    'thread': 'li'
                }
            },
            {
                'type': 'International',
                'webpage': 'https://www.mediapart.fr/journal/international',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'France',
                'webpage': 'https://www.mediapart.fr/journal/france',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Économie',
                'webpage': 'https://www.mediapart.fr/journal/economie',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Culture',
                'webpage': 'https://www.mediapart.fr/journal/culture-idees',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
        ]

        def get_articles(
            type_of_article, webpage, separador_page='ul', separador_thread='li'
        ):
            specific_articles = []

            webpage_article = []
            soup = self.index_to_soup(webpage)
            page = soup.find('main', {'class': 'global-wrapper'})
            fils = page.find(separador_page, {'class': 'post-list universe-journal'})

            all_articles = fils.findAll(separador_thread)
            for article in all_articles:
                try:
                    title = article.find('h3', recursive=False)
                    if title is None or ''.join(title['class']) == 'title-specific':
                        # print(f"[BAD title entry] Print value of title:\n {title}")
                        continue
                    # print(f"\n[OK title entry] Print value of title:\n {title}\n")

                    try:
                        article_mot_cle = article.find(
                            'a', {
                                'href': re.compile(r'.*\/mot-cle\/.*')
                            }
                        ).renderContents().decode('utf-8')
                    except Exception:
                        article_mot_cle = ''

                    try:
                        article_type = article.find(
                            'a', {
                                'href': re.compile(r'.*\/type-darticles\/.*')
                            }
                        ).renderContents().decode('utf-8')
                    except Exception:
                        article_type = ''

                    for s in title('span'):
                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")

                    url = title.find('a', href=True)['href']

                    date = article.find('time', datetime=True)['datetime']
                    article_date = datetime.strptime(date, '%Y-%m-%d')
                    # Add the French timezone to the article date for the date check
                    article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
                    if article_date < self.oldest_article_date:
                        print("article_date < self.oldest_article_date\n")
                        continue

                    # print("-------- Recent article added to the list ------- \n")
                    all_authors = article.findAll(
                        'a', {'class': re.compile(r'\bjournalist\b')}
                    )
                    authors = [self.tag_to_string(a) for a in all_authors]
                    # print(f"Authors in tag <a>: {authors}")

                    # If no link to the author profile is available, the
                    # HTML separator is a <span> tag
                    if not all_authors:
                        try:
                            all_authors = article.findAll(
                                'span', {'class': re.compile(r'\bjournalist\b')}
                            )
                            authors = [self.tag_to_string(a) for a in all_authors]
                            # print(f"Authors in tag <span>: {authors}")
                        except Exception:
                            authors = 'unknown'

                    description = article.find('p').renderContents().decode('utf-8')
                    # print(f"<p> in article : {self.tag_to_string(description).strip()} ")

                    summary = {
                        'title': self.tag_to_string(title).strip(),
                        'description': description,
                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
                        'author': ', '.join(authors),
                        'article_type': article_type,
                        'mot_cle': article_mot_cle.capitalize(),
                        'url': 'https://www.mediapart.fr' + url,
                    }

                    webpage_article.append(summary)
                except Exception:
                    pass

            specific_articles += [(type_of_article,
                                   webpage_article)] if webpage_article else []
            return specific_articles

        articles = []

        for category in dict_article_sources:
            articles += get_articles(
                category['type'], category['webpage'],
                category['separador']['page'],
                category['separador']['thread']
            )

        return articles

    # Non-locale-specific date parse (strptime("%d %b %Y", s) would work with
    # a French locale), e.g.
    # parse_french_date('14 janvier 2021') -> datetime.date(2021, 1, 14)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(
            day=int(date_arr[0]),
            year=int(date_arr[2]),
            month=[
                None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
                'juillet', 'août', 'septembre', 'octobre', 'novembre',
                'décembre'
            ].index(date_arr[1])
        )

    def get_browser(self):
        # -- Handle login

        def is_form_login(form):
            return "id" in form.attrs and form.attrs['id'] == "logFormEl"

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(predicate=is_form_login)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def default_cover(self, cover_file):
        '''
        Create a generic cover for recipes that don't have a cover
        '''
        from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
        from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data

        def init_environment():
            ensure_app()
            load_builtin_fonts()

        def create_cover_mediapart(date):
            'Create a cover for Mediapart by overlaying the date on the Mediapart cover'
            init_environment()
            # Get the base cover image
            image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
            data = self.index_to_soup(image_url, raw=True)
            # Get date and hour corresponding to the French time zone
            today = datetime.now(timezone.utc) + timedelta(hours=1)
            wkd = today.weekday()
            french_weekday = {
                0: 'Lun', 1: 'Mar', 2: 'Mer', 3: 'Jeu', 4: 'Ven', 5: 'Sam', 6: 'Dim'
            }
            day = french_weekday[wkd] + '.'
            date = day + ' ' + today.strftime('%d %b. %Y')
            edition = today.strftime('Édition de %Hh')

            # Load the cover data
            img = QImage()
            img.loadFromData(data)

            # Overlay the date on the cover
            p = QPainter(img)
            pen = QPen(Qt.black)
            pen.setWidth(6)
            p.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            font.setPointSize(78)
            p.setFont(font)
            r = QRect(0, 600, 744, 100)
            p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, date)
            p.end()

            # Overlay the edition information on the cover
            p = QPainter(img)
            pen = QPen(Qt.black)
            pen.setWidth(4)
            p.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            font.setItalic(True)
            font.setPointSize(66)
            p.setFont(font)
            r = QRect(0, 720, 744, 100)
            p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, edition)
            p.end()

            return pixmap_to_data(img)

        try:
            today = datetime.today()
            date = today.strftime('%d %b %Y')
            img_data = create_cover_mediapart(date)
            cover_file.write(img_data)
            cover_file.flush()
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True
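
# A quick way to test changes to this recipe is to run it through calibre's
# ebook-convert with its recipe input options; the file name and credentials
# below are placeholders for your own recipe copy and Mediapart account:
#
#   ebook-convert Mediapart.recipe output.epub --test -vv \
#       --username you@example.com --password yourpassword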