mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
317 lines
12 KiB
Python
317 lines
12 KiB
Python
#!/usr/bin/env python
|
|
# vim:fileencoding=utf-8
|
|
#
|
|
# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe:
|
|
#   1) Summaries of the articles are now available
|
|
# 2) Additional sections International, France, Economie and Culture have
|
|
# been added through custom entries in the function my_parse_index.
|
|
#   3) Fix the cover image so it doesn't disappear from the Kindle menu
|
|
# ( cover image format is changed to .jpeg)
|
|
# 14 Jan 2021 - Add Mediapart Logo url as masthead_url and change cover
|
|
# by overlaying the date on top of the Mediapart cover
|
|
from __future__ import unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
|
|
'''
|
|
Mediapart
|
|
'''
|
|
|
|
import re
|
|
from datetime import date, datetime, timezone, timedelta
|
|
from calibre.web.feeds import feeds_from_index
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
def classes(classes):
    '''
    Build a tag-matching spec that selects elements by CSS class.

    :param classes: space separated class names.
    :return: ``{'attrs': {'class': predicate}}``. The predicate receives an
        element's ``class`` attribute value; a falsy value is returned
        unchanged, otherwise the result is the set of wanted class names
        found in it (empty, hence falsy, when none match).
    '''
    wanted = frozenset(classes.split(' '))

    def shares_a_class(value):
        # Elements without a class attribute (None / '') never match.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_a_class}}
|
|
|
|
|
|
class Mediapart(BasicNewsRecipe):
    '''
    Recipe for the French news site Mediapart (subscription required).

    Besides the "La Une" RSS feed, several section pages of the site are
    scraped (see ``my_parse_index``) and the cover is generated by painting
    the current date on top of the Mediapart cover image
    (see ``default_cover``).
    '''

    title = 'Mediapart'
    __author__ = 'Loïc Houpert'
    description = 'Global news in French from news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    # Maximum age of an article, in days (used for oldest_article_date below).
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', **classes('author')),
        classes('introduction content-article'),
    ]
    remove_tags = [classes('login-subscribe print-source_url')]
    conversion_options = {'smarten_punctuation': True}

    masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
    # Static cover, kept for reference; the cover is built by default_cover().
    # cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'

    # "French time" is approximated as UTC shifted by a fixed +1 hour.
    # NOTE(review): this ignores daylight saving time, and both values are
    # computed once, when the class body is executed.
    today = datetime.now(timezone.utc) + timedelta(hours=1)
    oldest_article_date = today - timedelta(days=oldest_article)

    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    # The feed at 'http://www.mediapart.fr/articles/feed' only displays the
    # 10 last elements, so additional articles are indexed from the section
    # pages below by my_parse_index().
    # Each entry: (section title, page URL, tag of the article list,
    # tag of one article entry inside that list).
    _SECTIONS = (
        ('Brèves', 'https://www.mediapart.fr/journal/fil-dactualites', 'ul', 'li'),
        ('International', 'https://www.mediapart.fr/journal/international', 'div', 'div'),
        ('France', 'https://www.mediapart.fr/journal/france', 'div', 'div'),
        ('Économie', 'https://www.mediapart.fr/journal/economie', 'div', 'div'),
        ('Culture', 'https://www.mediapart.fr/journal/culture-idees', 'div', 'div'),
    )

    _SITE_ROOT = 'https://www.mediapart.fr'
    _KEYWORD_HREF = re.compile(r'.*\/mot-cle\/.*')
    _ARTICLE_TYPE_HREF = re.compile(r'.*\/type-darticles\/.*')
    _JOURNALIST_CLASS = re.compile(r'\bjournalist\b')

    def parse_feeds(self):
        '''
        Return the feeds of the RSS "La Une" followed by one feed per
        scraped section page.
        '''
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        '''
        Scrape every page listed in ``_SECTIONS``.

        :param la_une: feeds already parsed from the RSS feed (unused).
        :return: list of ``(section title, [article dict, ...])`` tuples;
            sections without any recent article are left out.
        '''
        index = []
        for section_title, url, list_tag, item_tag in self._SECTIONS:
            section_articles = self._section_articles(url, list_tag, item_tag)
            if section_articles:
                index.append((section_title, section_articles))
        return index

    def _section_articles(self, url, list_tag, item_tag):
        ' Return the article dicts found on one section page. '
        soup = self.index_to_soup(url)
        page = soup.find('main', {'class': 'global-wrapper'})
        listing = None
        if page is not None:
            listing = page.find(list_tag, {'class': 'post-list universe-journal'})
        if listing is None:
            # Unexpected page layout: give up on this section only instead
            # of failing the whole download.
            self.log.warning('Mediapart: no article list found on ' + url)
            return []

        found = []
        for entry in listing.findAll(item_tag):
            try:
                summary = self._parse_article(entry)
            except Exception as err:
                # Best effort: an entry that cannot be parsed is skipped.
                self.log.warning(
                    'Mediapart: skipping an entry of {}: {!r}'.format(url, err)
                )
                continue
            if summary is not None:
                found.append(summary)
        return found

    def _parse_article(self, article):
        '''
        Turn one entry of an article list into an article dict.

        Returns None when the entry is not an article (no direct ``h3``
        child, or a 'title-specific' heading) or when it is older than
        ``oldest_article_date``. Missing mandatory parts (link, ``time``
        tag, ``p`` tag) raise and are handled by the caller.
        '''
        title = article.find('h3', recursive=False)
        if title is None or ''.join(title['class']) == 'title-specific':
            return None

        keyword = self._link_text(article, self._KEYWORD_HREF)
        article_type = self._link_text(article, self._ARTICLE_TYPE_HREF)

        # Flatten the <span> parts of the heading, one per line.
        for span in title('span'):
            span.replaceWith(span.renderContents().decode('utf-8') + "\n")
        url = title.find('a', href=True)['href']
        if not url.startswith(('http://', 'https://')):
            url = self._SITE_ROOT + url

        published = article.find('time', datetime=True)['datetime']
        article_date = datetime.strptime(published, '%Y-%m-%d')
        # Same fixed +1 hour shift as oldest_article_date, so that both
        # sides of the comparison are aware datetimes on the same scale.
        article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
        if article_date < self.oldest_article_date:
            self.log.debug('Mediapart: article older than oldest_article_date: ' + url)
            return None

        author_tags = article.findAll('a', {'class': self._JOURNALIST_CLASS})
        if not author_tags:
            # When no link to the author profile is available the author
            # names are held by <span> tags.
            author_tags = article.findAll('span', {'class': self._JOURNALIST_CLASS})
        try:
            authors = [self.tag_to_string(tag) for tag in author_tags]
        except Exception:
            # A list, so that the join below yields 'unknown'.
            authors = ['unknown']

        description = article.find('p').renderContents().decode('utf-8')

        return {
            'title': self.tag_to_string(title).strip(),
            'description': description,
            'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
            'author': ', '.join(authors),
            'article_type': article_type,
            'mot_cle': keyword.capitalize(),
            'url': url,
        }

    @staticmethod
    def _link_text(tag, href_pattern):
        ' Content of the first <a> in tag whose href matches, or "". '
        link = tag.find('a', {'href': href_pattern})
        if link is None:
            return ''
        return link.renderContents().decode('utf-8')

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with
    # french locale)
    def parse_french_date(self, date_str):
        '''
        Parse a date written as '<day> <French month name> <year>'.

        :param date_str: for instance '11 janvier 2021' (case insensitive).
        :return: the corresponding ``datetime.date``.
        :raises ValueError: if the month name is not a French month name.
        '''
        months = (
            None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
            'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
        )
        day, month_name, year = date_str.lower().split()[:3]
        return date(
            day=int(day),
            year=int(year),
            # The leading None makes the index equal to the month number.
            month=months.index(month_name)
        )

    def get_browser(self):
        '''
        Return the browser, logged in to Mediapart when both a username
        and a password are configured.
        '''

        def is_form_login(form):
            return "id" in form.attrs and form.attrs['id'] == "logFormEl"

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(predicate=is_form_login)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def default_cover(self, cover_file):
        '''
        Write the cover: the Mediapart cover image with the current date
        and the edition hour painted on top of it.

        :param cover_file: file object the image data is written to.
        :return: True on success, False when the cover could not be
            generated (the error is logged).
        '''
        from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
        from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data

        def draw_line(img, text, top, point_size, pen_width, italic=False):
            ' Paint one centered line of text in a 744x100 box at height top. '
            painter = QPainter(img)
            pen = QPen(Qt.black)
            pen.setWidth(pen_width)
            painter.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            if italic:
                font.setItalic(True)
            font.setPointSize(point_size)
            painter.setFont(font)
            alignment = (
                Qt.AlignmentFlag.AlignJustify
                | Qt.AlignmentFlag.AlignVCenter
                | Qt.AlignmentFlag.AlignCenter
            )
            painter.drawText(QRect(0, top, 744, 100), alignment, text)
            painter.end()

        def create_cover_mediapart():
            ' Create a cover for mediapart adding the date on Mediapart Cover'
            ensure_app()
            load_builtin_fonts()

            image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
            data = self.index_to_soup(image_url, raw=True)

            # Date and hour shifted by a fixed +1 hour from UTC
            # (NOTE(review): daylight saving time is not taken into account).
            now = datetime.now(timezone.utc) + timedelta(hours=1)
            # datetime.weekday() numbers the days from Monday (0) to Sunday (6).
            french_weekday = ('Lun', 'Mar', 'Mer', 'Jeu', 'Ven', 'Sam', 'Dim')
            date_line = french_weekday[now.weekday()] + '. ' + now.strftime('%d %b. %Y')
            # Built with format() so the accented text never goes through
            # the locale dependent strftime().
            edition_line = 'Édition de {:02d}h'.format(now.hour)

            img = QImage()
            if not img.loadFromData(data):
                raise ValueError('Could not decode the Mediapart cover image')

            draw_line(img, date_line, top=600, point_size=78, pen_width=6)
            draw_line(img, edition_line, top=720, point_size=66, pen_width=4, italic=True)
            return pixmap_to_data(img)

        try:
            cover_file.write(create_cover_mediapart())
            cover_file.flush()
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True
|