#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>' # noqa
'''
Mediapart
'''
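# Test from the command line with, e.g.:
#   ebook-convert mediapart.recipe output.epub --username=... --password=...
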
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date, timedelta


class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart'
publication_type = 'newspaper'
language = 'fr'
needs_subscription = True
oldest_article = 2
use_embedded_content = False
no_stylesheets = True
cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
# --
oldest_article_date = date.today() - timedelta(days=oldest_article)
    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed'
    # only carries the last 10 entries :/)
feeds = [
('La Une', 'http://www.mediapart.fr/articles/feed'),
]
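
    # Combine the RSS feed above with the sections built from the
    # fil d'actualités index page (see my_parse_index below).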
def parse_feeds(self):
feeds = super(Mediapart, self).parse_feeds()
feeds += feeds_from_index(self.my_parse_index(feeds))
return feeds
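
    # Scrape https://www.mediapart.fr/journal/fil-dactualites and sort its
    # entries into 'Brèves', 'Revue du Web' and 'Confidentiel' sections.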
def my_parse_index(self, la_une):
articles = []
breves = []
liens = []
confidentiels = []
soup = self.index_to_soup(
'https://www.mediapart.fr/journal/fil-dactualites')
page = soup.find('main', {'class': 'global-wrapper'})
fils = page.find('ul', {'class': 'post-list universe-journal'})
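        # Each <li> is one entry; the link matching /type-darticles/ gives
        # its kind (Lien, Confidentiel, ...), which picks its section.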
for article in fils.findAll('li'):
try:
                title = article.find('h3', recursive=False)
                # BeautifulSoup 4 returns tag['class'] as a list, so test for
                # membership rather than comparing with a string.
                if title is None or 'title-specific' in (title.get('class') or []):
                    continue
# print "found fil ",title
article_type = article.find('a', {'href': re.compile(
r'.*\/type-darticles\/.*')}).renderContents()
# print "kind: ",article_type
for s in title('span'):
s.replaceWith(s.renderContents() + "\n")
url = title.find('a', href=True)['href']
# article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
# print("################################# 9")
# print(article_date)
# if article_date < self.oldest_article_date:
# print "too old"
# continue
authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')})
authors = [self.tag_to_string(a) for a in authors]
# description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
# print "fil ",title," by ",authors," : ",description
summary = {
'title': self.tag_to_string(title).strip(),
'author': ', '.join(authors),
'url': 'https://www.mediapart.fr' + url
}
                if article_type == 'Lien':
                    liens.append(summary)
                elif article_type == 'Confidentiel':
                    confidentiels.append(summary)
                else:
                    breves.append(summary)
            except Exception:
                # Skip entries that do not match the expected markup.
                pass
# print 'La Une: ', len(la_une), ' articles'
# for a in la_une: print a["title"]
# print 'Brèves: ', len(breves), ' articles'
# print 'Revue web: ', len(liens), ' articles'
# print 'Confidentiel: ', len(confidentiels), ' articles'
articles += [('Brèves', breves)] if breves else []
articles += [('Revue du Web', liens)] if liens else []
articles += [('Confidentiel', confidentiels)] if confidentiels else []
return articles

    # -- print-version
    conversion_options = {'smarten_punctuation': True}
    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # Locale-independent date parsing (strptime('%d %B %Y', s) would only
    # work under a French locale).
def parse_french_date(self, date_str):
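        # e.g. parse_french_date('22 novembre 2017') -> date(2017, 11, 22);
        # the leading None makes 'janvier' index 1 in the month list.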
date_arr = date_str.lower().split()
return date(day=int(date_arr[0]),
year=int(date_arr[2]),
month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        soup = self.index_to_soup(url)
        # Filter old articles
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None
        tools = soup.find('li', {'class': 'print'})
        if tools is None:
            return None
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            self.log('Error: print link not found')
            return None
        return 'https://www.mediapart.fr' + link['href']

# -- Handle login
def get_browser(self):
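        # The login form has no name attribute and its position in the page
        # changes, so locate it by its id instead.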
        def is_form_login(form):
            return form.attrs.get('id') == 'logFormEl'

br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('https://www.mediapart.fr/login')
br.select_form(predicate=is_form_login)
br['name'] = self.username
br['password'] = self.password
br.submit()
return br

    # This is a workaround for articles with Scribd content that include
    # <body></body> tags _within_ the body.
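    # e.g. '<body><p>a</p><body>X</body></body>' is rewritten to
    # '<body><p>a</p>X</body>'.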
preprocess_regexps = [
(re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
lambda match:
match.group(1) + re.sub(
re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
]