#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery ? (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'  # noqa

'''
Mediapart
'''

import re
from datetime import date, timedelta

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe


class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # -- Get the index ourselves: the feed at
    # 'http://www.mediapart.fr/articles/feed' only carries the 10 most
    # recent articles :/
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]
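
    # parse_feeds() below first runs the normal RSS machinery on 'La Une',
    # then appends the extra sections scraped by my_parse_index();
    # feeds_from_index() turns that (section, article-list) index into
    # calibre Feed objects.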

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds
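
    # my_parse_index() returns a calibre-style index: a list of
    # (section, articles) pairs, each article being a dict, e.g.
    #   [('Brèves', [{'title': ..., 'author': ..., 'url': ...}, ...]),
    #    ('Revue du Web', [...]),
    #    ('Confidentiel', [...])]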
    def my_parse_index(self, la_une):
        articles = []

        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup(
            'https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('main', {'class': 'global-wrapper'})
        fils = page.find('ul', {'class': 'post-list universe-journal'})

        for article in fils.findAll('li'):
            try:
                title = article.find('h3', recursive=False)

                if title is None or title['class'] == 'title-specific':
                    continue

                # The link under /type-darticles/ tells us what kind of
                # entry this is: 'Lien', 'Confidentiel' or a plain brève.
                article_type = article.find('a', {'href': re.compile(
                    r'.*\/type-darticles\/.*')}).renderContents()

                # Flatten the spans inside the title so their text stays
                # separated once the tags are stripped.
                for s in title('span'):
                    s.replaceWith(s.renderContents() + "\n")
                url = title.find('a', href=True)['href']

                # Date filtering is currently disabled:
                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                # if article_date < self.oldest_article_date:
                #     continue

                authors = article.findAll(
                    'a', {'class': re.compile(r'\bjournalist\b')})
                authors = [self.tag_to_string(a) for a in authors]

                summary = {
                    'title': self.tag_to_string(title).strip(),
                    'author': ', '.join(authors),
                    'url': 'https://www.mediapart.fr' + url
                }
                if article_type == 'Lien':
                    liens.append(summary)
                elif article_type == 'Confidentiel':
                    confidentiels.append(summary)
                else:
                    breves.append(summary)
            except Exception:
                # Malformed entry: skip it instead of aborting the index.
                pass

        articles += [('Brèves', breves)] if breves else []
        articles += [('Revue du Web', liens)] if liens else []
        articles += [('Confidentiel', confidentiels)] if confidentiels else []
        return articles

    # -- print-version

    conversion_options = {'smarten_punctuation': True}

    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # Locale-independent date parsing (strptime('%d %b %Y', s) would work,
    # but only under a French locale).
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
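
    # For example, parse_french_date('17 juin 2016') returns date(2016, 6, 17):
    # 'juin' sits at index 6 of the month list above.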

    def print_version(self, url):
        soup = self.index_to_soup(url)

        # Filtering out old articles is currently disabled:
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None

        tools = soup.find('li', {'class': 'print'})
        link = tools.find(
            'a', {'href': re.compile(r'\/print\/.*')}) if tools is not None else None
        if link is None:
            self.log('Error: print link not found for', url)
            return None
        return 'https://mediapart.fr' + link['href']

    # -- Handle login
    def get_browser(self):
        # The login form has no name and its position in the page changes,
        # so select it by its 'id' attribute instead.
        def is_form_login(form):
            return "id" in form.attrs and form.attrs['id'] == "logFormEl"
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(predicate=is_form_login)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br
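
    # A minimal sketch for testing the login path from the command line,
    # assuming this recipe is saved as Mediapart.recipe (the --username and
    # --password recipe options feed get_browser() above):
    #
    #   ebook-convert Mediapart.recipe output.epub \
    #       --username you@example.com --password secret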

    # This is a workaround for articles with Scribd content that include
    # <body></body> tags _within_ the body.
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
         lambda match:
         match.group(1) + re.sub(
             re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '',
             match.group(2)) + '</body>')
    ]
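
    # For example, '<body><p>a<body></body>b</p></body>' is rewritten to
    # '<body><p>ab</p></body>': every stray <body>/</body> tag inside the
    # outer body is stripped and a single closing </body> is restored.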