Mirror of https://github.com/kovidgoyal/calibre.git

Commit 811b210303: Update mediapart
Parent: a66c5bb4d5
@@ -1,72 +1,172 @@
+# -*- mode:python -*-
+from __future__ import unicode_literals
+
 __license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf dot fr>'
+__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
 '''
 Mediapart
 '''

-__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf dot fr>'
+__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

 import re
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds import feeds_from_index
+from datetime import date,timedelta


 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah'
+    __author__ = 'Mathieu Godlewski, Louis Gesbert'
     description = 'Global news in french from news site Mediapart'
-    oldest_article = 7
+    publication_type = 'newspaper'
     language = 'fr'
     needs_subscription = True
-    max_articles_per_feed = 50
+    oldest_article = 2

     use_embedded_content = False
     no_stylesheets = True

-    masthead_url = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
-    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
+    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
+
+    # --
+
+    oldest_article_date = date.today() - timedelta(days=oldest_article)
+
+    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
+    # the 10 last elements :/)

     feeds = [
-        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
+        ('La Une', 'http://www.mediapart.fr/articles/feed'),
     ]

-    # -- full-page-version
+    def parse_feeds(self):
+        feeds = super(Mediapart, self).parse_feeds()
+        feeds += feeds_from_index(self.my_parse_index(feeds))
+        return feeds

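The RSS feed only carries the ten most recent items, so the new parse_feeds override supplements it with sections scraped from the "fil d'actualités" page, converted through calibre's feeds_from_index helper. As a reading aid, here is a minimal sketch of the index shape my_parse_index returns, the same (section, articles) structure a parse_index method would produce; all values below are placeholders, not real data:

    # Placeholder index of the kind my_parse_index builds: a list of
    # (section_title, article_dicts) pairs using the keys assembled in
    # the 'summary' dict further down in this diff.
    index = [
        ('Brèves', [{
            'title': 'Example title',
            'author': 'Example author',
            'url': 'https://www.mediapart.fr/journal/example',
            'date': 'mercredi 16 mars 2016',
            'description': 'Teaser paragraph for the article.',
        }]),
    ]
    # feeds_from_index(index) wraps each pair into a Feed object, which
    # parse_feeds appends to the feeds parsed from the RSS.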
+    def my_parse_index(self, la_une):
+        articles = []
+
+        breves = []
+        liens = []
+        confidentiels = []
+
+        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
+        page = soup.find('div', {'id':'pageFirstContent'})
+        fils = page.find('div', {'class':re.compile(r'\bcontent-journal\b')})
+
+        for article in fils.findAll('div'):
+            try:
+                title = article.find('h2',recursive=False)
+                if title is None or title['class'] == 'title-specific':
+                    continue
+
+                # print "found fil ",title
+                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
+                # print "kind: ",article_type
+
+                for s in title('span'):
+                    s.replaceWith(s.renderContents() + "\n")
+                url = title.find('a', href=True)['href']
+
+                article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
+
+                if article_date < self.oldest_article_date:
+                    # print "too old"
+                    continue
+
+                authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
+                authors = [self.tag_to_string(a) for a in authors]
+
+                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
+
+                # print "fil ",title," by ",authors," : ",description
+
+                summary = {
+                    'title': self.tag_to_string(title).strip(),
+                    'author': ', '.join(authors),
+                    'url': url,
+                    'date': u'' + article_date.strftime("%A %d %b %Y"),
+                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
+                }
+
+                {
+                    "Brève": breves,
+                    "Lien": liens,
+                    "Confidentiel": confidentiels,
+                }.get(article_type).append(summary)
+
+            except:
+                pass
+
+        # print 'La Une: ', len(la_une), ' articles'
+        # for a in la_une: print a["title"]
+        # print 'Brèves: ', len(breves), ' articles'
+        # print 'Revue web: ', len(liens), ' articles'
+        # print 'Confidentiel: ', len(confidentiels), ' articles'
+
+        articles += [('Brèves', breves)] if breves else []
+        articles += [('Revue du Web', liens)] if liens else []
+        articles += [('Confidentiel', confidentiels)] if confidentiels else []
+        return articles
+
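Within my_parse_index, each summary is routed to one of the three section lists by a dictionary lookup rather than an if/elif chain. Note that for an unrecognized article_type, .get() returns None, the .append() call raises AttributeError, and the surrounding bare except drops that article. A standalone illustration of the idiom, with invented values:

    breves, liens, confidentiels = [], [], []
    summary = {'title': 'Example'}  # stand-in for a real article dict

    article_type = "Lien"
    {
        "Brève": breves,
        "Lien": liens,
        "Confidentiel": confidentiels,
    }.get(article_type).append(summary)

    print(liens)  # -> [{'title': 'Example'}]
    # Any other article_type makes .get() return None, so .append()
    # raises AttributeError and the recipe's except clause skips it.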
+    # -- print-version

     conversion_options = {'smarten_punctuation' : True}

-    keep_only_tags = [
-        dict(name='div', attrs={'class':'col-left fractal-desktop fractal-10-desktop collapse-7-desktop fractal-tablet fractal-6-tablet collapse-4-tablet'}),
-        dict(name='div', attrs={'id':'pageFirstContent'})
-    ]
-    remove_tags = [
-        dict(name='div', attrs={'id':'lire-aussi'}),
-        dict(name='div', attrs={'class':'col-right-content'})
-    ]
+    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]
+
+    # non-locale specific date parse (strptime("%d %b %Y",s) would work with french locale)
+    def parse_french_date(self, date_str):
+        date_arr = date_str.lower().split()
+        return date(day=int(date_arr[0]),
+                    year=int(date_arr[2]),
+                    month=
+                    [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
+                     'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

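The new parse_french_date helper side-steps locale-dependent strptime("%d %b %Y", s) by indexing a list of French month names; slot 0 is padded with None so that 'janvier' maps to month 1. A quick standalone check of the same logic:

    from datetime import date

    def parse_french_date(date_str):
        # Same arithmetic as the method in the diff, extracted for testing.
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai',
                           'juin', 'juillet', 'août', 'septembre', 'octobre',
                           'novembre', 'décembre'].index(date_arr[1]))

    print(parse_french_date('16 mars 2016'))  # -> 2016-03-16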
     def print_version(self, url):
         raw = self.browser.open(url).read()
         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        link = soup.find('a', {'href':re.compile('^.*?onglet=full$')})
-        if link is None:
+
+        # Filter old articles
+        article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
+
+        if article_date < self.oldest_article_date:
             return None
-        return link['href']
+
+        tools = soup.find('div', {'class':'menu-tools'})
+        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
+        if link is None:
+            print 'Error: print link not found'
+            return None
+        return 'https://mediapart.fr/' + link['href']

     # -- Handle login

     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         if self.username is not None and self.password is not None:
-            br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
+            br.open('https://www.mediapart.fr/user')
             br.select_form(nr=1)
             br['name'] = self.username
             br['pass'] = self.password
             br.submit()
         return br

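The login URL moves from a blog page to the site's Drupal user page, but the form is still selected by position (nr=1, the second form on the page). If Mediapart reorders its markup, selecting by field names is more robust; a sketch using plain mechanize, assuming the form keeps the Drupal field names 'name' and 'pass':

    import mechanize

    br = mechanize.Browser()
    br.open('https://www.mediapart.fr/user')
    # Pick the form that actually contains the login fields instead of
    # relying on its index in the page.
    br.select_form(predicate=lambda f: any(c.name == 'pass' for c in f.controls))
    br['name'] = 'USERNAME'  # placeholder credentials
    br['pass'] = 'PASSWORD'
    br.submit()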
-    def preprocess_html(self, soup):
-        for title in soup.findAll('p', {'class':'titre_page'}):
-            title.name = 'h3'
-        for legend in soup.findAll('span', {'class':'legend'}):
-            legend.insert(0, Tag(soup, 'br', []))
-            legend.name = 'small'
-        return soup
+    # This is a workaround articles with scribd content that include
+    # <body></body> tags _within_ the body
+    preprocess_regexps = [
+        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
+         lambda match:
+         match.group(1)
+         + re.sub(re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'',
+                  match.group(2))
+         + '</body>')
+    ]
+
+    # def preprocess_html(self, soup):
+    #     for title in soup.findAll('p', {'class':'titre_page'}):
+    #         title.name = 'h3'
+    #     for legend in soup.findAll('span', {'class':'legend'}):
+    #         legend.insert(0, Tag(soup, 'br', []))
+    #         legend.name = 'em'
+    #     return soup
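The new preprocess_regexps entry handles articles whose embedded Scribd content injects a second <body> element inside the real one: it keeps the outer opening tag, strips every body tag from the inner markup, and closes the document once. A small demonstration of the rule in isolation, run on invented sample markup:

    import re

    # The (pattern, replacement) pair added by this commit.
    rule = (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
            lambda match: match.group(1)
            + re.sub(re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '',
                     match.group(2))
            + '</body>')

    html = '<html><body class="page">intro <body>embed</body> outro</body></html>'
    pattern, repl = rule
    print(pattern.sub(repl, html))
    # -> <html><body class="page">intro embed outro</body></html>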