mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Mediapart
This commit is contained in:
parent
a45c2fa5a3
commit
91f95e52fb
@ -14,10 +14,16 @@ from calibre.web.feeds import feeds_from_index
|
|||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
|
|
||||||
|
|
||||||
|
def classes(classes):
|
||||||
|
q = frozenset(classes.split(' '))
|
||||||
|
return dict(attrs={
|
||||||
|
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||||
|
|
||||||
|
|
||||||
class Mediapart(BasicNewsRecipe):
|
class Mediapart(BasicNewsRecipe):
|
||||||
title = 'Mediapart'
|
title = 'Mediapart'
|
||||||
__author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
|
__author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
|
||||||
description = 'Global news in french from news site Mediapart'
|
description = 'Global news in French from news site Mediapart'
|
||||||
publication_type = 'newspaper'
|
publication_type = 'newspaper'
|
||||||
language = 'fr'
|
language = 'fr'
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
@ -26,6 +32,15 @@ class Mediapart(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(name='h1'),
|
||||||
|
dict(name='div', **classes('author')),
|
||||||
|
classes('introduction content-article')
|
||||||
|
]
|
||||||
|
remove_tags = [
|
||||||
|
classes('login-subscribe print-source_url')
|
||||||
|
]
|
||||||
|
|
||||||
cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
|
cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
|
||||||
|
|
||||||
# --
|
# --
|
||||||
@ -116,8 +131,6 @@ class Mediapart(BasicNewsRecipe):
|
|||||||
|
|
||||||
conversion_options = {'smarten_punctuation': True}
|
conversion_options = {'smarten_punctuation': True}
|
||||||
|
|
||||||
remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]
|
|
||||||
|
|
||||||
# non-locale specific date parse (strptime("%d %b %Y",s) would work with
|
# non-locale specific date parse (strptime("%d %b %Y",s) would work with
|
||||||
# french locale)
|
# french locale)
|
||||||
def parse_french_date(self, date_str):
|
def parse_french_date(self, date_str):
|
||||||
@ -127,21 +140,6 @@ class Mediapart(BasicNewsRecipe):
|
|||||||
month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
|
month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
|
||||||
'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
|
'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
|
||||||
|
|
||||||
def print_version(self, url):
|
|
||||||
soup = self.index_to_soup(url)
|
|
||||||
# Filter old articles
|
|
||||||
# article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
|
|
||||||
|
|
||||||
# if article_date < self.oldest_article_date:
|
|
||||||
# return None
|
|
||||||
|
|
||||||
tools = soup.find('li', {'class': 'print'})
|
|
||||||
link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
|
|
||||||
# if link is None:
|
|
||||||
# print 'Error: print link not found'
|
|
||||||
# return None
|
|
||||||
return 'https://mediapart.fr' + link['href']
|
|
||||||
|
|
||||||
# -- Handle login
|
# -- Handle login
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
def is_form_login(form):
|
def is_form_login(form):
|
||||||
@ -154,12 +152,3 @@ class Mediapart(BasicNewsRecipe):
|
|||||||
br['password'] = self.password
|
br['password'] = self.password
|
||||||
br.submit()
|
br.submit()
|
||||||
return br
|
return br
|
||||||
|
|
||||||
# This is a workaround articles with scribd content that include
|
|
||||||
# <body></body> tags _within_ the body
|
|
||||||
preprocess_regexps = [
|
|
||||||
(re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
|
|
||||||
lambda match:
|
|
||||||
match.group(1) + re.sub(
|
|
||||||
re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
|
|
||||||
]
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user