Update Mediapart
commit 91f95e52fb
parent a45c2fa5a3
@@ -14,10 +14,16 @@ from calibre.web.feeds import feeds_from_index
 from datetime import date, timedelta
 
 
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class Mediapart(BasicNewsRecipe):
     title = 'Mediapart'
     __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
-    description = 'Global news in french from news site Mediapart'
+    description = 'Global news in French from news site Mediapart'
     publication_type = 'newspaper'
     language = 'fr'
     needs_subscription = True
@@ -26,6 +32,15 @@ class Mediapart(BasicNewsRecipe):
     use_embedded_content = False
     no_stylesheets = True
 
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(name='div', **classes('author')),
+        classes('introduction content-article')
+    ]
+    remove_tags = [
+        classes('login-subscribe print-source_url')
+    ]
+
     cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
 
     # --
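Aside: the classes() helper added in the first hunk builds the attrs dict that the recipe's keep_only_tags/remove_tags entries rely on, which is why it can also be splatted into dict(name='div', **classes('author')). A minimal, self-contained check of that matching logic (no calibre imports needed; the sample class strings are invented for illustration):

def classes(classes):  # copied from the hunk above
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

matcher = classes('introduction content-article')
class_test = matcher['attrs']['class']
print(class_test('introduction lead'))  # frozenset({'introduction'}) -> truthy, tag matches
print(class_test('sidebar'))            # frozenset() -> falsy, no match
print(class_test(None))                 # None -> falsy, missing class attribute handled safely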
@@ -116,8 +131,6 @@ class Mediapart(BasicNewsRecipe):
 
     conversion_options = {'smarten_punctuation': True}
 
-    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]
-
     # non-locale specific date parse (strptime("%d %b %Y",s) would work with
     # french locale)
     def parse_french_date(self, date_str):
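Aside: the "non-locale specific date parse" comment refers to the month-name lookup visible at the top of the next hunk. A standalone sketch of the idea follows; the day/year handling is an assumption, since those lines of parse_french_date are not shown in this diff:

from datetime import date

FRENCH_MONTHS = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                 'août', 'septembre', 'octobre', 'novembre', 'décembre']

def parse_french_date(date_str):
    # '3 août 2017' -> date(2017, 8, 3); list.index() maps the French month
    # name to its 1-based number, so no French locale is needed for strptime.
    day, month, year = date_str.lower().split()
    return date(day=int(day), month=FRENCH_MONTHS.index(month), year=int(year))

print(parse_french_date('3 août 2017'))  # 2017-08-03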
@@ -127,21 +140,6 @@ class Mediapart(BasicNewsRecipe):
             month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                    'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
 
-    def print_version(self, url):
-        soup = self.index_to_soup(url)
-        # Filter old articles
-        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
-
-        # if article_date < self.oldest_article_date:
-        # return None
-
-        tools = soup.find('li', {'class': 'print'})
-        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
-        # if link is None:
-        # print 'Error: print link not found'
-        # return None
-        return 'https://mediapart.fr' + link['href']
-
     # -- Handle login
     def get_browser(self):
         def is_form_login(form):
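Aside: only the tail of the login handling is visible here (the get_browser/is_form_login context above and the br['password']/br.submit() lines in the last hunk). Below is a rough sketch of the usual calibre needs_subscription pattern those lines fit into; the login URL, form id, and username field name are placeholders for illustration, not taken from this diff:

from calibre.web.feeds.news import BasicNewsRecipe


class ExampleLoginRecipe(BasicNewsRecipe):
    needs_subscription = True  # calibre prompts for username/password

    def get_browser(self):
        def is_form_login(form):
            # placeholder predicate: pick the form whose id marks it as the login form
            return form.attrs.get('id') == 'login-form'

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://example.org/login')   # placeholder URL
            br.select_form(predicate=is_form_login)
            br['username'] = self.username         # placeholder field name
            br['password'] = self.password
            br.submit()
        return br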
@@ -154,12 +152,3 @@ class Mediapart(BasicNewsRecipe):
             br['password'] = self.password
             br.submit()
         return br
-
-    # This is a workaround articles with scribd content that include
-    # <body></body> tags _within_ the body
-    preprocess_regexps = [
-        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
-         lambda match:
-         match.group(1) + re.sub(
-             re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
-    ]
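Aside: the preprocess_regexps entry deleted in the last hunk was the Scribd workaround named in its comment: it keeps the outermost <body> wrapper and strips any <body>/</body> tags nested inside it. A standalone sketch of what that substitution did (the sample HTML is invented):

import re

# The (pattern, replacement) pair removed by this commit, reproduced verbatim:
preprocess_regexps = [
    (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
     lambda match:
     match.group(1) + re.sub(
         re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
]

pattern, repl = preprocess_regexps[0]
html = '<body><p>intro</p><body><p>embedded scribd</p></body><p>rest</p></body>'
print(pattern.sub(repl, html))
# -> <body><p>intro</p><p>embedded scribd</p><p>rest</p></body>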