Update recipe for El Diplo

Tomás Di Domenico 2023-10-13 16:20:40 +02:00
parent 9dd272e12f
commit 99ebf08436


@@ -1,97 +1,134 @@
 # -*- mode: python; coding: utf-8; -*-
 # vim: set syntax=python fileencoding=utf-8
-__license__ = 'GPL v3'
-__copyright__ = '2021, Darko Miletic <darko.miletic at gmail.com>'
-'''
+__license__ = "GPL v3"
+__copyright__ = "2023, Tomás Di Domenico <tdido at tdido.eu>"
+"""
 www.eldiplo.org
-'''
-from calibre.web.feeds.news import BasicNewsRecipe
+"""
 from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
 
-class ElDiplo2020(BasicNewsRecipe):
-    title = 'Le Monde Diplomatique - cono sur'
-    __author__ = 'Darko Miletic'
-    description = 'Publicación de Le Monde Diplomatique para el cono sur.'
-    publisher = 'Le Monde Diplomatique'
-    category = 'news, politics, Argentina, Uruguay, Paraguay, South America, World'
+class ElDiplo2023(BasicNewsRecipe):
+    title = "Le Monde Diplomatique - cono sur"
+    __author__ = "Tomás Di Domenico"
+    description = "Publicación de Le Monde Diplomatique para el cono sur."
+    publisher = "Capital Intelectual"
+    category = "news, politics, Argentina, Uruguay, Paraguay, South America, World"
     oldest_article = 31
     no_stylesheets = True
-    encoding = 'utf8'
+    encoding = "utf8"
     use_embedded_content = False
-    language = 'es_AR'
+    language = "es_AR"
     remove_empty_feeds = True
-    publication_type = 'magazine'
+    publication_type = "magazine"
     auto_cleanup = True
-    auto_cleanup_keep = '//div[contains(@class, "autor")] | //div[@class="edicion"]'
     delay = 1
     simultaneous_downloads = 1
     timeout = 8
-    needs_subscription = 'optional'
-    ignore_duplicate_articles = {'url'}
+    needs_subscription = True
+    ignore_duplicate_articles = {"url"}
    articles_are_obfuscated = True
     temp_files = []
     fetch_retries = 10
     handle_gzip = True
     compress_news_images = True
     scale_news_images_to_device = True
-    masthead_url = 'https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png'
-    INDEX = 'https://www.eldiplo.org/'
-    extra_css = """
-        body{font-family: "GT Super", serif}
-        .autor{font-family: Inter, serif}
-    """
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
+    masthead_url = (
+        "https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png"
+    )
+    INDEX = "https://www.eldiplo.org/"
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
         if self.username is not None and self.password is not None:
-            br.select_form(id='loginform')
-            br['log'] = self.username
-            br['pwd'] = self.password
+            br.select_form(id="loginform")
+            br["log"] = self.username
+            br["pwd"] = self.password
             br.submit()
         return br
 
+    def get_cover_url(self):
+        soup_index = self.index_to_soup(self.INDEX)
+        tag_sumario = soup_index.find("span", text="Sumario")
+        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
+        soup = self.index_to_soup(url_sumario)
+        container = soup.find("div", class_="px-16")
+        url = container.find("img")["src"]
+        return getattr(self, "cover_url", url)
+
+    def _process_article(self, article):
+        url = article.find("a", href=True, attrs={"class": "title"})["href"]
+        title = self.tag_to_string(article).replace("Editorial", "Editorial: ")
+        try:
+            title, authors = title.split(", por")
+            authors = f"por {authors}"
+        except ValueError:
+            authors = ""
+        self.log("title: ", title, " url: ", url)
+        return {"title": title, "url": url, "description": authors, "date": ""}
+
+    def preprocess_html(self, soup):
+        # cleanup internal references' anchor links, leave the inner text
+        # it would be nice to eventually make the internal links work
+        import re
+
+        for l in soup.find_all(name="a", attrs={"href": re.compile(r"#")}):
+            l.replaceWithChildren()
+        return soup
+
     def parse_index(self):
-        articles = []
-        soup = self.index_to_soup(self.INDEX)
-        mylink = soup.find('span', text='Sumario')
-        if mylink is None:
+        soup_index = self.index_to_soup(self.INDEX)
+
+        tag_sumario = soup_index.find("span", text="Sumario")
+        if tag_sumario is None:
             return None
-        indexurl = "https://www.eldiplo.org" + mylink.parent['href']
-        self.log(indexurl)
-        parts = indexurl.split('www.eldiplo.org/', 1)
-        series = parts[1].split('-', 1)[0]
-        self.conversion_options.update({'series' : self.title})
-        self.conversion_options.update({'series_index' : series})
-        soupindex = self.index_to_soup(indexurl)
-        totalfeeds = []
+
+        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
+        self.log(url_sumario)
+
+        soup_sumario = self.index_to_soup(url_sumario)
+
+        feeds = []
         articles = []
-        for article in soupindex.findAll('a', href=True, attrs={'class':'title'}):
-            url = article['href']
-            title = self.tag_to_string(article)
-            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
-            self.log('title: ', title, ' url: ', url)
-        totalfeeds.append(('Articles',articles))
-        return totalfeeds
+        dossiers = []
+
+        sumario = soup_sumario.find("div", class_="sumario")
+
+        for section in sumario.find_all("div", recursive=False):
+            classes = section.attrs["class"]
+
+            if "dossier" in classes:
+                dtitle = self.tag_to_string(section.find("h3"))
+                darticles = []
+                for article in section.find_all("div", recursive=False):
+                    darticles.append(self._process_article(article))
+                dossiers.append((dtitle, darticles))
+            else:
+                articles.append(self._process_article(section))
+
+        feeds.append(("Artículos", articles))
+        feeds += dossiers
+
+        return feeds
 
     def get_obfuscated_article(self, url):
         result = None
         count = 0
-        while (count < self.fetch_retries):
+        while count < self.fetch_retries:
             try:
                 response = self.browser.open(url, timeout=self.timeout)
                 html = response.read()
                 count = self.fetch_retries
-                tfile = PersistentTemporaryFile('_fa.html')
+                tfile = PersistentTemporaryFile("_fa.html")
                 tfile.write(html)
                 tfile.close()
                 self.temp_files.append(tfile)