Update recipe for El Diplo

Tomás Di Domenico 2023-10-13 16:20:40 +02:00
parent 9dd272e12f
commit 99ebf08436
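
A recipe like this can be exercised locally with calibre's ebook-convert tool (a sketch, assuming a working calibre install; substitute real credentials, since the recipe now sets needs_subscription = True):

    ebook-convert eldiplo.recipe .epub --test -vv --username USER --password PASS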


# -*- mode: python; coding: utf-8; -*-
# vim: set syntax=python fileencoding=utf-8
__license__ = "GPL v3"
__copyright__ = "2023, Tomás Di Domenico <tdido at tdido.eu>"

"""
www.eldiplo.org
"""
import re

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

class ElDiplo2023(BasicNewsRecipe):
    title = "Le Monde Diplomatique - cono sur"
    __author__ = "Tomás Di Domenico"
    description = "Publicación de Le Monde Diplomatique para el cono sur."
    publisher = "Capital Intelectual"
    category = "news, politics, Argentina, Uruguay, Paraguay, South America, World"
    oldest_article = 31
    no_stylesheets = True
    encoding = "utf8"
    use_embedded_content = False
    language = "es_AR"
    remove_empty_feeds = True
    publication_type = "magazine"
    auto_cleanup = True
    delay = 1
    simultaneous_downloads = 1
    timeout = 8
    needs_subscription = True
    ignore_duplicate_articles = {"url"}
    articles_are_obfuscated = True
    temp_files = []
    fetch_retries = 10
    handle_gzip = True
    compress_news_images = True
    scale_news_images_to_device = True
    masthead_url = (
        "https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png"
    )
    INDEX = "https://www.eldiplo.org/"

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.select_form(id="loginform")
            br["log"] = self.username
            br["pwd"] = self.password
            br.submit()
        return br

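    # follow the front page's "Sumario" link to the current issue and use the
    # first image inside its "px-16" container as the cover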
    def get_cover_url(self):
        soup_index = self.index_to_soup(self.INDEX)
        tag_sumario = soup_index.find("span", text="Sumario")
        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
        soup = self.index_to_soup(url_sumario)
        container = soup.find("div", class_="px-16")
        url = container.find("img")["src"]
        return getattr(self, "cover_url", url)

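    # helper for parse_index: extract an article's URL and split the listing
    # text "Title, por Author" into title and description fields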
    def _process_article(self, article):
        url = article.find("a", href=True, attrs={"class": "title"})["href"]
        title = self.tag_to_string(article).replace("Editorial", "Editorial: ")
        try:
            title, authors = title.split(", por")
            authors = f"por {authors}"
        except ValueError:
            authors = ""
        self.log("title: ", title, " url: ", url)
        return {"title": title, "url": url, "description": authors, "date": ""}

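    # runs on each downloaded article page before conversion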
    def preprocess_html(self, soup):
        # clean up internal references' anchor links, keeping the inner text;
        # it would be nice to eventually make the internal links work
        for link in soup.find_all(name="a", attrs={"href": re.compile(r"#")}):
            link.replaceWithChildren()
        return soup

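    # the Sumario page lists standalone articles and dossier blocks; build one
    # "Artículos" feed for the former plus one feed per dossier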
    def parse_index(self):
        soup_index = self.index_to_soup(self.INDEX)

        tag_sumario = soup_index.find("span", text="Sumario")
        if tag_sumario is None:
            return None

        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
        self.log(url_sumario)

        soup_sumario = self.index_to_soup(url_sumario)

        feeds = []
        articles = []
        dossiers = []

        sumario = soup_sumario.find("div", class_="sumario")

        for section in sumario.find_all("div", recursive=False):
            classes = section.attrs["class"]

            if "dossier" in classes:
                dtitle = self.tag_to_string(section.find("h3"))
                darticles = []
                for article in section.find_all("div", recursive=False):
                    darticles.append(self._process_article(article))
                dossiers.append((dtitle, darticles))
            else:
                articles.append(self._process_article(section))

        feeds.append(("Artículos", articles))
        feeds += dossiers

        return feeds

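    # download the article HTML ourselves, retrying up to fetch_retries times,
    # and hand calibre a temporary file to process instead of the live URL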
    def get_obfuscated_article(self, url):
        result = None
        count = 0
        while count < self.fetch_retries:
            try:
                response = self.browser.open(url, timeout=self.timeout)
                html = response.read()
                count = self.fetch_retries
                tfile = PersistentTemporaryFile("_fa.html")
                tfile.write(html)
                tfile.close()
                self.temp_files.append(tfile)