From 99ebf0843690097c7f9626901ef43d39c2588a91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1s=20Di=20Domenico?=
Date: Fri, 13 Oct 2023 16:20:40 +0200
Subject: [PATCH] Update recipe for El Diplo

---
 recipes/el_diplo.recipe | 157 +++++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 60 deletions(-)

diff --git a/recipes/el_diplo.recipe b/recipes/el_diplo.recipe
index d7e6ee9bb6..5c212bf92a 100644
--- a/recipes/el_diplo.recipe
+++ b/recipes/el_diplo.recipe
@@ -1,97 +1,134 @@
 # -*- mode: python; coding: utf-8; -*-
 # vim: set syntax=python fileencoding=utf-8
 
-__license__ = 'GPL v3'
-__copyright__ = '2021, Darko Miletic '
+__license__ = "GPL v3"
+__copyright__ = "2023, Tomás Di Domenico "
 
-'''
-www.eldiplo.org
-'''
+"""
+www.eldiplo.org
+"""
 
-from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
-class ElDiplo2020(BasicNewsRecipe):
-    title = 'Le Monde Diplomatique - cono sur'
-    __author__ = 'Darko Miletic'
-    description = 'Publicación de Le Monde Diplomatique para el cono sur.'
-    publisher = 'Le Monde Diplomatique'
-    category = 'news, politics, Argentina, Uruguay, Paraguay, South America, World'
-    oldest_article = 31
-    no_stylesheets = True
-    encoding = 'utf8'
+class ElDiplo2023(BasicNewsRecipe):
+    title = "Le Monde Diplomatique - cono sur"
+    __author__ = "Tomás Di Domenico"
+    description = "Publicación de Le Monde Diplomatique para el cono sur."
+    publisher = "Capital Intelectual"
+    category = "news, politics, Argentina, Uruguay, Paraguay, South America, World"
+    oldest_article = 31
+    no_stylesheets = True
+    encoding = "utf8"
     use_embedded_content = False
-    language = 'es_AR'
-    remove_empty_feeds = True
-    publication_type = 'magazine'
-    auto_cleanup = True
-    auto_cleanup_keep = '//div[contains(@class, "autor")] | //div[@class="edicion"]'
-    delay = 1
+    language = "es_AR"
+    remove_empty_feeds = True
+    publication_type = "magazine"
+    auto_cleanup = True
+    delay = 1
     simultaneous_downloads = 1
-    timeout = 8
-    needs_subscription = 'optional'
-    ignore_duplicate_articles = {'url'}
+    timeout = 8
+    needs_subscription = True
+    ignore_duplicate_articles = {"url"}
     articles_are_obfuscated = True
-    temp_files = []
-    fetch_retries = 10
+    temp_files = []
+    fetch_retries = 10
     handle_gzip = True
     compress_news_images = True
     scale_news_images_to_device = True
-    masthead_url = 'https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png'
-    INDEX = 'https://www.eldiplo.org/'
-
-    extra_css = """
-    body{font-family: "GT Super", serif}
-    .autor{font-family: Inter, serif}
-    """
-
-    conversion_options = {
-        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
-    }
+    masthead_url = (
+        "https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png"
+    )
+    INDEX = "https://www.eldiplo.org/"
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
         br.open(self.INDEX)
         if self.username is not None and self.password is not None:
-            br.select_form(id='loginform')
-            br['log'] = self.username
-            br['pwd'] = self.password
+            br.select_form(id="loginform")
+            br["log"] = self.username
+            br["pwd"] = self.password
             br.submit()
         return br
 
+    def get_cover_url(self):
+        soup_index = self.index_to_soup(self.INDEX)
+        tag_sumario = soup_index.find("span", text="Sumario")
+        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
+
+        soup = self.index_to_soup(url_sumario)
+
+        container = soup.find("div", class_="px-16")
+        url = container.find("img")["src"]
+
+        return getattr(self, "cover_url", url)
+
+    def _process_article(self, article):
+        url = article.find("a", href=True, attrs={"class": "title"})["href"]
+        title = self.tag_to_string(article).replace("Editorial", "Editorial: ")
+        try:
+            title, authors = title.split(", por")
+            authors = f"por {authors}"
+        except ValueError:
+            authors = ""
+        self.log("title: ", title, " url: ", url)
+        return {"title": title, "url": url, "description": authors, "date": ""}
+
+    def preprocess_html(self, soup):
+        # cleanup internal references' anchor links, leave the inner text
+        # it would be nice to eventually make the internal links work
+        import re
+
+        for l in soup.find_all(name="a", attrs={"href": re.compile(r"#")}):
+            l.replaceWithChildren()
+
+        return soup
+
     def parse_index(self):
-        articles = []
-        soup = self.index_to_soup(self.INDEX)
-        mylink = soup.find('span', text='Sumario')
-        if mylink is None:
+        soup_index = self.index_to_soup(self.INDEX)
+
+        tag_sumario = soup_index.find("span", text="Sumario")
+
+        if tag_sumario is None:
             return None
-        indexurl = "https://www.eldiplo.org" + mylink.parent['href']
-        self.log(indexurl)
-        parts = indexurl.split('www.eldiplo.org/', 1)
-        series = parts[1].split('-', 1)[0]
-        self.conversion_options.update({'series' : self.title})
-        self.conversion_options.update({'series_index' : series})
-        soupindex = self.index_to_soup(indexurl)
-        totalfeeds = []
+
+        url_sumario = "https://www.eldiplo.org" + tag_sumario.parent["href"]
+        self.log(url_sumario)
+
+        soup_sumario = self.index_to_soup(url_sumario)
+
+        feeds = []
         articles = []
-        for article in soupindex.findAll('a', href=True, attrs={'class':'title'}):
-            url = article['href']
-            title = self.tag_to_string(article)
-            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
-            self.log('title: ', title, ' url: ', url)
-        totalfeeds.append(('Articles',articles))
-        return totalfeeds
+        dossiers = []
+
+        sumario = soup_sumario.find("div", class_="sumario")
+
+        for section in sumario.find_all("div", recursive=False):
+            classes = section.attrs["class"]
+
+            if "dossier" in classes:
+                dtitle = self.tag_to_string(section.find("h3"))
+                darticles = []
+                for article in section.find_all("div", recursive=False):
+                    darticles.append(self._process_article(article))
+                dossiers.append((dtitle, darticles))
+            else:
+                articles.append(self._process_article(section))
+        feeds.append(("Artículos", articles))
+        feeds += dossiers
+
+        return feeds
 
     def get_obfuscated_article(self, url):
         result = None
         count = 0
-        while (count < self.fetch_retries):
+        while count < self.fetch_retries:
             try:
                 response = self.browser.open(url, timeout=self.timeout)
                 html = response.read()
                 count = self.fetch_retries
-                tfile = PersistentTemporaryFile('_fa.html')
+                tfile = PersistentTemporaryFile("_fa.html")
                 tfile.write(html)
                 tfile.close()
                 self.temp_files.append(tfile)