From 3ad8452879fae5d8c94042263c01300788fa01bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Di=20Domenico?= Date: Mon, 5 Feb 2024 17:32:57 +0100 Subject: [PATCH] Update recipe for El Diplo (Le Monde Diplomatique Cono Sur) Complete rewrite of the recipe. A more conservative and minimalistic approach, it updates the look and feel to closely match the one from the original publication, including adding internal article images. It also fixes internal links which are now fully functional. --- recipes/el_diplo.recipe | 63 ++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/recipes/el_diplo.recipe b/recipes/el_diplo.recipe index 9a8c621428..f5142679a3 100644 --- a/recipes/el_diplo.recipe +++ b/recipes/el_diplo.recipe @@ -14,10 +14,10 @@ from calibre.web.feeds.news import BasicNewsRecipe class ElDiplo2023(BasicNewsRecipe): title = "Le Monde Diplomatique - cono sur" - __author__ = "Darko Miletic and Tomás Di Domenico" + __author__ = "Tomás Di Domenico" description = "Publicación de Le Monde Diplomatique para el cono sur." 
publisher = "Capital Intelectual" - category = "news, politics, Argentina, Uruguay, Paraguay, South America, World" + category = "News, Politics, Argentina, Uruguay, Paraguay, South America, World" oldest_article = 31 no_stylesheets = True encoding = "utf8" @@ -25,13 +25,11 @@ class ElDiplo2023(BasicNewsRecipe): language = "es_AR" remove_empty_feeds = True publication_type = "magazine" - auto_cleanup = True delay = 1 simultaneous_downloads = 1 timeout = 8 needs_subscription = True ignore_duplicate_articles = {"url"} - articles_are_obfuscated = True temp_files = [] fetch_retries = 10 handle_gzip = True @@ -42,6 +40,36 @@ class ElDiplo2023(BasicNewsRecipe): ) INDEX = "https://www.eldiplo.org/" + conversion_options = {"series": "El Dipló", "publisher": publisher, "base_font_size": 8, "tags": category} + + keep_only_tags = [dict(name=["article"])] + + remove_tags = [dict(name=["button"])] + + extra_css = """ + .entry-title { + text-align: center; + } + .text-right { + text-align: right; + } + .bajada { + display: block; + font-family: sans-serif; + text-align: center; + font-size: 110%; + padding: 2%; + } + .Destacado{ + display: block; + font-size: 120%; + font-weight: bold; + font-style: italic; + padding-left: 10%; + padding-right: 10%; + } + """ + def get_browser(self): br = BasicNewsRecipe.get_browser(self) br.open(self.INDEX) @@ -76,12 +104,13 @@ class ElDiplo2023(BasicNewsRecipe): return {"title": title, "url": url, "description": authors, "date": ""} def preprocess_html(self, soup): - # cleanup internal references' anchor links, leave the inner text - # it would be nice to eventually make the internal links work import re - for l in soup.find_all(name="a", attrs={"href": re.compile(r"#")}): - l.replaceWithChildren() + font_size = "90%" + + # make the footnotes smaller + for p in soup.find("div", id="nota_pie").findChildren("p", recursive=False): + p["style"] = f"font-size: {font_size};" return soup @@ -119,21 +148,3 @@ class ElDiplo2023(BasicNewsRecipe): feeds 
+= dossiers return feeds - - def get_obfuscated_article(self, url): - result = None - count = 0 - while count < self.fetch_retries: - try: - response = self.browser.open(url, timeout=self.timeout) - html = response.read() - count = self.fetch_retries - tfile = PersistentTemporaryFile("_fa.html") - tfile.write(html) - tfile.close() - self.temp_files.append(tfile) - result = tfile.name - except: - self.info("Retrying download...") - count += 1 - return result