Update recipe for El Diplo (Le Monde Diplomatique Cono Sur)

Complete rewrite of the recipe, taking a more conservative and minimalistic
approach. It updates the look and feel to closely match that of the
original publication, and adds internal article images.
It also fixes internal links, which are now fully functional.
This commit is contained in:
Tomás Di Domenico 2024-02-05 17:32:57 +01:00
parent a725dd21ac
commit 3ad8452879

View File

@ -14,10 +14,10 @@ from calibre.web.feeds.news import BasicNewsRecipe
class ElDiplo2023(BasicNewsRecipe): class ElDiplo2023(BasicNewsRecipe):
title = "Le Monde Diplomatique - cono sur" title = "Le Monde Diplomatique - cono sur"
__author__ = "Darko Miletic and Tomás Di Domenico" __author__ = "Tomás Di Domenico"
description = "Publicación de Le Monde Diplomatique para el cono sur." description = "Publicación de Le Monde Diplomatique para el cono sur."
publisher = "Capital Intelectual" publisher = "Capital Intelectual"
category = "news, politics, Argentina, Uruguay, Paraguay, South America, World" category = "News, Politics, Argentina, Uruguay, Paraguay, South America, World"
oldest_article = 31 oldest_article = 31
no_stylesheets = True no_stylesheets = True
encoding = "utf8" encoding = "utf8"
@ -25,13 +25,11 @@ class ElDiplo2023(BasicNewsRecipe):
language = "es_AR" language = "es_AR"
remove_empty_feeds = True remove_empty_feeds = True
publication_type = "magazine" publication_type = "magazine"
auto_cleanup = True
delay = 1 delay = 1
simultaneous_downloads = 1 simultaneous_downloads = 1
timeout = 8 timeout = 8
needs_subscription = True needs_subscription = True
ignore_duplicate_articles = {"url"} ignore_duplicate_articles = {"url"}
articles_are_obfuscated = True
temp_files = [] temp_files = []
fetch_retries = 10 fetch_retries = 10
handle_gzip = True handle_gzip = True
@ -42,6 +40,36 @@ class ElDiplo2023(BasicNewsRecipe):
) )
INDEX = "https://www.eldiplo.org/" INDEX = "https://www.eldiplo.org/"
conversion_options = {"series": "El Dipló", "publisher": publisher, "base_font_size": 8, "tags": category}
keep_only_tags = [dict(name=["article"])]
remove_tags = [dict(name=["button"])]
extra_css = """
.entry-title {
text-align: center;
}
.text-right {
text-align: right;
}
.bajada {
display: block;
font-family: sans-serif;
text-align: center;
font-size: 110%;
padding: 2%;
}
.Destacado{
display: block;
font-size: 120%;
font-weight: bold;
font-style: italic;
padding-left: 10%;
padding-right: 10%;
}
"""
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
br.open(self.INDEX) br.open(self.INDEX)
@ -76,12 +104,13 @@ class ElDiplo2023(BasicNewsRecipe):
return {"title": title, "url": url, "description": authors, "date": ""} return {"title": title, "url": url, "description": authors, "date": ""}
def preprocess_html(self, soup): def preprocess_html(self, soup):
# cleanup internal references' anchor links, leave the inner text
# it would be nice to eventually make the internal links work
import re import re
for l in soup.find_all(name="a", attrs={"href": re.compile(r"#")}): font_size = "90%"
l.replaceWithChildren()
# make the footnotes smaller
for p in soup.find("div", id="nota_pie").findChildren("p", recursive=False):
p["style"] = f"font-size: {font_size};"
return soup return soup
@ -119,21 +148,3 @@ class ElDiplo2023(BasicNewsRecipe):
feeds += dossiers feeds += dossiers
return feeds return feeds
def get_obfuscated_article(self, url):
    """Download *url* with retries and return the path of a temp file holding it.

    The page is fetched through ``self.browser`` using ``self.timeout``; up to
    ``self.fetch_retries`` attempts are made. On success the raw response body
    is written to a ``PersistentTemporaryFile`` (suffix ``_fa.html``), the
    file object is recorded in ``self.temp_files``, and its name is returned.

    Returns ``None`` when every download attempt fails.
    """
    for _attempt in range(self.fetch_retries):
        try:
            response = self.browser.open(url, timeout=self.timeout)
            html = response.read()
        except Exception:
            # Only ordinary errors trigger a retry; KeyboardInterrupt and
            # SystemExit must propagate so the user can abort the download.
            self.info("Retrying download...")
            continue

        tfile = PersistentTemporaryFile("_fa.html")
        try:
            tfile.write(html)
        finally:
            # Always release the handle, even if the write fails.
            tfile.close()
        self.temp_files.append(tfile)
        return tfile.name

    return None