calibre/recipes/el_diplo.recipe

# -*- mode: python; coding: utf-8; -*-
# vim: set syntax=python fileencoding=utf-8

__license__ = "GPL v3"
__copyright__ = "2023, Tomás Di Domenico <tdido at tdido.eu>"

"""
www.eldiplo.org
"""

from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class ElDiplo2023(BasicNewsRecipe):
    """Recipe for the southern-cone edition of Le Monde Diplomatique.

    The recipe logs in to www.eldiplo.org, follows the "Sumario" link on the
    home page to the current issue's table of contents, and builds one feed
    with the stand-alone articles plus one feed per dossier.
    """

    title = "Le Monde Diplomatique - cono sur"
    __author__ = "Tomás Di Domenico"
    description = "Publicación de Le Monde Diplomatique para el cono sur."
    publisher = "Capital Intelectual"
    category = "News, Politics, Argentina, Uruguay, Paraguay, South America, World"
    oldest_article = 31
    no_stylesheets = True
    encoding = "utf8"
    use_embedded_content = False
    language = "es_AR"
    remove_empty_feeds = True
    publication_type = "magazine"
    delay = 1
    simultaneous_downloads = 1
    timeout = 8
    needs_subscription = True
    ignore_duplicate_articles = {"url"}
    temp_files = []
    fetch_retries = 10
    handle_gzip = True
    compress_news_images = True
    scale_news_images_to_device = True
    masthead_url = (
        "https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png"
    )
    # Home page; also used as the base for resolving relative links.
    INDEX = "https://www.eldiplo.org/"

    conversion_options = {
        "series": "El Dipló",
        "publisher": publisher,
        "base_font_size": 8,
        "tags": category,
    }

    keep_only_tags = [dict(name=["article"])]

    remove_tags = [dict(name=["button"])]

    extra_css = """
        .entry-title {
            text-align: center;
        }
        .text-right {
            text-align: right;
        }
        .bajada {
            display: block;
            font-family: sans-serif;
            text-align: center;
            font-size: 110%;
            padding: 2%;
        }
        .Destacado{
            display: block;
            font-size: 120%;
            font-weight: bold;
            font-style: italic;
            padding-left: 10%;
            padding-right: 10%;
        }
    """

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        The home page is opened first because the login form
        (``id="loginform"``) is looked up on it.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.select_form(id="loginform")
            br["log"] = self.username
            br["pwd"] = self.password
            br.submit()
        return br

    def _get_sumario_url(self):
        """Return the absolute URL of the current "Sumario" page, or ``None``.

        The link is found through the ``<span>`` whose text is "Sumario" on
        the home page; its parent element is expected to carry the ``href``.
        """
        soup_index = self.index_to_soup(self.INDEX)
        tag_sumario = soup_index.find("span", text="Sumario")
        if tag_sumario is None or tag_sumario.parent is None:
            return None
        href = tag_sumario.parent.get("href")
        if not href:
            return None
        # urljoin copes with both site-relative and absolute hrefs.
        return urljoin(self.INDEX, href)

    def get_cover_url(self):
        """Return the cover image URL of the current issue, or ``None``.

        A ``cover_url`` attribute set on the recipe takes precedence. Otherwise
        the first image inside the ``div.px-16`` container of the summary page
        is used. ``None`` is returned when the page layout does not match.
        """
        configured = getattr(self, "cover_url", None)
        if configured:
            return configured

        url_sumario = self._get_sumario_url()
        if url_sumario is None:
            self.log("Sumario link not found; no cover available")
            return None

        soup = self.index_to_soup(url_sumario)
        container = soup.find("div", class_="px-16")
        image = container.find("img") if container is not None else None
        src = image.get("src") if image is not None else None
        if not src:
            self.log("Cover image not found in", url_sumario)
            return None
        return urljoin(url_sumario, src)

    @staticmethod
    def _split_title(text):
        """Split a summary entry into ``(title, byline)``.

        Entries look like ``"<title>, por <authors>"``. The split happens at
        the *last* ``", por"`` so that a title containing the same sequence is
        kept intact. When there is no byline the second item is ``""``.
        """
        text = text.strip()
        head, sep, tail = text.rpartition(", por")
        tail = tail.strip()
        if not sep or not tail:
            return text, ""
        return head.strip(), f"por {tail}"

    def _process_article(self, article):
        """Build the article dict for one summary entry.

        Returns ``None`` (after logging) when the entry has no
        ``<a class="title" href=...>`` link, so callers can skip it.
        """
        link = article.find("a", href=True, attrs={"class": "title"})
        if link is None:
            self.log("Skipping summary entry without a title link")
            return None
        url = urljoin(self.INDEX, link["href"])
        text = self.tag_to_string(article).replace("Editorial", "Editorial: ")
        title, authors = self._split_title(text)
        self.log("title: ", title, " url: ", url)
        return {"title": title, "url": url, "description": authors, "date": ""}

    def preprocess_html(self, soup):
        """Shrink the footnote paragraphs of an article, if it has any."""
        font_size = "90%"

        footnotes = soup.find("div", id="nota_pie")
        if footnotes is None:
            # Not every article carries a footnote block.
            return soup

        # make the footnotes smaller
        for p in footnotes.find_all("p", recursive=False):
            p["style"] = f"font-size: {font_size};"

        return soup

    def parse_index(self):
        """Return the feeds of the current issue.

        The result is a list of ``(feed_title, [article_dict, ...])`` tuples:
        first "Artículos" with the stand-alone entries, then one feed per
        dossier in page order. An empty list is returned when the summary
        page cannot be located or has an unexpected layout.
        """
        url_sumario = self._get_sumario_url()
        if url_sumario is None:
            self.log("Sumario link not found on", self.INDEX)
            return []
        self.log(url_sumario)

        soup_sumario = self.index_to_soup(url_sumario)
        sumario = soup_sumario.find("div", class_="sumario")
        if sumario is None:
            self.log("Summary container not found in", url_sumario)
            return []

        articles = []
        dossiers = []

        for section in sumario.find_all("div", recursive=False):
            # A <div> without a class attribute is treated as a plain entry.
            classes = section.get("class") or []

            if "dossier" in classes:
                dtitle = self.tag_to_string(section.find("h3"))
                entries = (
                    self._process_article(article)
                    for article in section.find_all("div", recursive=False)
                )
                darticles = [entry for entry in entries if entry is not None]
                dossiers.append((dtitle, darticles))
            else:
                entry = self._process_article(section)
                if entry is not None:
                    articles.append(entry)

        return [("Artículos", articles), *dossiers]