# calibre/recipes/el_diplo.recipe

# -*- mode: python; coding: utf-8; -*-
# vim: set syntax=python fileencoding=utf-8

__license__ = 'GPL v3'
__copyright__ = '2023, Tomás Di Domenico <tdido at tdido.eu>'

'''
www.eldiplo.org
'''

from calibre.web.feeds.news import BasicNewsRecipe


class ElDiplo2023(BasicNewsRecipe):
    '''
    Recipe for the current issue of Le Monde Diplomatique (southern cone
    edition), fetched from www.eldiplo.org.

    The issue is located by following the "Sumario" link found on the front
    page. Every entry of the summary becomes an article; entries grouped in a
    "dossier" block become a feed of their own, the rest go into a single
    "Artículos" feed.
    '''

    title = 'Le Monde Diplomatique - cono sur'
    __author__ = 'Tomás Di Domenico'
    description = 'Publicación de Le Monde Diplomatique para el cono sur.'
    publisher = 'Capital Intelectual'
    category = 'News, Politics, Argentina, Uruguay, Paraguay, South America, World'
    oldest_article = 31
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'es_AR'
    remove_empty_feeds = True
    publication_type = 'magazine'
    delay = 1
    simultaneous_downloads = 1
    timeout = 8
    # get_browser() submits the site's login form when credentials are given.
    needs_subscription = True
    ignore_duplicate_articles = {'url'}
    temp_files = []
    fetch_retries = 10
    handle_gzip = True
    compress_news_images = True
    scale_news_images_to_device = True
    masthead_url = (
        'https://www.eldiplo.org/wp-content/themes/_polenta_/assets/diplo.png'
    )
    # Front page; also used as the base for resolving relative links.
    INDEX = 'https://www.eldiplo.org/'

    conversion_options = {'series': 'El Dipló', 'publisher': publisher, 'base_font_size': 8, 'tags': category}

    keep_only_tags = [dict(name=['article'])]

    remove_tags = [dict(name=['button'])]

    extra_css = '''
        .entry-title {
            text-align: center;
        }
        .text-right {
            text-align: right;
        }
        .bajada {
            display: block;
            font-family: sans-serif;
            text-align: center;
            font-size: 110%;
            padding: 2%;
        }
        .Destacado{
            display: block;
            font-size: 120%;
            font-weight: bold;
            font-style: italic;
            padding-left: 10%;
            padding-right: 10%;
        }
    '''

    def get_browser(self):
        '''
        Return a browser that has opened the front page and, when both a
        username and a password are set, submitted the ``loginform`` form.
        '''
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.select_form(id='loginform')
            br['log'] = self.username
            br['pwd'] = self.password
            br.submit()
        return br

    def _absolute_url(self, href):
        '''Resolve ``href`` against INDEX; absolute URLs are returned unchanged.'''
        # Imported locally so the recipe does not depend on a module-level import.
        from urllib.parse import urljoin

        return urljoin(self.INDEX, href)

    def _find_sumario_url(self):
        '''
        Return the absolute URL of the current issue's summary page, or None
        when the "Sumario" link cannot be found on the front page.
        '''
        soup_index = self.index_to_soup(self.INDEX)
        tag_sumario = soup_index.find('span', text='Sumario')
        if tag_sumario is None or tag_sumario.parent is None:
            return None
        # The link is the element that wraps the "Sumario" span.
        href = tag_sumario.parent.get('href')
        if not href:
            return None
        return self._absolute_url(href)

    def get_cover_url(self):
        '''
        Return the cover image URL.

        A non-empty ``cover_url`` attribute takes precedence. Otherwise the
        first image inside the ``px-16`` container of the summary page is
        used. Returns None when no cover can be determined.
        '''
        # Check the override first: it avoids two page downloads, and it also
        # covers the case where the attribute exists but is None, in which
        # a plain getattr() default would never be used.
        configured = getattr(self, 'cover_url', None)
        if configured:
            return configured

        url_sumario = self._find_sumario_url()
        if url_sumario is None:
            self.log.warn('Could not find the "Sumario" link, skipping cover')
            return None

        soup = self.index_to_soup(url_sumario)
        container = soup.find('div', class_='px-16')
        img = container.find('img') if container is not None else None
        src = img.get('src') if img is not None else None
        if not src:
            self.log.warn('Could not find the cover image in', url_sumario)
            return None

        return self._absolute_url(src)

    def _process_article(self, article):
        '''
        Build an article dict from one summary entry.

        :param article: tag holding the entry; its text is expected to look
            like ``<title>, por <authors>``.
        :return: dict with ``title``, ``url``, ``description`` (the authors,
            or an empty string) and ``date`` keys, or None when the entry has
            no title link.
        '''
        link = article.find('a', href=True, attrs={'class': 'title'})
        if link is None:
            return None
        url = self._absolute_url(link['href'])

        text = self.tag_to_string(article).replace('Editorial', 'Editorial: ')
        # Split on the LAST ', por' so a title that itself contains the
        # separator still yields its trailing author list.
        title, sep, authors = text.rpartition(', por')
        if sep:
            authors = f'por {authors.strip()}'
        else:
            # No separator: rpartition puts the whole text in the last slot.
            title, authors = text, ''
        title = title.strip()

        self.log('title: ', title, ' url: ', url)
        return {'title': title, 'url': url, 'description': authors, 'date': ''}

    def preprocess_html(self, soup):
        '''Shrink the footnote paragraphs of an article, when it has any.'''
        font_size = '90%'

        footnotes = soup.find('div', id='nota_pie')
        if footnotes is not None:
            # make the footnotes smaller
            for p in footnotes.find_all('p', recursive=False):
                p['style'] = f'font-size: {font_size};'

        return soup

    def parse_index(self):
        '''
        Build the feed list from the summary page of the current issue.

        :return: list of ``(feed title, [article dict, ...])`` tuples: the
            "Artículos" feed first, followed by one feed per dossier. An
            empty list is returned when the summary cannot be located.
        '''
        url_sumario = self._find_sumario_url()
        if url_sumario is None:
            self.log.warn('Could not find the "Sumario" link on', self.INDEX)
            return []
        self.log(url_sumario)

        soup_sumario = self.index_to_soup(url_sumario)
        sumario = soup_sumario.find('div', class_='sumario')
        if sumario is None:
            self.log.warn('Could not find the summary container in', url_sumario)
            return []

        articles = []
        dossiers = []

        for section in sumario.find_all('div', recursive=False):
            # A div without a class attribute is treated as a plain article.
            classes = section.get('class') or []

            if 'dossier' in classes:
                dtitle = self.tag_to_string(section.find('h3'))
                entries = (
                    self._process_article(article)
                    for article in section.find_all('div', recursive=False)
                )
                dossiers.append((dtitle, [e for e in entries if e is not None]))
            else:
                entry = self._process_article(section)
                if entry is not None:
                    articles.append(entry)

        return [('Artículos', articles)] + dossiers