calibre/recipes/horizons.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://www.cirsd.org/en/horizons
'''

from calibre.web.feeds.news import BasicNewsRecipe, classes


class horizons(BasicNewsRecipe):
    '''
    Recipe for Horizons, the journal published at https://www.cirsd.org.

    Downloads a single issue: the one named by the ``issue_url`` recipe
    specific option when it is supplied, otherwise the first issue linked
    from the journal's index page.
    '''
    title = 'Horizons'
    __author__ = 'unkn0wn'
    description = (
        ' Horizons – Journal of International Relations and Sustainable Development.'
        ' Horizons serves as a high-level platform for influential voices from around the world to'
        ' provide informed analysis and conduct reasoned exchanges on the full spectrum of issues'
        ' that shape international developments.'
    )
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.cirsd.org/bundles/olpublic/images/horizons-logo.jpg'
    ignore_duplicate_articles = {'url'}
    extra_css = 'em{color:#202020;}'
    simultaneous_downloads = 1

    # Only the article container is kept; navigation and footer are dropped.
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
    remove_tags = [
        classes('back-link'),
        dict(name='div', attrs={'class': 'single-post-footer'}),
    ]

    recipe_specific_options = {
        'issue_url': {
            'short': 'The issue URL ',
            'long': 'For example, https://www.cirsd.org/en/horizons/horizons-winter-2024--issue-no-25',
        }
    }

    def preprocess_raw_html(self, raw, *a):
        '''Strip paragraphs that contain nothing but a non-breaking space.'''
        return raw.replace('<p>&nbsp;</p>', '').replace('<p dir="ltr">&nbsp;</p>', '')

    def get_browser(self):
        '''Return the recipe browser with TLS certificate verification turned off.'''
        # NOTE(review): verification is disabled for every request made by this
        # recipe; presumably the site's certificate fails validation - confirm
        # this is still required before keeping it.
        return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)

    @staticmethod
    def _absolute_url(href, page_url):
        '''Resolve ``href`` (absolute, root-relative or relative) against ``page_url``.'''
        from urllib.parse import urljoin
        return urljoin(page_url, href)

    def _latest_issue_url(self):
        '''
        Return the absolute URL of the first issue linked from the index page.

        Also sets ``self.cover_url`` from the issue thumbnail when one is
        present. Raises ValueError when the issue link cannot be found.
        '''
        index_url = 'https://www.cirsd.org/en/horizons'
        soup = self.index_to_soup(index_url)
        link = soup.find('a', href=True, attrs={'class': 'horizon-gallery-box'})
        if link is None:
            # Fail with a readable message instead of a TypeError on None.
            raise ValueError('Could not find the latest issue link on ' + index_url)
        img = link.find('img')
        src = img.get('src') if img is not None else None
        if src:
            # The thumbnail may be site-relative just like the hrefs are.
            self.cover_url = self._absolute_url(src, index_url)
            self.log(self.cover_url)
        return self._absolute_url(link['href'], index_url)

    def _section_articles(self, container, page_url):
        '''Return the article dicts for every usable list item in ``container``.'''
        articles = []
        for li in container.findAll('li', attrs={'class': 'mb-2'}):
            link = li.find('a', href=True)
            if link is None:
                # A list item without a link has nothing to download.
                continue
            article_url = self._absolute_url(link['href'], page_url)
            title = self.tag_to_string(link)
            span = li.find('span', attrs={'class': 'section-author'})
            desc = self.tag_to_string(span).strip() if span else ''
            self.log('\t', title, '\n\t', desc, '\n\t\t', article_url)
            articles.append({'title': title, 'url': article_url, 'description': desc})
        return articles

    def parse_index(self):
        '''
        Build the list of feeds for one issue.

        Returns a list of ``(section name, articles)`` tuples, where each
        article is a dict with ``title``, ``url`` and ``description`` keys.
        Sections without any articles are omitted. Raises ValueError when no
        ``issue_url`` option is given and the latest issue cannot be located.
        '''
        option = self.recipe_specific_options.get('issue_url')
        issue_url = option.strip() if isinstance(option, str) else ''
        if not issue_url:
            issue_url = self._latest_issue_url()

        # Derive the book title from the last path segment; a trailing slash
        # would otherwise yield an empty title.
        slug = issue_url.rstrip('/').split('/')[-1]
        if slug:
            self.title = slug.replace('-', ' ').title()
        self.log('Downloading Issue: ', self.title)
        soup = self.index_to_soup(issue_url)

        feeds = []
        for section in soup.findAll('h2', attrs={'class': 'mt-3'}):
            secname = self.tag_to_string(section).strip()
            self.log(secname)
            container = section.findNext('div', attrs={'class': 'mb-3'})
            if container is None:
                # Heading with no article list after it: skip, do not abort.
                continue
            articles = self._section_articles(container, issue_url)
            if articles:
                feeds.append((secname, articles))
        return feeds