calibre/recipes/horizons.recipe
2025-03-27 22:06:31 +05:30

85 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://www.cirsd.org/en/horizons
'''
from calibre.web.feeds.news import BasicNewsRecipe, classes
class horizons(BasicNewsRecipe):
    '''
    Recipe that downloads one issue of Horizons from https://www.cirsd.org.

    Without any option the first issue box found on the journal landing page
    is downloaded (presumably the most recent issue - verify against the
    site). A specific issue can be selected with the ``issue_url``
    recipe-specific option.
    '''

    title = 'Horizons'
    __author__ = 'unkn0wn'
    description = (
        ' Horizons Journal of International Relations and Sustainable Development.'
        ' Horizons serves as a high-level platform for influential voices from around the world to'
        ' provide informed analysis and conduct reasoned exchanges on the full spectrum of issues'
        ' that shape international developments.'
    )
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'en'
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.cirsd.org/bundles/olpublic/images/horizons-logo.jpg'
    ignore_duplicate_articles = {'url'}
    extra_css = 'em{color:#202020;}'
    simultaneous_downloads = 1

    # Only the article container is kept; the back link and the post footer
    # inside it are dropped.
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
    remove_tags = [
        classes('back-link'),
        dict(name='div', attrs={'class': 'single-post-footer'}),
    ]

    # Declaration of the options a user may pass to this recipe. Each entry
    # maps the option name to its short and long help texts.
    recipe_specific_options = {
        'issue_url': {
            'short': 'The issue URL ',
            'long': 'For example, https://www.cirsd.org/en/horizons/horizons-winter-2024--issue-no-25',
        }
    }

    # Root of the site, prepended to site-relative links.
    _cirsd_root = 'https://www.cirsd.org'

    def _absolute_url(self, url):
        '''Return ``url`` with the site root prepended when it starts with ``/``.'''
        if url.startswith('/'):
            return self._cirsd_root + url
        return url

    def preprocess_raw_html(self, raw, *a):
        '''Remove paragraphs that contain nothing but a non-breaking space.'''
        return raw.replace('<p>&nbsp;</p>', '').replace('<p dir="ltr">&nbsp;</p>', '')

    def get_browser(self):
        '''Return the base class browser with SSL certificate verification disabled.'''
        # NOTE(review): presumably needed because the site's certificate does
        # not validate - confirm this is still required.
        return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)

    def parse_index(self):
        '''
        Build the feeds for one issue.

        Returns a list of ``(section name, articles)`` tuples. Each article is
        a dict with the keys ``title``, ``url`` and ``description``. Sections
        in which no article was found are left out.

        Side effects: sets ``self.title`` from the last path component of the
        issue URL and, when the issue is discovered from the landing page,
        sets ``self.cover_url`` from the image inside the issue box.
        '''
        # The class-level declaration stores a dict of help texts under
        # 'issue_url', so only a string value is treated as a URL.
        option = self.recipe_specific_options.get('issue_url')
        issue_url = option.strip() if isinstance(option, str) else ''

        if not issue_url:
            index = self.index_to_soup(self._cirsd_root + '/en/horizons')
            box = index.find('a', href=True, attrs={'class': 'horizon-gallery-box'})
            if box is None:
                # Fail with a readable message instead of a TypeError on None.
                self.abort_recipe_processing(
                    'Could not find an issue link on ' + self._cirsd_root + '/en/horizons'
                )
            issue_url = self._absolute_url(box['href'])

            # The cover is optional: an issue box without a usable image must
            # not prevent the download.
            img = box.find('img')
            cover = img.get('src') if img is not None else None
            if cover:
                self.cover_url = self._absolute_url(cover)
                self.log(self.cover_url)

        # Derive the title from the URL slug. Trailing slashes are removed so
        # the slug is not empty, and runs of dashes collapse to a single space.
        slug = issue_url.rstrip('/').split('/')[-1]
        issue_title = ' '.join(slug.replace('-', ' ').split()).title()
        if issue_title:
            self.title = issue_title
        self.log('Downloading Issue: ', self.title)

        soup = self.index_to_soup(issue_url)
        feeds = []
        for section in soup.findAll('h2', attrs={'class': 'mt-3'}):
            secname = self.tag_to_string(section).strip()
            self.log(secname)

            container = section.findNext('div', attrs={'class': 'mb-3'})
            if container is None:
                # Heading without an article list after it: nothing to collect.
                continue

            articles = []
            for li in container.findAll('li', attrs={'class': 'mb-2'}):
                link = li.find('a', href=True)
                if link is None:
                    continue
                article_url = self._absolute_url(link['href'])
                title = self.tag_to_string(link).strip()
                author = li.find('span', attrs={'class': 'section-author'})
                desc = self.tag_to_string(author).strip() if author else ''
                self.log('\t', title, '\n\t', desc, '\n\t\t', article_url)
                articles.append({'title': title, 'url': article_url, 'description': desc})

            if articles:
                feeds.append((secname, articles))
        return feeds