calibre/recipes/project_syndicate.recipe
un-pogaz 41cee6f02d various whitespace (auto-fix)
ruff 'E201,E202,E211,E251,E275'
2025-01-24 11:14:24 +01:00

99 lines
4.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
import random
from calibre.scraper.simple import read_url
from calibre.web.feeds.news import BasicNewsRecipe
class projectsynd(BasicNewsRecipe):
    '''
    Downloads Project Syndicate commentaries via archive.* mirror sites,
    since the articles themselves are paywalled (articles_are_obfuscated).
    '''

    title = 'Project Syndicate'
    __author__ = 'unkn0wn'
    description = (
        'Project Syndicate produces and delivers original, high-quality commentaries to a global audience. '
        'Featuring exclusive contributions by prominent political leaders, policymakers, scholars, business '
        'leaders, and civic activists from around the world, we provide news media and their readers with cutting-edge '
        'analysis and insight, regardless of ability to pay. Our membership includes over 500 media outlets more '
        'than half of which receive our commentaries for free or at subsidized rates in 156 countries. '
        'Download Weekly.'
    )
    language = 'en'
    no_stylesheets = True
    remove_javascript = True
    masthead_url = 'https://www.project-syndicate.org/images/PS_Logotype.svg'
    ignore_duplicate_articles = {'title', 'url'}
    resolve_internal_links = True
    remove_empty_feeds = True
    remove_attributes = ['style', 'height', 'width']
    oldest_article = 7  # days
    storage = []  # scraper session state shared with read_url()
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        '''Fetch the article through a randomly picked archive.* mirror,
        with any query string stripped from the target URL.'''
        base = url.split('?')[0]
        mirror = random.choice(('fo', 'is', 'li', 'md', 'ph', 'vn'))
        page = read_url(self.storage, f'https://archive.{mirror}/latest/{base}')
        return {'data': page, 'url': base}

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Let the user override the article-age cutoff at download time.
        days = self.recipe_specific_options.get('days')
        if days and isinstance(days, str):
            self.oldest_article = float(days)

    extra_css = '''
        [itemprop^="associatedMedia"]{ font-size:small; text-align:center; }
        [itemprop="author"], time { font-size:small; }
        .sub, em { font-style:italic; color:#202020; }
        [data-page-area="article-bottom"] { font-size:small; color:#202020; }
    '''

    keep_only_tags = [
        dict(attrs={'itemprop': lambda x: x and 'associatedMedia' in x.split()}),
        dict(attrs={'itemprop': ['headline', 'datePublished', 'author', 'abstract', 'articleBody']}),
        dict(name='aside', attrs={'data-page-area': 'article-bottom'})
    ]

    remove_tags = [
        dict(name=['button', 'svg', 'source']),
        dict(attrs={'data-message-area': True}),
        dict(attrs={'id': ['editorspicks', 'movie_player']}),
        dict(name='aside', attrs={'id': lambda x: x and x.startswith('comments-')})
    ]

    feeds = [
        ('Economics & Finance', 'https://www.project-syndicate.org/rss/section/economics'),
        ('Politics & World Affairs', 'https://www.project-syndicate.org/rss/section/politics-world-affairs'),
        ('On-Point', 'https://www.project-syndicate.org/rss/onpoint'),
        ('Sustainability Now', 'https://www.project-syndicate.org/rss/section/environment-sustainability'),
        ('Smart Development', 'https://www.project-syndicate.org/rss/section/global-health-development'),
        ('Culture & Society', 'https://www.project-syndicate.org/rss/section/culture-society'),
        ('Innovation & Technology', 'https://www.project-syndicate.org/rss/section/innovation-technology'),
    ]

    def preprocess_html(self, soup):
        '''Normalize the archived page: demote headings, upgrade lazy-loaded
        images, style the abstract, flatten body wrappers, and restore the
        original link targets that the archive mirror rewrote.'''
        for heading in soup.findAll('h2'):
            heading.name = 'h4'
        # 'old-src' holds the pre-rewrite image URL; request the larger size.
        for image in soup.findAll('img', attrs={'old-src': True}):
            image['src'] = image['old-src'].replace('medium', 'xlarge')
        abstract = soup.find(attrs={'itemprop': 'abstract'})
        if abstract:
            inner = abstract.find('div')
            if inner:
                inner.name = 'p'
                inner['class'] = 'sub'
        if body := soup.find(attrs={'itemprop': 'articleBody'}):
            for wrapper in body.findAll('div', recursive=False):
                wrapper.name = 'p'
        # Mirror links embed the real URL after the mirror prefix; keep only
        # the final http... segment.
        for anchor in soup.findAll('a', href=True):
            anchor['href'] = 'http' + anchor['href'].split('http')[-1]
        return soup