Update substack.recipe

This commit is contained in:
unkn0w7n 2024-11-04 18:29:29 +05:30
parent 791f495389
commit 5f429c3a2d

View File

@ -6,6 +6,7 @@
# Copyright: Nathan Cook (nathan.cook@gmail.com) # Copyright: Nathan Cook (nathan.cook@gmail.com)
## ##
# Written: 2020-12-18 # Written: 2020-12-18
# Updated: 2024-11-04
## ##
__license__ = 'GNU General Public License v3 https://www.gnu.org/licenses/gpl-3.0.html' __license__ = 'GNU General Public License v3 https://www.gnu.org/licenses/gpl-3.0.html'
@ -14,6 +15,7 @@ __version__ = 'v0.1.1'
__date__ = '2020-12-19' __date__ = '2020-12-19'
__author__ = 'topynate' __author__ = 'topynate'
import re
import json import json
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -22,20 +24,35 @@ from mechanize import Request
class Substack(BasicNewsRecipe): class Substack(BasicNewsRecipe):
title = 'Substack' title = 'Substack'
__author__ = 'topynate' __author__ = 'topynate, unkn0wn'
description = 'Use advanced menu if you want to add your own substack handles.'
oldest_article = 7 oldest_article = 7
language = 'en' language = 'en'
max_articles_per_feed = 100 max_articles_per_feed = 100
auto_cleanup = True auto_cleanup = True
auto_cleanup_keep = '//*[@class="subtitle"]'
needs_subscription = 'optional' needs_subscription = 'optional'
use_embedded_content = False use_embedded_content = False
masthead_url = 'https://substack.com/img/substack_wordmark.png'
cover_url = 'https://substack.com/img/substack.png'
extra_css = '.captioned-image-container, .image-container {font-size: small;}'
recipe_specific_options = { recipe_specific_options = {
'auths': {
'short': 'enter the @handles you subscribe to:\nseperated by a space',
'long': 'julianmacfarlane ianleslie .... ....',
'default': 'julianmacfarlane ianleslie thesalvo',
},
'days': { 'days': {
'short': 'Oldest article to download from this news source. In days ', 'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours', 'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article) 'default': str(oldest_article),
} },
'res': {
'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
'default': '600',
},
} }
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -47,9 +64,9 @@ class Substack(BasicNewsRecipe):
# Every Substack publication has an RSS feed at https://{name}.substack.com/feed. # Every Substack publication has an RSS feed at https://{name}.substack.com/feed.
# The same URL provides either all posts, or all free posts + previews of paid posts, # The same URL provides either all posts, or all free posts + previews of paid posts,
# depending on whether you're logged in. # depending on whether you're logged in.
feeds = [ # feeds = [
('Novum Lumen', 'https://novumlumen.substack.com/feed'), # gratuitously self-promotional example # ('Novum Lumen', 'https://novumlumen.substack.com/feed'), # gratuitously self-promotional example
] # ]
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self)
@ -70,3 +87,24 @@ class Substack(BasicNewsRecipe):
if res.getcode() != 200: if res.getcode() != 200:
raise ValueError('Login failed, check username and password') raise ValueError('Login failed, check username and password')
return br return br
def get_feeds(self):
    """Build the feed URL list from the 'auths' recipe-specific option.

    The option value is read from ``self.recipe_specific_options`` and is
    treated as a whitespace-separated string of Substack handles, each
    optionally written with an '@' (e.g. ``'@alice bob'``).

    Returns:
        A list of ``https://<handle>.substack.com/feed`` URLs, in the order
        the handles were given. The list is empty when the option is
        missing, empty, or not a string.
    """
    handles = self.recipe_specific_options.get('auths')
    if not (handles and isinstance(handles, str)):
        return []
    feeds = []
    for token in handles.split():
        # Remove the '@' outright; replacing it with a space would leave
        # whitespace inside the hostname and produce an unusable URL.
        name = token.replace('@', '')
        if name:  # a bare '@' carries no handle, so there is no feed to add
            feeds.append('https://' + name + '.substack.com/feed')
    return feeds
def preprocess_html(self, soup):
    """Adjust image widths and strip unwanted elements from an article.

    Every ``w_<digits>`` token in the ``src`` of each ``<img>`` is rewritten
    to the width taken from the 'res' recipe-specific option, falling back
    to '600' when that option is missing, empty, or not a string. After
    that, all ``<source>`` and ``<svg>`` elements are removed, followed by
    every element whose class is ``button-wrapper``.

    Returns:
        The same ``soup`` object, modified in place.
    """
    chosen = self.recipe_specific_options.get('res')
    width_token = 'w_' + (chosen if chosen and isinstance(chosen, str) else '600')

    for image in soup.findAll('img', attrs={'src': True}):
        image['src'] = re.sub(r'w_\d+', width_token, image['src'])

    # Each removal pass is looked up and applied in turn, so the second
    # search only runs after the first batch has been extracted.
    removal_queries = (
        ((['source', 'svg'],), {}),
        ((), {'attrs': {'class': ['button-wrapper']}}),
    )
    for positional, keywords in removal_queries:
        for node in soup.findAll(*positional, **keywords):
            node.extract()

    return soup