diff --git a/recipes/substack.recipe b/recipes/substack.recipe index 75ea2f8c57..691871145c 100644 --- a/recipes/substack.recipe +++ b/recipes/substack.recipe @@ -6,6 +6,7 @@ # Copyright: Nathan Cook (nathan.cook@gmail.com) ## # Written: 2020-12-18 +# Updated: 2024-11-04 ## __license__ = 'GNU General Public License v3 – https://www.gnu.org/licenses/gpl-3.0.html' @@ -14,6 +15,7 @@ __version__ = 'v0.1.1' __date__ = '2020-12-19' __author__ = 'topynate' +import re import json from calibre.web.feeds.news import BasicNewsRecipe @@ -21,21 +23,36 @@ from mechanize import Request class Substack(BasicNewsRecipe): - title = 'Substack' - __author__ = 'topynate' + title = 'Substack' + __author__ = 'topynate, unkn0wn' + description = 'Use advanced menu if you want to add your own substack handles.' oldest_article = 7 language = 'en' max_articles_per_feed = 100 - auto_cleanup = True + auto_cleanup = True + auto_cleanup_keep = '//*[@class="subtitle"]' needs_subscription = 'optional' use_embedded_content = False + masthead_url = 'https://substack.com/img/substack_wordmark.png' + cover_url = 'https://substack.com/img/substack.png' + extra_css = '.captioned-image-container, .image-container {font-size: small;}' recipe_specific_options = { + 'auths': { + 'short': 'enter the @handles you subscribe to:\nseperated by a space', + 'long': 'julianmacfarlane ianleslie .... ....', + 'default': 'julianmacfarlane ianleslie thesalvo', + }, 'days': { 'short': 'Oldest article to download from this news source. 
In days ', 'long': 'For example, 0.5, gives you articles from the past 12 hours', - 'default': str(oldest_article) - } + 'default': str(oldest_article), + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + }, } def __init__(self, *args, **kwargs): @@ -44,12 +61,12 @@ class Substack(BasicNewsRecipe): if d and isinstance(d, str): self.oldest_article = float(d) -# Every Substack publication has an RSS feed at https://{name}.substack.com/feed. -# The same URL provides either all posts, or all free posts + previews of paid posts, -# depending on whether you're logged in. - feeds = [ - ('Novum Lumen', 'https://novumlumen.substack.com/feed'), # gratuitously self-promotional example - ] + # Every Substack publication has an RSS feed at https://{name}.substack.com/feed. + # The same URL provides either all posts, or all free posts + previews of paid posts, + # depending on whether you're logged in. 
# NOTE: the two definitions below are methods added to the Substack recipe
# class by this change; indent them to class-body level when applying.
def get_feeds(self):
    """Build the list of feed URLs from the user's 'auths' option.

    Reads ``self.recipe_specific_options['auths']`` and, when it is a
    non-empty string, splits it on whitespace into Substack handles. Each
    handle becomes ``https://<handle>.substack.com/feed``.

    Returns:
        A list of feed URL strings; empty when the option is missing, empty,
        or not a string.
    """
    handles = self.recipe_specific_options.get('auths')
    # Only a plain string is treated as user input.
    # NOTE(review): presumably the value is still the declaration dict when
    # the user has not overridden the option -- confirm against calibre.
    if not (handles and isinstance(handles, str)):
        return []

    feeds = []
    for handle in handles.split():
        # Strip the '@' entirely. Replacing it with a space (as before)
        # produced 'https:// name.substack.com/feed', an invalid URL.
        name = handle.replace('@', '')
        if name:  # skip tokens that were nothing but '@'
            feeds.append('https://' + name + '.substack.com/feed')
    return feeds


def preprocess_html(self, soup):
    """Rewrite image widths and strip unwanted elements from an article.

    Args:
        soup: Parsed article document exposing ``findAll`` and whose
            elements support item access and ``extract()``.

    Returns:
        The same ``soup`` object, modified in place.
    """
    # Target width: the user's 'res' option when given as a string, else 600.
    res = '600'
    w = self.recipe_specific_options.get('res')
    if w and isinstance(w, str):
        res = w

    # Swap every 'w_<digits>' segment in the image URL for the chosen width.
    for img in soup.findAll('img', attrs={'src': True}):
        img['src'] = re.sub(r'w_\d+', 'w_' + res, img['src'])

    # Remove <source> and <svg> elements from the document.
    for src in soup.findAll(['source', 'svg']):
        src.extract()

    # Remove elements carrying the 'button-wrapper' class.
    for but in soup.findAll(attrs={'class': ['button-wrapper']}):
        but.extract()
    return soup