calibre/recipes/substack.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# Title:        Substack
# License:      GNU General Public License v3 – https://www.gnu.org/licenses/gpl-3.0.html
# Copyright:    Nathan Cook (nathan.cook@gmail.com)
##
# Written:      2020-12-18
# Updated:      2024-11-04
##

__license__ = 'GNU General Public License v3 – https://www.gnu.org/licenses/gpl-3.0.html'
__copyright__ = 'Nathan Cook – 2020-12-19'
__version__ = 'v0.1.1'
__date__ = '2020-12-19'
__author__ = 'topynate'

import json
import re

from mechanize import Request

from calibre.web.feeds.news import BasicNewsRecipe


class Substack(BasicNewsRecipe):
    title = 'Substack'
    __author__ = 'topynate, unkn0wn'
    description = 'Use advanced menu if you want to add your own substack handles.'
    oldest_article = 7
    language = 'en'
    max_articles_per_feed = 100
    auto_cleanup = True
    auto_cleanup_keep = '//*[@class="subtitle"]'
    needs_subscription = 'optional'
    use_embedded_content = False
    masthead_url = 'https://substack.com/img/substack_wordmark.png'
    cover_url = 'https://substack.com/img/substack.png'
    extra_css = '.captioned-image-container, .image-container {font-size: small;}'

    recipe_specific_options = {
        'auths': {
            'short': 'enter the @handles you subscribe to:\nseperated by a space',
            'long': 'julianmacfarlane ianleslie .... ....',
            'default': 'julianmacfarlane ianleslie thesalvo',
        },
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600',
        },
    }

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    # Every Substack publication has an RSS feed at https://{name}.substack.com/feed.
    # The same URL provides either all posts, or all free posts + previews of paid posts,
    # depending on whether you're logged in.
    # feeds          = [
    #     ('Novum Lumen', 'https://novumlumen.substack.com/feed'),    # gratuitously self-promotional example
    # ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://substack.com/account/login?redirect=%2F&email=&with_password=')
            data = json.dumps({'email': self.username, 'password': self.password, 'captcha_response':None})
            req = Request(
                url='https://substack.com/api/v1/email-login',
                headers={
                    'Accept': '*/*',
                    'Content-Type': 'application/json',
                    'Origin': 'https://substack.com',
                    'Referer': 'https://substack.com/account/login?redirect=%2F&email=&with_password=',
                },
                data=data,
                method='POST')
            res = br.open(req)
            if res.getcode() != 200:
                raise ValueError('Login failed, check username and password')
        return br

    def get_feeds(self):
        ans = []
        u = self.recipe_specific_options.get('auths')
        if u and isinstance(u, str):
            for x in u.split():
                ans.append('https://' + x.replace('@', '') + '.substack.com/feed')
        return ans

    def preprocess_html(self, soup):
        res = '600'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = w
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = re.sub(r'w_\d+', 'w_' + res, img['src'])
        for src in soup.findAll(['source', 'svg']):
            src.extract()
        for but in soup.findAll(attrs={'class': ['button-wrapper']}):
            but.extract()
        return soup