# calibre/recipes/scientific_american.recipe

#!/usr/bin/env python
__license__ = 'GPL v3'

import json
from datetime import datetime
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


class ScientificAmerican(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from one issue of Scientific American.

    The issue is either the one linked from the site's front page or the one
    given through the ``issue_url`` recipe specific option. Image width can be
    tuned through the ``res`` recipe specific option.
    '''

    title = 'Scientific American'
    description = 'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = ' [%B %Y]'
    remove_attributes = ['height', 'width']
    masthead_url = (
        'https://static.scientificamerican.com/sciam/assets/Image/newsletter/salogo.png'
    )
    extra_css = '''
        [class^="article_dek-"] { font-style:italic; color:#202020; }
        [class^="article_authors-"] {font-size:small; color:#202020; }
        [class^="article__image-"], [class^="lead_image-"], .calibre-nuked-tag-figcaption { font-size:small; }
        [class^="bio-"] { font-size:small; color:#404040; }
        em, blockquote { color:#202020; }
    '''

    needs_subscription = 'optional'

    keep_only_tags = [
        prefixed_classes(
            'article_hed- article_dek- article_authors- lead_image- article__body- bio-'
        ),
    ]
    remove_tags = [
        dict(name=['button', 'svg', 'iframe', 'source'])
    ]

    def preprocess_html(self, soup):
        '''
        Adjust headings, captions and image URLs of a downloaded article.

        :param soup: parsed article HTML; it is modified in place.
        :return: the same ``soup`` object.
        '''
        # Render every h2/h3 as an h4.
        for heading in soup.findAll(['h2', 'h3']):
            heading.name = 'h4'
        # Paragraphs inside captions become plain divs.
        for caption in soup.findAll('figcaption'):
            for para in caption.findAll('p'):
                para.name = 'div'
        # NOTE(review): this relies on the option mapping holding plain string
        # values at run time; anything else falls back to a width of 600.
        width = self.recipe_specific_options.get('res')
        if not (width and isinstance(width, str)):
            width = '600'
        query = '?w=' + width
        # Replace whatever query string the image had with the chosen width.
        for img in soup.findAll('img', src=True):
            img['src'] = img['src'].split('?')[0] + query
        return soup

    def get_browser(self, *args, **kwargs):
        '''
        Return a browser, logged in to the site when credentials were given.

        All arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``.
        '''
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        if self.username and self.password:
            br.open('https://www.scientificamerican.com/account/login/')
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    recipe_specific_options = {
        'issue_url': {
            'short': 'The issue URL ',
            'long': (
                'For example, https://www.scientificamerican.com/issue/sa/2024/07-01/'
                '\nYou can also download special-editions, physics, health, mind magazines by pasting the URL here.'
            )
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
            'default': '600'
        }
    }

    def parse_index(self):
        '''
        Locate the issue, set cover and date, and list its articles.

        Sets ``self.cover_url`` (when the issue data carries an image URL)
        and ``self.timefmt`` as side effects.

        :return: a list of ``(feed title, list of article dicts)`` pairs with
                 feeds whose title starts with "Featur" first and the rest in
                 alphabetical order. Each article dict has the keys
                 ``title``, ``url`` and ``description``.
        '''
        site = 'https://www.scientificamerican.com'

        # Get the cover, date and issue URL
        issue = self.recipe_specific_options.get('issue_url')
        if not (issue and isinstance(issue, str)):
            fp_soup = self.index_to_soup(site)
            curr_issue_link = fp_soup.find(**prefixed_classes('latest_issue_links-'))
            anchor = curr_issue_link.a if curr_issue_link is not None else None
            if anchor is None or not anchor.get('href'):
                # NOTE(review): abort_recipe_processing is expected to raise,
                # so nothing below runs after it - confirm against calibre.
                self.abort_recipe_processing('Unable to find issue link')
            # urljoin copes with both site-relative and absolute hrefs.
            issue = urljoin(site, anchor['href'])

        soup = self.index_to_soup(issue)
        script = soup.find('script', id='__DATA__')
        if script is None or not script.contents:
            self.abort_recipe_processing('Unable to find script')

        # The issue data is a JSON document embedded in a JS template literal.
        _, marker, payload = script.contents[0].partition('JSON.parse(`')
        if not marker:
            self.abort_recipe_processing('Unable to find issue data in script')
        # Collapse doubled backslashes before decoding.
        payload = payload.replace('\\\\', '\\')
        # raw_decode ignores whatever follows the JSON document.
        data = json.JSONDecoder().raw_decode(payload)[0]
        issue_info = (
            data
            .get('initialData', {})
            .get('issueData', {})
        )
        if not issue_info:
            self.abort_recipe_processing('Unable to find issue info')

        cover = issue_info.get('image_url')
        if cover:
            self.cover_url = cover + '?w=800'

        edition_date = datetime.strptime(issue_info['issue_date'], '%Y-%m-%d')
        self.timefmt = f' [{edition_date:%B %Y}]'

        feeds = {}
        previews = issue_info.get('article_previews') or {}
        for section, articles in previews.items():
            is_feature = section.startswith('featur')
            for article in articles or ():
                self.log('\t', article['title'])
                if is_feature:
                    feed_name = section.capitalize()
                else:
                    # Fall back to the section name when no category is given,
                    # so the feed title is always a string.
                    feed_name = article.get('category') or section.capitalize()
                feeds.setdefault(feed_name, []).append(
                    {
                        'title': article['title'],
                        'url': urljoin(
                            'https://www.scientificamerican.com/article/',
                            article['slug'],
                        ),
                        'description': article['summary'],
                    }
                )
        # False sorts before True, which puts the feature feeds first.
        return sorted(
            feeds.items(),
            key=lambda item: (not item[0].startswith('Featur'), item[0]),
        )