calibre/recipes/himal_southasian.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
import json

from html5_parser import parse

from calibre.web.feeds.news import BasicNewsRecipe


def get_story(story):
    '''
    Yield HTML fragments for one story element, descending into children.

    A ``text`` element yields its ``text`` value prefixed with a newline.
    An ``image`` element yields the joined output of :func:`img`. Any other
    element yields whatever its nested ``story-elements`` produce, or
    nothing at all when it has none.
    '''
    kind = story.get('type', '')

    if kind == 'text':
        yield '\n' + story['text']
        return

    if kind == 'image':
        yield ''.join(img(story))
        return

    # Neither text nor image: treat as a container and walk its children.
    for child in story.get('story-elements', ()):
        yield from get_story(child)


def img(img):
    '''
    Yield the HTML pieces for an image element, wrapped in a ``<p>``.

    An ``<img>`` tag is emitted when the element has an ``image-s3-key``
    (appended to the media.assettype.com base URL), and a ``cap`` div is
    emitted when it has a ``title``. The opening and closing ``<p>`` tags
    are always produced.
    '''
    yield '<p>'

    if 'image-s3-key' in img:
        src = 'https://media.assettype.com/' + img['image-s3-key']
        yield '<img src="' + src + '">'

    if 'title' in img:
        caption = img['title']
        yield '<div class="cap">' + caption + '</div>'

    yield '</p>'


class himal(BasicNewsRecipe):
    '''
    Recipe for the Himal Southasian magazine.

    Declares a single article feed and rebuilds every downloaded article
    page from the JSON story payload embedded in the page's
    ``static-page`` script tag instead of using the rendered markup.
    '''
    title = 'Himal Southasian'
    __author__ = 'unkn0wn'
    description = ('Himal Southasian is Southasia’s first and only regional magazine of politics and culture.'
    ' For over 30 years, Himal Southasian has challenged nationalist orthodoxies, and covered the region with '
    'imagination, rigour and irreverence, with contributions from some of the most interesting writers in the region.')
    language = 'en_IN'
    no_stylesheets = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://gumlet.assettype.com/himalmag/2024-01/4ecc5615-eceb-4497-87c7-4e013083ba17/logo_.png'
    encoding = 'utf-8'
    resolve_internal_links = True
    oldest_article = 30  # days

    # User-facing option metadata; the default mirrors ``oldest_article``.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        '''
        Initialise the recipe and apply the ``days`` option if one was given.

        Only a string value is honoured. The class-level declaration above
        maps ``days`` to a metadata dict, which fails the ``isinstance``
        check and leaves ``oldest_article`` at its default.

        :raises ValueError: if the ``days`` string is not a valid number.
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    # Styles for the classes emitted by preprocess_raw_html and img().
    extra_css = '''
        .cap, .auth {font-size:small;}
        em, blockquote {color:#404040;}
        .subhead { font-style:italic; color:#202020; }
    '''

    feeds = [
        ('Articles', 'https://www.himalmag.com/feed')
    ]

    def preprocess_raw_html(self, raw, *a):
        '''
        Rebuild an article page from its embedded JSON story payload.

        :param raw: HTML source of the downloaded article page.
        :param a: Extra positional arguments supplied by the caller; unused.
        :return: A minimal HTML document holding the headline, then the
            optional subheadline, author, hero image and caption, followed
            by the article body.
        :raises IndexError: if the page has no ``static-page`` script tag.
        :raises KeyError: if the payload lacks the ``qt``/``data``/``story``
            path or the story has no ``headline``.
        '''
        root = parse(raw)
        m = root.xpath('//script[@id="static-page"]')
        data = json.loads(m[0].text)['qt']['data']['story']

        title = '<h1>' + data['headline'] + '</h1>'

        subhead = auth = caption = lede = ''

        # Optional fields are tested for truthiness rather than key
        # presence: a key that is present but null would otherwise raise
        # TypeError when concatenated with the surrounding markup.
        subheadline = data.get('subheadline')
        if subheadline:
            subhead = '\n<p class="subhead">' + subheadline + '</p>'

        author = data.get('author-name')
        if author:
            auth = '\n<div class="auth">' + author + '</div>'

        hero_key = data.get('hero-image-s3-key')
        if hero_key:
            # Close the paragraph explicitly instead of relying on the
            # HTML parser to close it at the next block-level element.
            lede = '\n<p><img src="{}"></p>'.format('https://media.assettype.com/' + hero_key)

        hero_caption = data.get('hero-image-caption')
        if hero_caption:
            caption = '<div class="cap">' + hero_caption + '</div>'

        # Fragments of one story element are newline-separated; successive
        # elements are concatenated directly. Missing or null ``cards`` and
        # ``story-elements`` give an empty body instead of raising.
        body = ''.join(
            '\n'.join(get_story(story))
            for card in data.get('cards') or ()
            for story in card.get('story-elements') or ()
        )

        return '<html><body>\n' + title + subhead + auth + lede + caption + '<div>' + body + '\n</div></body></html>'