#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
from urllib.parse import quote, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
from mechanize import Request


def absurl(x):
    if x.startswith('//'):
        x = 'https:' + x
    elif not x.startswith('http'):
        x = 'https://caravanmagazine.in' + x
    return x


def safe_dict(data, *names):
    ans = data
    for x in names:
        ans = ans.get(x) or ''
    return ans


def parse_body(x):
    if x.get('type', '') == 'paragraph':
        yield '<p>'
        for p in x.get('content', {}):
            yield ''.join(parse_p(p))
        yield '</p>\n'
    elif x.get('type', '') in {'blockquote', 'pullquote'}:
        yield '<blockquote>'
        for p in x.get('content', {}):
            yield from parse_body(p)
        yield '</blockquote>'
    elif x.get('type', '') == 'figure':
        yield '<img src="{}">'.format(absurl(x['attrs']['src'].replace('=s0', '=s768-rw')))
        for p in x.get('content', {}):
            yield from parse_body(p)
    elif x.get('type', '') in {'caption', 'credit'}:
        yield '<div class="sub">'
        for div in x.get('content', {}):
            yield ''.join(parse_p(div))
        yield '</div>\n'
    elif x.get('type', '') != '':
        if 'content' in x:
            yield '<p>'
            for p in x.get('content', {}):
                yield from parse_body(p)
            yield '</p>'


def parse_p(p):
    if p.get('type', '') == 'text':
        if 'marks' in p:
            tag = p['marks'][0]['type']
            yield '<' + tag + '>'
            yield p['text']
            yield '</' + tag + '>'
        else:
            yield p['text']
    elif p.get('type', '') == 'hard_break':
        yield '<br>'
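# Illustrative sketch only (the sample node below is made up, not taken from the
# site): the lookups above imply article content arrives as nested nodes such as
#     {'type': 'paragraph', 'content': [
#         {'type': 'text', 'text': 'Hello ', 'marks': [{'type': 'em'}]},
#         {'type': 'hard_break'},
#         {'type': 'text', 'text': 'world'},
#     ]}
# which parse_body() would render as '<p><em>Hello </em><br>world</p>\n'.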
class CaravanMagazine(BasicNewsRecipe):

    title = 'Caravan Magazine'
    __author__ = 'Kovid Goyal, Gobelinus, unkn0wn'
    description = (
        'The Caravan has established itself as one of the country’s most respected and intellectually agile publications, '
        'setting new benchmarks for the Indian and South Asian media. We publish immersive reportage, daring commentary, '
        'path-breaking investigations, insightful literary criticism and more, spanning the worlds of politics, culture, '
        'business, society, media, the environment and the arts.'
    )
    language = 'en_IN'
    timefmt = ' [%b, %Y]'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    needs_subscription = 'optional'
    logged = False

    extra_css = '''
        img {display:block; margin:0 auto;}
        blockquote, em {color:#202020;}
        .desc {font-style:italic; color:#202020;}
        .sub {text-align:center; font-size:small;}
        .cat, .auth {font-size:small; color:#404040;}
    '''
    def get_browser(self, *args, **kw):
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        if not self.username or not self.password:
            return br
        # Log in through the site's tRPC endpoint; the logged flag gates the
        # paywalled-content fetch in preprocess_raw_html().
        data = json.dumps({"0": {"json": {"email": self.username, "password": self.password}}})
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        rq = Request(
            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
            data=data,
            headers={
                'Accept': 'application/json, text/plain, */*',
                'Origin': 'https://caravanmagazine.in',
                'Referer': 'https://caravanmagazine.in/',
                'Content-type': 'application/json;charset=UTF-8',
            },
            method='POST'
        )
        try:
            res = br.open(rq).read()
            res = res.decode('utf-8')
            res = json.loads(res)
            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
            self.logged = True
        except Exception:
            self.log.warn('\n**Login failed, check your username and password\n')
            return br
        return br
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (MM-YYYY format)',
            'long': 'For example, 07-2024'
        }
    }
    def parse_index(self):
        self.log(
            '\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

        # Default to the latest issue; switch to a specific month and year
        # when the 'date' option is supplied.
        api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            x = d.split('-')
            inp = json.dumps({"0": {"json": {"month": int(x[0]), "year": int(x[1])}}})
            api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')

        raw = json.loads(self.index_to_soup(api, raw=True))
        if isinstance(raw, list):
            data = raw[0]['result']['data']['json']
        else:
            data = raw['result']['data']['json']

        cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
        self.cover_url = absurl(cover)

        feeds = []
        for sec in data['categories']:
            section = sec['name']
            self.log(section)
            articles = []
            for arts in sec['amc']:
                title = safe_dict(arts, 'article', 'title')
                desc = safe_dict(arts, 'article', 'theme', 'name') + ' | ' + safe_dict(arts, 'article', 'printTitle')
                names = []
                for auth in arts['article']['authors']:
                    name = safe_dict(auth, 'profile', 'name')
                    if name != '':
                        names.append(name)
                if names:
                    desc = desc + ' | ' + ', '.join(names)
                url = absurl(arts['article']['slug'])
                self.log('\t', title, url, '\n\t', desc)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds
    def print_version(self, url):
        slug = urlparse(url).path
        inp = json.dumps({"0": {"json": {"slug": slug}}})
        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')
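    # Sketch of the transformation above, with a hypothetical slug: for
    # https://caravanmagazine.in/politics/example-article the path is
    # '/politics/example-article', which is wrapped as
    # {"0": {"json": {"slug": "/politics/example-article"}}}, URL-encoded and
    # appended to the articles.getFromCache endpoint.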
    def preprocess_raw_html(self, raw, url):
        cache_data = json.loads(raw)[0]
        art_id = cache_data['result']['data']['json']['articleId']
        prim_data = cache_data['result']['data']['json']['data']

        cat = desc = lede = auth = ''
        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'

        authors = []
        for q in prim_data.get('authors', {}):
            authors.append(safe_dict(q, 'name'))
        dt = ''
        if prim_data.get('writtenAt', '') != '':
            import time
            from datetime import datetime, timedelta
            dt = datetime.fromisoformat(prim_data['writtenAt'][:-1]) + timedelta(seconds=time.timezone)
            dt = dt.strftime('%b %d, %Y, %I:%M %p')
        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'

        lede = ''.join(parse_body(prim_data.get('cover', {})))

        free_cont = ''
        for x in prim_data['data']['content']:
            free_cont += '\n' + ''.join(parse_body(x))

        premium_cont = ''
        if self.logged:
            # Fetch the paywalled portion only when the login in get_browser() succeeded.
            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
            for x in art_cont['premiumContent']:
                premium_cont += '\n' + ''.join(parse_body(x))

        return '<html><body><div>' \
            + cat + title + desc + auth + lede + free_cont + premium_cont + \
            '</div></body></html>'