#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
from urllib.parse import quote, urlparse

from calibre.web.feeds.news import BasicNewsRecipe
from mechanize import Request

# NOTE(review): the HTML tag literals emitted by parse_body, parse_p and
# CaravanMagazine.preprocess_raw_html were reconstructed so that they match
# the selectors declared in CaravanMagazine.extra_css (img, blockquote, em,
# .desc, .sub, .cat, .auth) -- confirm against the upstream recipe.


def absurl(x):
    """Return *x* as an absolute https URL on caravanmagazine.in.

    Protocol-relative URLs (``//host/...``) get an ``https:`` scheme, and
    anything that does not start with ``http`` is treated as a site-relative
    path. URLs that already start with ``http`` are returned unchanged.
    """
    if x.startswith('//'):
        x = 'https:' + x
    elif not x.startswith('http'):
        x = 'https://caravanmagazine.in' + x
    return x


def safe_dict(data, *names):
    """Walk nested dicts along *names*, returning ``''`` when the path breaks.

    Each step uses ``dict.get``; a missing or falsy value is replaced by
    ``''``. If an intermediate value is not a dict, ``''`` is returned
    instead of raising ``AttributeError`` on the next lookup.
    """
    ans = data
    for x in names:
        if not isinstance(ans, dict):
            return ''
        ans = ans.get(x) or ''
    return ans


def parse_body(x):
    """Yield HTML fragments for one node of the article's JSON document tree.

    The node's ``type`` selects the markup; child nodes under ``content`` are
    rendered recursively (block nodes) or through ``parse_p`` (inline nodes).
    Nodes with an empty or missing ``type`` yield nothing.
    """
    kind = x.get('type', '')
    if kind == 'paragraph':
        yield '<p>'
        for p in x.get('content', {}):
            yield ''.join(parse_p(p))
        yield '</p>\n'
    elif kind in {'blockquote', 'pullquote'}:
        yield '<blockquote>'
        for p in x.get('content', {}):
            yield from parse_body(p)
        yield '</blockquote>'
    elif kind == 'figure':
        # Swap the '=s0' size suffix in the image URL for '=s768-rw'.
        yield '<img src="{}">'.format(absurl(x['attrs']['src'].replace('=s0', '=s768-rw')))
        for p in x.get('content', {}):
            yield from parse_body(p)
    elif kind in {'caption', 'credit'}:
        yield '<div class="sub">'
        for div in x.get('content', {}):
            yield ''.join(parse_p(div))
        yield '</div>\n'
    elif kind != '':
        # Any other typed node: render its children inside a paragraph.
        if 'content' in x:
            yield '<p>'
            for p in x.get('content', {}):
                yield from parse_body(p)
            yield '</p>'


def parse_p(p):
    """Yield HTML fragments for one inline node.

    ``text`` nodes yield their text, wrapped in a tag named after the type of
    the first entry in ``marks`` when marks are present (only the first mark
    is used). ``hard_break`` nodes yield a line break. Other nodes yield
    nothing.
    """
    kind = p.get('type', '')
    if kind == 'text':
        if 'marks' in p:
            tag = p['marks'][0]['type']
            yield '<' + tag + '>'
            yield p['text']
            yield '</' + tag + '>'
        else:
            yield p['text']
    elif kind == 'hard_break':
        yield '<br>'


class CaravanMagazine(BasicNewsRecipe):
    """Recipe that builds an issue of Caravan Magazine from the site's JSON API."""

    title = 'Caravan Magazine'
    __author__ = 'Kovid Goyal, Gobelinus, unkn0wn'
    description = (
        'The Caravan has established itself as one of the country’s most respected and intellectually agile publications, '
        'setting new benchmarks for the Indian and South Asian media. We publish immersive reportage, daring commentary, '
        'path-breaking investigations, insightful literary criticism and more, spanning the worlds of politics, culture, '
        'business, society, media, the environment and the arts.'
    )
    language = 'en_IN'
    timefmt = ' [%b, %Y]'
    encoding = 'utf-8'

    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    needs_subscription = 'optional'
    # Set to True by get_browser after the login request completes; gates the
    # premium-content fetch in preprocess_raw_html.
    logged = False

    extra_css = '''
        img {display:block; margin:0 auto;}
        blockquote, em {color:#202020;}
        .desc {font-style:italic; color:#202020;}
        .sub {text-align:center; font-size:small;}
        .cat, .auth {font-size:small; color:#404040;}
    '''

    def get_browser(self, *args, **kw):
        """Return the browser, posting a login request when credentials are set.

        Without both a username and a password the plain browser is returned.
        Otherwise the credentials are POSTed to the users.login endpoint; the
        response message is logged and ``self.logged`` is set. Any failure is
        logged as a warning and the browser is still returned.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        if not self.username or not self.password:
            return br
        data = json.dumps(
            {'0': {'json': {'email': self.username, 'password': self.password}}}
        ).encode('utf-8')
        rq = Request(
            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
            data=data,
            headers={
                'Accept': 'application/json, text/plain, */*',
                'Origin': 'https://caravanmagazine.in',
                'Referer': 'https://caravanmagazine.in/',
                'Content-type': 'application/json;charset=UTF-8',
            },
            method='POST'
        )
        try:
            res = json.loads(br.open(rq).read().decode('utf-8'))
            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
            self.logged = True
        except Exception:
            # Best effort: a failed login must not abort the download.
            self.log.warn('\n**Login failed, check your username and password\n')
        return br

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (MM-YYYY format)',
            'long': 'For example, 07-2024'
        }
    }

    def parse_index(self):
        """Return ``[(section, [article, ...]), ...]`` for the selected issue.

        The latest issue is used unless the ``date`` option holds an
        ``MM-YYYY`` string, in which case that month's issue is requested.
        Also sets ``self.cover_url`` from the issue data. Sections without
        articles are omitted.
        """
        self.log(
            '\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

        api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            parts = d.split('-')
            inp = json.dumps({'0': {'json': {'month': int(parts[0]), 'year': int(parts[1])}}})
            api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')

        raw = json.loads(self.index_to_soup(api, raw=True))
        # The response is either a one-element list or a bare object.
        if isinstance(raw, list):
            data = raw[0]['result']['data']['json']
        else:
            data = raw['result']['data']['json']
        cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
        self.cover_url = absurl(cover)

        feeds = []

        for sec in data['categories']:
            section = sec['name']
            self.log(section)
            articles = []
            for arts in sec['amc']:
                title = safe_dict(arts, 'article', 'title')
                desc = safe_dict(arts, 'article', 'theme', 'name') + ' | ' + safe_dict(arts, 'article', 'printTitle')
                names = [
                    name
                    for name in (safe_dict(auth, 'profile', 'name') for auth in arts['article']['authors'])
                    if name != ''
                ]
                if names:
                    desc = desc + ' | ' + ', '.join(names)
                url = absurl(arts['article']['slug'])
                self.log('\t', title, url, '\n\t', desc)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def print_version(self, url):
        """Map an article URL to the articles.getFromCache API URL for its path."""
        slug = urlparse(url).path
        inp = json.dumps({'0': {'json': {'slug': slug}}})
        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')

    def preprocess_raw_html(self, raw, url):
        """Build the article's HTML page from the JSON text in *raw*.

        *raw* is presumably the response fetched from the URL returned by
        ``print_version`` -- confirm against the BasicNewsRecipe contract.
        When ``self.logged`` is set, the paywalled part of the article is
        fetched by article id and appended after the free content.
        """
        cache_data = json.loads(raw)[0]
        art_id = cache_data['result']['data']['json']['articleId']
        prim_data = cache_data['result']['data']['json']['data']

        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'

        authors = [safe_dict(q, 'name') for q in prim_data.get('authors', {})]
        dt = ''
        if prim_data.get('writtenAt', '') != '':
            import time
            from datetime import datetime, timedelta
            # The last character of the timestamp is dropped before parsing
            # (presumably a trailing 'Z').
            # NOTE(review): time.timezone is seconds *west* of UTC, so adding
            # it looks like it shifts away from local time -- confirm intent.
            dt = datetime.fromisoformat(prim_data['writtenAt'][:-1]) + timedelta(seconds=time.timezone)
            dt = dt.strftime('%b %d, %Y, %I:%M %p')
        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'
        lede = ''.join(parse_body(prim_data.get('cover', {})))

        free_cont = ''.join(
            '\n' + ''.join(parse_body(x)) for x in prim_data['data']['content']
        )

        premium_cont = ''
        if self.logged:
            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
            premium_cont = ''.join(
                '\n' + ''.join(parse_body(x)) for x in art_cont['premiumContent']
            )

        return '<html><body><div>' \
            + cat + title + desc + auth + lede + free_cont + premium_cont + \
            '</div></body></html>'