#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
from urllib.parse import quote, urlparse
from calibre.web.feeds.news import BasicNewsRecipe
from mechanize import Request


def absurl(x):
    if x.startswith('//'):
        x = 'https:' + x
    elif not x.startswith('http'):
        x = 'https://caravanmagazine.in' + x
    return x


def safe_dict(data, *names):
    ans = data
    for x in names:
        ans = ans.get(x) or ''
    return ans


def parse_body(x):
    if x.get('type', '') == 'paragraph':
        yield '<p>'
        for p in x.get('content', {}):
            yield ''.join(parse_p(p))
        yield '</p>\n'
    elif x.get('type', '') in {'blockquote', 'pullquote'}:
        yield '<blockquote>'
        for p in x.get('content', {}):
            yield from parse_body(p)
        yield '</blockquote>'
    elif x.get('type', '') == 'figure':
        yield '<img src="{}">'.format(absurl(x['attrs']['src'].replace('=s0', '=s768-rw')))
        for p in x.get('content', {}):
            yield from parse_body(p)
    elif x.get('type', '') in {'caption', 'credit'}:
        yield '<div class="sub">'
        for div in x.get('content', {}):
            yield ''.join(parse_p(div))
        yield '</div>\n'
    elif x.get('type', '') != '':
        if 'content' in x:
            yield '<p>'
            for p in x.get('content', {}):
                yield from parse_body(p)
            yield '</p>'


def parse_p(p):
    if p.get('type', '') == 'text':
        if 'marks' in p:
            tag = p['marks'][0]['type']
            yield '<' + tag + '>'
            yield p['text']
            yield '</' + tag + '>'
        else:
            yield p['text']
    elif p.get('type', '') == 'hard_break':
        yield '<br>'
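# Illustrative sketch only (the sample node below is made up, not taken from the
# site): the lookups above imply article content arrives as nested nodes such as
#     {'type': 'paragraph', 'content': [
#         {'type': 'text', 'text': 'Hello ', 'marks': [{'type': 'em'}]},
#         {'type': 'hard_break'},
#         {'type': 'text', 'text': 'world'},
#     ]}
# which parse_body() would render as '<p><em>Hello </em><br>world</p>\n'.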
class CaravanMagazine(BasicNewsRecipe):

    title = 'Caravan Magazine'
    __author__ = 'Kovid Goyal, Gobelinus, unkn0wn'
    description = (
        'The Caravan has established itself as one of the country’s most respected and intellectually agile publications, '
        'setting new benchmarks for the Indian and South Asian media. We publish immersive reportage, daring commentary, '
        'path-breaking investigations, insightful literary criticism and more, spanning the worlds of politics, culture, '
        'business, society, media, the environment and the arts.'
    )
    language = 'en_IN'
    timefmt = ' [%b, %Y]'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    needs_subscription = 'optional'
    logged = False

    extra_css = '''
        img {display:block; margin:0 auto;}
        blockquote, em {color:#202020;}
        .desc {font-style:italic; color:#202020;}
        .sub {text-align:center; font-size:small;}
        .cat, .auth {font-size:small; color:#404040;}
    '''
    def get_browser(self, *args, **kw):
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        if not self.username or not self.password:
            return br
        # Log in through the site's tRPC endpoint; the logged flag gates the
        # paywalled-content fetch in preprocess_raw_html().
        data = json.dumps({"0": {"json": {"email": self.username, "password": self.password}}})
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        rq = Request(
            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
            data=data,
            headers={
                'Accept': 'application/json, text/plain, */*',
                'Origin': 'https://caravanmagazine.in',
                'Referer': 'https://caravanmagazine.in/',
                'Content-type': 'application/json;charset=UTF-8',
            },
            method='POST'
        )
        try:
            res = br.open(rq).read()
            res = res.decode('utf-8')
            res = json.loads(res)
            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
            self.logged = True
        except Exception:
            self.log.warn('\n**Login failed, check your username and password\n')
            return br
        return br
    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (MM-YYYY format)',
            'long': 'For example, 07-2024'
        }
    }
    def parse_index(self):
        self.log(
            '\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

        # Default to the latest issue; switch to a specific month and year
        # when the 'date' option is supplied.
        api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            x = d.split('-')
            inp = json.dumps({"0": {"json": {"month": int(x[0]), "year": int(x[1])}}})
            api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')

        raw = json.loads(self.index_to_soup(api, raw=True))
        if isinstance(raw, list):
            data = raw[0]['result']['data']['json']
        else:
            data = raw['result']['data']['json']

        cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
        self.cover_url = absurl(cover)

        feeds = []
        for sec in data['categories']:
            section = sec['name']
            self.log(section)
            articles = []
            for arts in sec['amc']:
                title = safe_dict(arts, 'article', 'title')
                desc = safe_dict(arts, 'article', 'theme', 'name') + ' | ' + safe_dict(arts, 'article', 'printTitle')
                names = []
                for auth in arts['article']['authors']:
                    name = safe_dict(auth, 'profile', 'name')
                    if name != '':
                        names.append(name)
                if names:
                    desc = desc + ' | ' + ', '.join(names)
                url = absurl(arts['article']['slug'])
                self.log('\t', title, url, '\n\t', desc)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds
    def print_version(self, url):
        slug = urlparse(url).path
        inp = json.dumps({"0": {"json": {"slug": slug}}})
        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')
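    # Sketch of the transformation above, with a hypothetical slug: for
    # https://caravanmagazine.in/politics/example-article the path is
    # '/politics/example-article', which is wrapped as
    # {"0": {"json": {"slug": "/politics/example-article"}}}, URL-encoded and
    # appended to the articles.getFromCache endpoint.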
    def preprocess_raw_html(self, raw, url):
        cache_data = json.loads(raw)[0]
        art_id = cache_data['result']['data']['json']['articleId']
        prim_data = cache_data['result']['data']['json']['data']

        cat = desc = lede = auth = ''
        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'

        authors = []
        for q in prim_data.get('authors', {}):
            authors.append(safe_dict(q, 'name'))
        dt = ''
        if prim_data.get('writtenAt', '') != '':
            import time
            from datetime import datetime, timedelta
            dt = datetime.fromisoformat(prim_data['writtenAt'][:-1]) + timedelta(seconds=time.timezone)
            dt = dt.strftime('%b %d, %Y, %I:%M %p')
        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'

        lede = ''.join(parse_body(prim_data.get('cover', {})))

        free_cont = ''
        for x in prim_data['data']['content']:
            free_cont += '\n' + ''.join(parse_body(x))

        premium_cont = ''
        if self.logged:
            # Fetch the paywalled portion only when the login in get_browser() succeeded.
            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
            for x in art_cont['premiumContent']:
                premium_cont += '\n' + ''.join(parse_body(x))

        return '<html><body><div>' \
            + cat + title + desc + auth + lede + free_cont + premium_cont + \
            '</div></body></html>'