mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
216 lines
7.9 KiB
Python
216 lines
7.9 KiB
Python
#!/usr/bin/env python
|
||
# vim:fileencoding=utf-8
|
||
import json
|
||
from urllib.parse import quote, urlparse
|
||
|
||
from calibre.web.feeds.news import BasicNewsRecipe
|
||
from mechanize import Request
|
||
|
||
|
||
def absurl(x):
    """Return *x* as an absolute URL, resolving it against the Caravan site.

    * ``//host/path`` (protocol-relative) gets an ``https:`` scheme.
    * Values that already start with ``http://`` or ``https://`` are
      returned unchanged.
    * Anything else is treated as a path on ``https://caravanmagazine.in``.
      A missing leading ``/`` is added so the path is never glued directly
      onto the host name. An empty string yields the bare site root.
    """
    if x.startswith('//'):
        return 'https:' + x
    # Test for a real scheme prefix rather than the bare word 'http', so a
    # relative path that merely begins with "http" is still resolved.
    if x.startswith(('http://', 'https://')):
        return x
    if x and not x.startswith('/'):
        x = '/' + x
    return 'https://caravanmagazine.in' + x
|
||
|
||
def safe_dict(data, *names):
    """Look up a chain of keys in nested dicts, returning ``''`` when absent.

    ``safe_dict(d, 'a', 'b')`` is ``d['a']['b']`` when every level exists and
    the final value is truthy; otherwise it is the empty string. With no
    *names* the *data* object itself is returned.

    The walk stops as soon as the current value is not a dict. Previously a
    missing intermediate key replaced the value with ``''`` and the next
    ``.get`` call raised ``AttributeError``.
    """
    ans = data
    for x in names:
        if not isinstance(ans, dict):
            # A parent level was missing, empty or not a mapping: nothing
            # further can be looked up.
            return ''
        ans = ans.get(x) or ''
    return ans
|
||
|
||
|
||
def parse_body(x):
    """Yield HTML fragments for one block-level node of the article JSON.

    *x* is a dict with an optional ``type`` and an optional ``content`` list
    of child nodes. Handled types:

    * ``paragraph`` -> ``<p>`` wrapping its inline children (via ``parse_p``)
    * ``blockquote`` / ``pullquote`` -> ``<blockquote>`` wrapping child blocks
    * ``figure`` -> an ``<img>`` built from ``attrs.src`` followed by its
      child blocks (typically caption / credit)
    * ``caption`` / ``credit`` -> ``<div class="sub">`` wrapping inline children
    * any other non-empty type that has a ``content`` key -> ``<p>`` wrapping
      its child blocks

    A node with no type (or an empty one) yields nothing. A missing or null
    ``content`` is treated as "no children", and a figure without a usable
    ``src`` simply omits the ``<img>`` instead of raising.
    """
    kind = x.get('type', '')
    # 'content' may be absent *or* explicitly null; both mean no children.
    children = x.get('content') or ()

    if kind == 'paragraph':
        yield '<p>'
        for p in children:
            yield ''.join(parse_p(p))
        yield '</p>\n'
    elif kind in {'blockquote', 'pullquote'}:
        yield '<blockquote>'
        for p in children:
            yield from parse_body(p)
        yield '</blockquote>'
    elif kind == 'figure':
        src = (x.get('attrs') or {}).get('src')
        if src:
            # Swap the '=s0' size suffix for '=s768-rw' before making the URL absolute.
            yield '<img src="{}">'.format(absurl(src.replace('=s0', '=s768-rw')))
        for p in children:
            yield from parse_body(p)
    elif kind in {'caption', 'credit'}:
        yield '<div class="sub">'
        for div in children:
            yield ''.join(parse_p(div))
        yield '</div>\n'
    elif kind != '':
        # Unknown container: only emit a wrapper when it declares content.
        if 'content' in x:
            yield '<p>'
            for p in children:
                yield from parse_body(p)
            yield '</p>'
|
||
|
||
def parse_p(p):
    """Yield HTML fragments for one inline node of the article JSON.

    * ``text`` nodes yield their text; when the node carries marks, the text
      is wrapped in a tag named after the *first* mark's ``type``.
    * ``hard_break`` nodes yield ``<br>``.
    * Any other node yields nothing.

    An empty ``marks`` list is treated like no marks (it used to raise
    ``IndexError``), and a missing or null ``text`` yields an empty string.
    """
    kind = p.get('type', '')
    if kind == 'text':
        text = p.get('text') or ''
        marks = p.get('marks')
        if marks:
            # Only the first mark is honoured; further marks are ignored.
            tag = marks[0]['type']
            yield '<' + tag + '>'
            yield text
            yield '</' + tag + '>'
        else:
            yield text
    elif kind == 'hard_break':
        yield '<br>'
|
||
|
||
|
||
class CaravanMagazine(BasicNewsRecipe):
    # Recipe for The Caravan magazine. The issue index and the article bodies
    # are read from the site's JSON API rather than scraped from HTML pages;
    # the JSON is converted to HTML by the module-level parse_body()/parse_p().
    # Login is optional: when it succeeds, the paywalled part of each article
    # is fetched as well.

    title = 'Caravan Magazine'
    __author__ = 'Kovid Goyal, Gobelinus, unkn0wn'
    description = (
        'The Caravan has established itself as one of the country’s most respected and intellectually agile publications, '
        'setting new benchmarks for the Indian and South Asian media. We publish immersive reportage, daring commentary, '
        'path-breaking investigations, insightful literary criticism and more, spanning the worlds of politics, culture, '
        'business, society, media, the environment and the arts.'
    )
    language = 'en_IN'
    timefmt = ' [%b, %Y]'
    encoding = 'utf-8'

    no_stylesheets = True

    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    needs_subscription = 'optional'
    # Flipped to True by get_browser() once a login request has succeeded;
    # preprocess_raw_html() only asks for premium content when it is set.
    logged = False

    extra_css = '''
        img {display:block; margin:0 auto;}
        blockquote, em {color:#202020;}
        .desc {font-style:italic; color:#202020;}
        .sub {text-align:center; font-size:small;}
        .cat, .auth {font-size:small; color:#404040;}
    '''

    def get_browser(self, *args, **kw):
        """Return the browser, attempting a login when credentials are set.

        Without a username and password the stock browser is returned
        untouched. Otherwise the credentials are POSTed as JSON to the
        login endpoint; on success ``self.logged`` becomes True. Any failure
        is logged as a warning and the (not logged in) browser is still
        returned, so the free content can be downloaded regardless.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        if not self.username or not self.password:
            return br
        data = json.dumps({"0":{"json":{"email":self.username,"password":self.password}}}).encode('utf-8')
        rq = Request(
            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
            data=data,
            headers={
                'Accept': 'application/json, text/plain, */*',
                'Origin': 'https://caravanmagazine.in',
                'Referer': 'https://caravanmagazine.in/',
                'Content-type': 'application/json;charset=UTF-8',
            },
            method='POST'
        )
        try:
            res = json.loads(br.open(rq).read().decode('utf-8'))
            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
            self.logged = True
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except:`` so
            # KeyboardInterrupt / SystemExit still propagate.
            self.log.warn('\n**Login failed, check your username and password\n')
        return br

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (MM-YYYY format)',
            'long': 'For example, 07-2024'
        }
    }

    def parse_index(self):
        """Build the feed list for the latest issue, or for a requested month.

        Returns a list of ``(section_name, articles)`` tuples where each
        article is a dict with ``title``, ``description`` and ``url``. Also
        sets ``self.cover_url`` from the issue's cover image.
        """
        self.log(
            '\n***\nif this recipe fails, report it on: '
            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
        )

        api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
        d = self.recipe_specific_options.get('date')
        # The class-level entry is a dict describing the option, so only a
        # string value (presumably injected by calibre from user input --
        # TODO confirm) selects a specific MM-YYYY issue.
        if d and isinstance(d, str):
            x = d.split('-')
            inp = json.dumps({"0":{"json":{"month":int(x[0]),"year":int(x[1])}}})
            api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')

        raw = json.loads(self.index_to_soup(api, raw=True))
        # The batched endpoint answers with a list, the plain one with a dict.
        if isinstance(raw, list):
            data = raw[0]['result']['data']['json']
        else:
            data = raw['result']['data']['json']
        cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
        self.cover_url = absurl(cover)

        feeds = []

        for sec in data['categories']:
            section = sec['name']
            self.log(section)
            articles = []
            for arts in sec['amc']:
                title = safe_dict(arts, 'article', 'title')
                desc = safe_dict(arts, 'article', 'theme', 'name') + ' | ' + safe_dict(arts, 'article', 'printTitle')
                names = [
                    name
                    for name in (safe_dict(auth, 'profile', 'name') for auth in arts['article']['authors'])
                    if name != ''
                ]
                if names:
                    desc = desc + ' | ' + ', '.join(names)
                url = absurl(arts['article']['slug'])
                self.log('\t', title, url, '\n\t', desc)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def print_version(self, url):
        """Map an article page URL to the JSON API URL for that article."""
        slug = urlparse(url).path
        inp = json.dumps({"0":{"json":{"slug":slug}}})
        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')

    def preprocess_raw_html(self, raw, url):
        """Convert the article's JSON API response into a simple HTML page.

        The page consists of category, title, description, author/date line,
        cover figure, the free content and -- when logged in -- the premium
        content fetched from the paywall endpoint.
        """
        cache_data = json.loads(raw)[0]
        payload = cache_data['result']['data']['json']
        art_id = payload['articleId']
        prim_data = payload['data']

        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'

        authors = [safe_dict(q, 'name') for q in prim_data.get('authors') or ()]
        dt = ''
        written_at = prim_data.get('writtenAt')
        if written_at:
            from datetime import datetime, timezone

            # The last character is dropped before parsing (presumably the
            # 'Z' UTC designator -- TODO confirm). Treat the value as UTC and
            # let astimezone() convert to local time. The previous code added
            # time.timezone, which is seconds *west* of UTC, so it shifted
            # the time in the wrong direction and ignored DST.
            local_dt = datetime.fromisoformat(written_at[:-1]).replace(tzinfo=timezone.utc).astimezone()
            dt = local_dt.strftime('%b %d, %Y, %I:%M %p')
        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'
        # 'cover' may be missing or null; parse_body() needs a dict.
        lede = ''.join(parse_body(prim_data.get('cover') or {}))

        free_cont = ''.join(
            '\n' + ''.join(parse_body(x)) for x in prim_data['data']['content']
        )

        premium_cont = ''
        if self.logged:
            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
            # A response without premium content must not abort the article.
            premium_cont = ''.join(
                '\n' + ''.join(parse_body(x)) for x in art_cont.get('premiumContent') or ()
            )

        return '<html><body><div>' \
            + cat + title + desc + auth + lede + free_cont + premium_cont + \
            '</div></body></html>'
|