calibre/recipes/spectator_magazine.recipe
Kovid Goyal 792c2ca847
Update Spectator Magazine
Fixes #1902397 [Spectator Magazine no author names](https://bugs.launchpad.net/calibre/+bug/1902397)
2020-11-01 06:56:07 +05:30

127 lines
4.4 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import re
from mechanize import Request
from calibre.web.feeds.recipes import BasicNewsRecipe
try:
from urllib.parse import quote
except ImportError:
from urllib import quote
def absolutize(url):
    """Return *url* as an absolute https://spectator.co.uk URL.

    Root-relative paths (``/foo``) get the site origin prepended. URLs that
    already carry an ``http://`` or ``https://`` scheme are returned
    unchanged, protocol-relative URLs (``//host/path``) are given an
    ``https:`` scheme, and a path without a leading slash has one inserted so
    that it is not glued directly onto the host name.
    """
    base = 'https://spectator.co.uk'
    if url.startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        return 'https:' + url
    # An empty string is left alone so the bare origin is returned, as before.
    if url and not url.startswith('/'):
        url = '/' + url
    return base + url
class Spectator(BasicNewsRecipe):
    """Recipe that assembles the current issue of the Spectator magazine.

    The issue is built from JSON returned by the content API whose settings
    are read out of the site's ``main.js``. Every article dict produced by
    :meth:`parse_index` carries its complete HTML in the ``content`` key.
    """

    title = 'Spectator Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Magazine'
    language = 'en'
    no_stylesheets = True
    # parse_index() supplies each article's HTML inline via 'content'.
    use_embedded_content = True

    def parse_index(self):
        """Return the current issue as ``[(section, [article, ...]), ...]``.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``content`` keys. Sections are ordered by ``sorted()`` on their
        titles; articles whose entry has no topic go into ``'Unknown'``.

        Side effect: sets ``self.cover_url`` when the issue entry provides a
        cover image.

        Raises:
            KeyError: if a required API setting cannot be found in main.js.
        """
        br = self.get_browser()
        data = self._spectator_settings(br)
        headers = {
            'api_key': data['apiKey'],
            'origin': data['siteUrl'],
            'access_token': data['apiSecret'],
            'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-encoding': 'gzip, deflate',
            'Accept': '*/*',
        }

        def make_url(utype, query, includes=(), limit=None):
            # Build an entries URL for the endpoint stored under data[utype];
            # the query is sent as URL-quoted JSON.
            ans = data[utype] + '/entries?environment=' + data['contentEnvironment']
            if limit is not None:
                ans += '&limit={}'.format(limit)
            for inc in includes:
                ans += '&include[]=' + inc
            ans += '&query=' + quote(json.dumps(query))
            return ans

        def get_result(url):
            # Fetch url with the API headers and return the 'entries' list.
            self.log('Fetching:', url)
            req = Request(url, headers=headers)
            raw = br.open_novisit(req).read().decode('utf-8')
            return json.loads(raw)['entries']

        # Get current issue
        url = data['magazineIssueContentUrl'] + '/entries?environment=' + data['contentEnvironment'] + "&desc=issue_date&limit=1&only[BASE][]=url"
        result = get_result(url)
        slug = result[0]['url']
        date = slug.split('/')[-1]
        self.log('Downloading issue:', date)

        # Cover information
        url = make_url(
            'magazineIssueContentUrl',
            {'url': slug},
            limit=1
        )
        cover_entries = get_result(url)
        try:
            self.cover_url = cover_entries[0]['magazine_cover']['url']
        except (IndexError, KeyError, TypeError):
            # A missing cover should not prevent the issue from downloading.
            self.log('No cover found for issue:', date)
        else:
            self.log('Found cover:', self.cover_url)

        # List of articles
        url = make_url(
            'contentUrl',
            {
                "magazine_content_production_only.magazine_issue": {
                    "$in_query": {"url": slug},
                    "_content_type_uid": "magazine_issue"
                },
                "_content_type_uid": "article"
            },
            includes=(
                'topic', 'magazine_content_production_only.magazine_issue',
                'magazine_content_production_only.magazine_subsection', 'author'
            )
        )
        articles = {}
        for entry in get_result(url):
            section, article = self._spectator_article(entry)
            articles.setdefault(section, []).append(article)
        return [(sec, articles[sec]) for sec in sorted(articles)]

    def _spectator_settings(self, br):
        # Fetch main.js and return the API settings assigned in it as a dict.
        main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8')
        fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl')
        # The dot is escaped so that only literal `this.<field> = "..."`
        # assignments are matched.
        pat = r'this\.({})\s*=\s*"(.+?)"'.format('|'.join(fields))
        data = {}
        for m in re.finditer(pat, main_js):
            data[m.group(1)] = m.group(2)
        self.log('Got Spectator data:', data)
        missing = [name for name in fields if name not in data]
        if missing:
            # Every field is needed later; fail here with a message that
            # names what is missing instead of a bare lookup error.
            raise KeyError('Spectator main.js is missing settings: ' + ', '.join(missing))
        return data

    def _spectator_article(self, entry):
        # Convert one API article entry into a (section title, article dict) pair.
        blocks = []
        a = blocks.append
        byline = entry.get('byline') or ''
        if byline:
            a('<h3>{}</h3>'.format(byline))
        # Authors are emitted in the reverse of the order the API lists them.
        for au in reversed(entry.get('author') or ()):
            cac = ''
            if au.get('caricature'):
                cac = '<div><img style="max-width: 80px" src="{}"></div>'.format(au['caricature']['url'])
            a('<div>{} <a href="{}">{}</a></div>'.format(cac, absolutize(au['url']), au['title']))
        if entry.get('hero_image'):
            hi = entry['hero_image'][0]
            a('<div style="text-align: center"><img src="{}"></div>'.format(hi['url']))
        # A missing or null body yields an empty article rather than an error.
        a(entry.get('text_body') or '')
        section = 'Unknown'
        if entry.get('topic'):
            # Only the first topic decides the section.
            section = entry['topic'][0].get('title') or section
        article = {
            'title': entry['title'],
            'url': absolutize(entry['url']),
            'description': byline,
            'content': '\n\n'.join(blocks),
        }
        return section, article