#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import re

from mechanize import Request

from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote


def absolutize(url):
    return 'https://spectator.co.uk' + url


class Spectator(BasicNewsRecipe):

    title = 'Spectator Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Magazine'
    language = 'en'
    no_stylesheets = True
    use_embedded_content = True

    def parse_index(self):
        br = self.get_browser()
        # The API credentials and endpoint URLs are embedded in the site's
        # JavaScript bundle; scrape them out of main.js with a regex
        main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8')
        data = {}
        fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl')
        pat = r'this\.({})\s*=\s*"(.+?)"'.format('|'.join(fields))
        for m in re.finditer(pat, main_js):
            data[m.group(1)] = m.group(2)
        self.log('Got Spectator data:', data)
        # Send the same authentication headers the site's own frontend uses
        headers = {
            'api_key': data['apiKey'],
            'origin': data['siteUrl'],
            'access_token': data['apiSecret'],
            'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-encoding': 'gzip, deflate',
            'Accept': '*/*',
        }

        def make_url(utype, query, includes=(), limit=None):
            # Build an /entries URL for the content API; the query itself is
            # passed as URL-encoded JSON
            ans = data[utype] + '/entries?environment=' + data['contentEnvironment']
            if limit is not None:
                ans += '&limit={}'.format(limit)
            for inc in includes:
                ans += '&include[]=' + inc
            ans += '&query=' + quote(json.dumps(query))
            return ans

        def get_result(url):
            self.log('Fetching:', url)
            req = Request(url, headers=headers)
            raw = br.open_novisit(req).read().decode('utf-8')
            return json.loads(raw)['entries']

        # Get the current issue: newest issue_date first, fetching only its url
        url = data['magazineIssueContentUrl'] + '/entries?environment=' + data[
            'contentEnvironment'] + '&desc=issue_date&limit=1&only[BASE][]=url'
        result = get_result(url)
        slug = result[0]['url']
        uid = result[0]['uid']  # noqa
        date = slug.split('/')[-1]
        self.log('Downloading issue:', date)

        # Cover information
        url = make_url(
            'magazineIssueContentUrl',
            {'url': slug},
            limit=1
        )
        self.cover_url = get_result(url)[0]['magazine_cover']['url']
        self.log('Found cover:', self.cover_url)

        # List of articles belonging to this issue
        url = make_url(
            'contentUrl',
            {
                "magazine_content_production_only.magazine_issue": {
                    "$in_query": {"url": slug},
                    "_content_type_uid": "magazine_issue"
                },
                "_content_type_uid": "article"
            },
            includes=(
                'topic', 'magazine_content_production_only.magazine_issue',
                'magazine_content_production_only.magazine_subsection',
                'author'
            )
        )
        result = get_result(url)
        articles = {}
        for entry in result:
            title = entry['title']
            url = absolutize(entry['url'])
            blocks = []
            a = blocks.append
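            # Assemble the article HTML by hand; since use_embedded_content is
            # True, calibre uses this assembled 'content' as the article body
            # instead of downloading each article URL separately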
            byline = entry.get('byline') or ''
            if byline:
                a('<h3>{}</h3>'.format(byline))
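            # Author credits: the author's caricature (when present) and a
            # link to the author's page on the site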
            if entry.get('author'):
                for au in reversed(entry['author']):
                    cac = ''
                    if au.get('caricature'):
                        cac = '<img src="{}">'.format(au['caricature']['url'])
                    a('<div>{} <a href="{}">{}</a></div>'.format(
                        cac, absolutize(au['url']), au['title']))
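            # Lead ("hero") image, if the article has one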
            if entry.get('hero_image'):
                hi = entry['hero_image'][0]
                a('<img src="{}">'.format(hi['url']))
            a(entry['text_body'])
            # Group articles into sections by their first topic
            section = 'Unknown'
            if entry.get('topic'):
                topic = entry['topic'][0]
                section = topic['title']
            articles.setdefault(section, []).append({
                'title': title, 'url': url, 'description': byline,
                'content': '\n\n'.join(blocks)})
        return [(sec, articles[sec]) for sec in sorted(articles)]
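
# A quick way to test a recipe like this is calibre's standard command-line
# workflow (the file name here is an assumption; use whatever name this
# recipe is saved under):
#   ebook-convert spectator_magazine.recipe output.epub --test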