# calibre/recipes/spectator_magazine.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import re

from mechanize import Request

from calibre.web.feeds.recipes import BasicNewsRecipe

try:
    from urllib.parse import quote
except ImportError:
    from urllib import quote


def absolutize(url):
    """Return *url* as an absolute URL on https://spectator.co.uk.

    URLs that already carry an http:// or https:// scheme are returned
    unchanged instead of being prefixed a second time. A non-empty path
    that lacks a leading slash gets one so that it is not glued onto the
    host name.
    """
    if url.startswith(('http://', 'https://')):
        return url
    if url and not url.startswith('/'):
        url = '/' + url
    return 'https://spectator.co.uk' + url

class Spectator(BasicNewsRecipe):
    """Recipe that downloads the most recent issue of the Spectator magazine.

    The API endpoints and credentials are scraped from the site's main.js
    and then used to query the site's JSON content API for the latest
    issue, its cover and its articles.
    """

    title = 'Spectator Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Magazine'
    language = 'en'

    no_stylesheets = True
    # parse_index() assembles the HTML of every article itself and passes it
    # along in the 'content' key of each article dict.
    use_embedded_content = True

    def parse_index(self):
        """Build the index of the most recent issue.

        As a side effect ``self.cover_url`` is set to the URL of the issue's
        cover image.

        Returns a list of ``(section title, articles)`` tuples sorted by
        section title. Each article is a dict with the keys ``title``,
        ``url``, ``description`` and ``content``.

        Raises KeyError if any of the required configuration fields cannot
        be found in main.js.
        """
        br = self.get_browser()
        main_js = br.open_novisit('https://spectator.co.uk/main.js').read().decode('utf-8')
        fields = ('apiKey', 'apiSecret', 'contentEnvironment', 'siteUrl', 'magazineIssueContentUrl', 'contentUrl')
        # The dot is escaped so that only real ``this.<field> = "..."``
        # assignments match.
        pat = r'this\.({})\s*=\s*"(.+?)"'.format('|'.join(fields))
        data = {}
        for m in re.finditer(pat, main_js):
            data[m.group(1)] = m.group(2)
        self.log('Got Spectator data:', data)
        # Every field is needed further down; fail early, before any API
        # request is made, with a message that names what is missing.
        missing = [name for name in fields if name not in data]
        if missing:
            raise KeyError('Could not find {} in main.js'.format(', '.join(missing)))
        headers = {
            'api_key': data['apiKey'],
            'origin': data['siteUrl'],
            'access_token': data['apiSecret'],
            'Accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-encoding': 'gzip, deflate',
            'Accept': '*/*',
        }

        def make_url(utype, query, includes=(), limit=None):
            # utype names the entry of ``data`` holding the base URL; query
            # is serialized to JSON and URL-quoted.
            ans = data[utype] + '/entries?environment=' + data['contentEnvironment']
            if limit is not None:
                ans += '&limit={}'.format(limit)
            for inc in includes:
                ans += '&include[]=' + inc
            ans += '&query=' + quote(json.dumps(query))
            return ans

        def get_result(url):
            # Fetch url with the API headers and return the 'entries' value
            # of the decoded JSON response.
            self.log('Fetching:', url)
            req = Request(url, headers=headers)
            raw = br.open_novisit(req).read().decode('utf-8')
            return json.loads(raw)['entries']

        # Get current issue
        url = data['magazineIssueContentUrl'] + '/entries?environment=' + data['contentEnvironment'] + "&desc=issue_date&limit=1&only[BASE][]=url"
        result = get_result(url)
        slug = result[0]['url']
        # The last path component of the issue URL is logged as its date.
        date = slug.split('/')[-1]
        self.log('Downloading issue:', date)

        # Cover information
        url = make_url(
            'magazineIssueContentUrl',
            {'url': slug},
            limit=1
        )
        self.cover_url = get_result(url)[0]['magazine_cover']['url']
        self.log('Found cover:', self.cover_url)

        # List of articles
        url = make_url(
            'contentUrl',
            {
                "magazine_content_production_only.magazine_issue": {
                    "$in_query": {"url": slug},
                    "_content_type_uid": "magazine_issue"
                },
                "_content_type_uid": "article"
            },
            includes=(
                'topic', 'magazine_content_production_only.magazine_issue',
                'magazine_content_production_only.magazine_subsection', 'author'
            )
        )
        result = get_result(url)
        articles = {}
        for entry in result:
            title = entry['title']
            article_url = absolutize(entry['url'])
            blocks = []
            a = blocks.append
            byline = entry.get('byline') or ''
            if byline:
                a('<h3>{}</h3>'.format(byline))
            if entry.get('author'):
                for au in reversed(entry['author']):
                    cac = ''
                    if au.get('caricature'):
                        cac = '<div><img style="max-width: 80px" src="{}"></div>'.format(au['caricature']['url'])
                    a('<div>{} <a href="{}">{}</a></div>'.format(cac, absolutize(au['url']), au['title']))

            if entry.get('hero_image'):
                hi = entry['hero_image'][0]
                a('<div style="text-align: center"><img src="{}"></div>'.format(hi['url']))
            # A missing or null body must not abort the whole issue: joining
            # None into the article HTML below would raise TypeError.
            a(entry.get('text_body') or '')
            # Articles are grouped by their first topic, if they have one.
            section = 'Unknown'
            if entry.get('topic'):
                topic = entry['topic'][0]
                section = topic['title']
            articles.setdefault(section, []).append({
                'title': title, 'url': article_url, 'description': byline, 'content': '\n\n'.join(blocks)})
        return [(sec, articles[sec]) for sec in sorted(articles)]