# calibre/recipes/time_magazine.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json

from calibre.constants import iswindows
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe


class TimeMagazine(BasicNewsRecipe):
    """Recipe that builds an issue of TIME from https://time.com/magazine.

    Article bodies are read from the JSON embedded in the magazine page (the
    ``Time.bootstrap`` script), written to temporary HTML files, and exposed
    to the downloader through :meth:`print_version` as ``file://`` URLs.
    """

    title = 'TIME Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    no_stylesheets = True
    remove_javascript = True

    def _load_issue_data(self):
        """Return the first US issue entry from the page's bootstrap JSON.

        Raises:
            ValueError: if no ``Time.bootstrap =`` script is found on the page.
        """
        root = self.index_to_soup('https://time.com/magazine', as_tree=True)
        for script in root.iterdescendants('script'):
            if script.text and script.text.startswith('Time.bootstrap ='):
                # Everything after the first '=' is the JSON payload.
                bootstrap = json.loads(script.text.partition('=')[2].lstrip())
                return bootstrap['magazine']['us'][0]
        raise ValueError(
            'The TIME website has changed, this recipe needs to be rewritten')

    @staticmethod
    def _author_markup(article):
        """Return one ``<p>`` per author bio, skipping authors without a bio."""
        bios = (aut.get('bio') for aut in article.get('authors') or ())
        return ''.join('<p>{}</p>'.format(bio) for bio in bios if bio)

    def parse_index(self):
        """Build the article list and write each article to a temporary file.

        Sets ``self.timefmt`` and ``self.cover_url`` from the issue data and
        fills ``self.turl_map`` (article URL -> ``file://`` URL of the local
        copy), which :meth:`print_version` consults.

        Returns:
            A single-section index: ``[('Articles', [article dicts])]``.

        Raises:
            ValueError: if the bootstrap script cannot be found on the page.
        """
        data = self._load_issue_data()
        self.timefmt = ' [%s]' % data['title'].split('|')[0].strip()
        self.cover_url = data['hero']['src']['large']
        articles = []
        self.turl_map = {}
        for article in data['articles']:
            title = article.get('friendly_title') or article.get('short_title')
            if title == 'In the Latest Issue' or 'content' not in article:
                continue
            # Avoid rendering a literal "None" heading when both keys are absent.
            title = title or 'Untitled article'
            shortlink = article.get('shortlink')
            if not shortlink:
                # One malformed entry should not abort the whole issue.
                self.log('Skipping article without a shortlink: ', title)
                continue
            url = shortlink.replace('?p=', '')
            desc = article.get('excerpt') or ''
            self.log(title, ' at ', url)
            self.log('\t', desc)
            try:
                cover_url = article['hero']['src']['large']
            except (LookupError, TypeError):
                # No usable hero image; the empty-src <img> written below is
                # removed again by preprocess_html().
                cover_url = ''
            authors = self._author_markup(article)
            # 'description' is the summary key documented for parse_index();
            # 'desc' is kept so anything reading the old key keeps working.
            articles.append({
                'title': title, 'url': url,
                'description': desc, 'desc': desc,
            })
            text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src="{}"></div><div>{}</div></body></html>'.format(
                title, authors, cover_url, article['content'])
            with PersistentTemporaryFile('-time-recipe.html') as f:
                f.write(text.encode('utf-8'))
            # Windows paths do not start with '/', so add one after 'file://'.
            name = ('/' if iswindows else '') + f.name
            self.turl_map[url] = 'file://' + name
        return [('Articles', articles)]

    def print_version(self, url):
        """Return the ``file://`` URL that parse_index() recorded for *url*."""
        return self.turl_map[url]

    def preprocess_html(self, soup):
        """Copy ``data-lazy-src`` into ``src`` and drop images with no source."""
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            img['src'] = img['data-lazy-src']
        for img in soup.findAll('img', src=lambda x: not x):
            img.extract()
        return soup