# calibre/recipes/time_magazine.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals
import json
from calibre.constants import iswindows
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile


class TimeMagazine(BasicNewsRecipe):
    """Recipe that builds an issue of TIME Magazine for a logged-in subscriber.

    ``get_browser`` logs in and keeps the page returned by the login form.
    ``parse_index`` reads the issue data that page embeds as JSON in a
    ``Time.bootstrap = ...`` script, writes each article's content to a local
    temporary HTML file, and ``print_version`` maps each article URL to the
    ``file://`` URL of that file.
    """

    title = 'TIME Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    needs_subscription = True
    no_stylesheets = True
    remove_javascript = True

    def get_browser(self):
        """Return a browser that has submitted the TIME login form.

        The body of the response to the login submission is stored in
        ``self.time_magazine_page`` for later use by ``parse_index``.
        """
        br = BasicNewsRecipe.get_browser(self)
        # NOTE: for offline debugging, self.time_magazine_page can instead be
        # loaded from a saved copy of the magazine page and br returned here,
        # skipping the login below.
        base = 'http://subscription-assets.time.com/prod/assets/themes/magazines/SUBS/templates/velocity/site/td-pcslogin/'
        url = base + 'login.html'
        br.open(url)
        br.select_form(nr=0)
        # Override the form target: rurl is the login page itself and turl is
        # the magazine page.
        br.form.action = 'https://auth.time.com/login.php?rurl={}&turl={}'.format(
            url, 'http://time.com/magazine')
        br['username'] = self.username
        br['password'] = self.password
        r = br.submit()
        self.time_magazine_page = r.read()
        return br

    def parse_index(self):
        """Build the article index from the JSON embedded in the magazine page.

        Side effects: sets ``self.timefmt`` and ``self.cover_url`` from the
        issue data, and fills ``self.turl_map`` with a mapping from each
        article's short link to a ``file://`` URL of a temporary HTML file
        holding the article's title, author bios, hero image and content.

        Returns:
            A single-section index: ``[('Articles', [article_dict, ...])]``
            where each dict has ``title``, ``url`` and ``description``.

        Raises:
            ValueError: if no ``Time.bootstrap =`` script is found in the page.
        """
        import html5lib
        root = html5lib.parse(
            self.time_magazine_page, treebuilder='lxml', namespaceHTMLElements=False).getroot()
        for script in root.iterdescendants('script'):
            if script.text and script.text.startswith('Time.bootstrap ='):
                # Everything after the first '=' is parsed as the JSON payload.
                data = json.loads(script.text.partition('=')[2].lstrip())
                break
        else:
            raise ValueError(
                'The TIME website has changed, this recipe needs to be rewritten')
        data = data['magazine']['us'][0]
        self.timefmt = ' [%s]' % data['title'].split('|')[0].strip()
        self.cover_url = data['hero']['src']['large']
        articles = []
        self.turl_map = {}
        for article in data['articles']:
            title = article.get('friendly_title') or article.get('short_title')
            # Skip the issue teaser entry and anything without inline content.
            if title == 'In the Latest Issue' or 'content' not in article:
                continue
            url = article['shortlink']
            desc = article.get('excerpt') or ''
            self.log(title, ' at ', url)
            self.log('\t', desc)
            try:
                cover_url = article['hero']['src']['large']
            except Exception:
                # Best effort: an article without a usable hero image gets an
                # empty src, which preprocess_html() later strips out.
                cover_url = ''
            # Authors without a bio are skipped; concatenating a missing
            # (None) bio would raise TypeError and abort the whole index.
            bios = (aut.get('bio') for aut in article.get('authors') or ())
            authors = ''.join('<p>' + bio + '</p>' for bio in bios if bio)
            # calibre reads the summary from the 'description' key.
            articles.append({'title': title, 'url': url, 'description': desc})
            text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src="{}"></div><div>{}</div></body></html>'.format(
                title, authors, cover_url, article['content'])
            with PersistentTemporaryFile('-time-recipe.html') as f:
                f.write(text.encode('utf-8'))
            # On Windows an extra leading slash is prepended to the path
            # before it is turned into a file:// URL.
            name = ('/' if iswindows else '') + f.name
            self.turl_map[url] = 'file://' + name
        return [('Articles', articles)]

    def print_version(self, url):
        """Return the local ``file://`` URL recorded for *url* by ``parse_index``.

        Raises:
            KeyError: if *url* was not registered in ``self.turl_map``.
        """
        return self.turl_map[url]

    def preprocess_html(self, soup):
        """Resolve lazily loaded images and drop images with no source.

        Every ``<img>`` carrying ``data-lazy-src`` has that value copied to
        ``src``; afterwards any ``<img>`` whose ``src`` is missing or empty is
        removed from the tree. Returns the same *soup* object.
        """
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            img['src'] = img['data-lazy-src']
        for img in soup.findAll('img', src=lambda x: not x):
            img.extract()
        return soup