calibre/recipes/time_magazine.recipe

70 lines
2.7 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import json
from calibre.constants import iswindows
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
class TimeMagazine(BasicNewsRecipe):
    """Recipe that builds an issue of TIME from the data embedded in the
    magazine landing page.

    The article bodies are taken from the ``Time.bootstrap`` JSON found in a
    ``<script>`` element of https://time.com/magazine, written to temporary
    HTML files, and served back through :meth:`print_version` as ``file://``
    URLs.
    """

    title = 'TIME Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        """Build the article index for the first US issue in the page data.

        Side effects: sets ``self.timefmt``, ``self.cover_url`` and
        ``self.turl_map`` (article URL -> ``file://`` URL of a temporary
        HTML file holding that article's content).

        Returns:
            A one-element list ``[('Articles', articles)]`` where each
            article is a dict with ``title``, ``url`` and ``desc`` keys.

        Raises:
            ValueError: if no ``<script>`` whose text starts with
                ``Time.bootstrap =`` is found on the page.
        """
        root = self.index_to_soup('https://time.com/magazine', as_tree=True)
        for script in root.iterdescendants('script'):
            if script.text and script.text.startswith('Time.bootstrap ='):
                # Everything after the first '=' is parsed as JSON.
                data = json.loads(script.text.partition('=')[2].lstrip())
                break
        else:
            # Loop finished without ``break``: the bootstrap script is gone.
            raise ValueError(
                'The TIME website has changed, this recipe needs to be rewritten')

        data = data['magazine']['us'][0]
        self.timefmt = ' [%s]' % data['title'].split('|')[0].strip()
        self.cover_url = data['hero']['src']['large']

        articles = []
        self.turl_map = {}
        for article in data['articles']:
            title = article.get('friendly_title') or article.get('short_title')
            if title == 'In the Latest Issue' or 'content' not in article:
                continue
            url = article['shortlink'].replace('?p=', '')
            desc = article.get('excerpt') or ''
            self.log(title, ' at ', url)
            self.log('\t', desc)

            # The per-article image is optional: any missing or null level
            # of the hero/src/large chain yields an empty src, and
            # preprocess_html later drops <img> tags with an empty src.
            try:
                cover_url = article['hero']['src']['large'] or ''
            except (LookupError, TypeError):
                cover_url = ''

            # Authors without a bio are skipped; concatenating a missing
            # (None) bio would raise TypeError and abort the download.
            bios = (aut.get('bio') for aut in article.get('authors') or ())
            authors = ''.join('<p>' + bio + '</p>' for bio in bios if bio)

            articles.append({'title': title, 'url': url, 'desc': desc})
            text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src="{}"></div><div>{}</div></body></html>'.format(
                title, authors, cover_url, article['content'])
            with PersistentTemporaryFile('-time-recipe.html') as f:
                f.write(text.encode('utf-8'))
            # On Windows an extra leading '/' is prepended to the file name
            # before it is turned into a file:// URL.
            name = ('/' if iswindows else '') + f.name
            self.turl_map[url] = 'file://' + name
        return [('Articles', articles)]

    def print_version(self, url):
        """Return the local ``file://`` URL recorded for ``url`` by
        :meth:`parse_index`.

        Raises:
            KeyError: if ``url`` was not registered in ``self.turl_map``.
        """
        return self.turl_map[url]

    def preprocess_html(self, soup):
        """Resolve lazily-loaded images and drop images that have no source.

        Every ``<img>`` carrying a ``data-lazy-src`` attribute gets that
        value copied into ``src``; afterwards every ``<img>`` whose ``src``
        is missing or empty is removed from the tree.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            img['src'] = img['data-lazy-src']
        for img in soup.findAll('img', src=lambda x: not x):
            img.extract()
        return soup