calibre/recipes/time_magazine.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

88 lines
3.5 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import json
from calibre.constants import iswindows
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class TimeMagazine(BasicNewsRecipe):
    """Recipe that downloads the current US issue of TIME Magazine.

    The recipe logs in with the user's subscription credentials, reads the
    JSON blob the site embeds in a ``Time.bootstrap = ...`` script tag, and
    writes each article's embedded content to a local temporary HTML file
    that is then served through :meth:`print_version`.
    """

    title = 'TIME Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    needs_subscription = True
    no_stylesheets = True
    remove_javascript = True

    def get_browser(self):
        """Return a browser that has submitted the TIME login form.

        Side effect: the body of the response to the login submission is
        stored in ``self.time_magazine_page`` for :meth:`parse_index`.
        """
        br = BasicNewsRecipe.get_browser(self)
        base = 'http://subscription-assets.time.com/prod/assets/themes/magazines/SUBS/templates/velocity/site/td-pcslogin/'
        url = base + 'login.html'
        br.open(url)
        br.select_form(nr=0)
        # Post the form to the auth endpoint, passing the login page as
        # ``rurl`` and the magazine page as ``turl``.
        br.form.action = 'https://auth.time.com/login.php?rurl={}&turl={}'.format(
            url, 'http://time.com/magazine')
        br['username'] = self.username
        br['password'] = self.password
        r = br.submit()
        self.time_magazine_page = r.read()
        return br

    def parse_index(self):
        """Build the article list from the page fetched during login.

        Also sets ``self.timefmt``, ``self.cover_url`` and ``self.turl_map``
        (article shortlink -> ``file://`` URL of the generated HTML file).

        Returns:
            A single-feed index: ``[('Articles', [article_dict, ...])]``.

        Raises:
            ValueError: if no ``Time.bootstrap =`` script is found in the page.
        """
        import html5lib
        root = html5lib.parse(
            self.time_magazine_page, treebuilder='lxml', namespaceHTMLElements=False).getroot()
        for script in root.iterdescendants('script'):
            if script.text and script.text.startswith('Time.bootstrap ='):
                # Everything after the first '=' is parsed as JSON.
                data = json.loads(script.text.partition('=')[2].lstrip())
                break
        else:
            raise ValueError(
                'The TIME website has changed, this recipe needs to be rewritten')
        data = data['magazine']['us'][0]
        self.timefmt = ' [%s]' % data['title'].split('|')[0].strip()
        self.cover_url = data['hero']['src']['large']
        articles = []
        self.turl_map = {}
        for article in data['articles']:
            title = article.get('friendly_title') or article.get('short_title')
            if title == 'In the Latest Issue' or 'content' not in article:
                continue
            # Avoid rendering the literal string "None" as the heading when
            # the entry carries neither title field.
            title = title or 'Untitled article'
            url = article['shortlink']
            desc = article.get('excerpt') or ''
            self.log(title, ' at ', url)
            self.log('\t', desc)
            # The per-article hero image is optional: fall back to an empty
            # src, which preprocess_html() strips from the final output.
            try:
                cover_url = article['hero']['src']['large']
            except (LookupError, TypeError):
                cover_url = ''
            # Authors without a bio are skipped instead of raising TypeError
            # on ``str + None``.
            bios = [aut.get('bio') for aut in article.get('authors') or ()]
            authors = ''.join('<p>' + bio + '</p>' for bio in bios if bio)
            # calibre reads the summary from the 'description' key.
            articles.append({'title': title, 'url': url, 'description': desc})
            text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src="{}"></div><div>{}</div></body></html>'.format(
                title, authors, cover_url, article['content'])
            with PersistentTemporaryFile('-time-recipe.html') as f:
                f.write(text.encode('utf-8'))
            # On Windows a '/' is prepended to the temp file name before the
            # 'file://' scheme is added.
            name = ('/' if iswindows else '') + f.name
            self.turl_map[url] = 'file://' + name
        return [('Articles', articles)]

    def print_version(self, url):
        """Map an article shortlink to the local file written by parse_index()."""
        return self.turl_map[url]

    def preprocess_html(self, soup):
        """Resolve lazy-loaded images and drop images that have no source."""
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            img['src'] = img['data-lazy-src']
        # Remove <img> tags whose src is falsy (e.g. the empty hero image
        # placeholder emitted by parse_index()).
        for img in soup.findAll('img', src=lambda x: not x):
            img.extract()
        return soup