TIME recipe no longer uses Qt WebKit

This commit is contained in:
Kovid Goyal 2016-04-24 22:16:10 +05:30
parent d8457fe972
commit 9f4b6f9991

View File

@ -1,73 +1,59 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (unicode_literals, division, absolute_import, from __future__ import (unicode_literals, division, absolute_import,
print_function) print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''
from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.ptempfile import PersistentTemporaryFile
from lxml import html
import json import json
from calibre.constants import iswindows
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
# The login routine lives at module level so that it can be tried out on
# its own, independently of the recipe class.
def do_login(browser, username, password):
    '''
    Sign in to time.com through the given browser object.

    The subscription login page is visited, the ``#sign-in-form`` form is
    filled in with ``username`` and ``password`` and submitted via the
    ``#sign-in-button`` element. The function then waits for
    ``body.is-signed-in`` to appear, using ``timeout=180`` (presumably
    seconds -- confirm against the browser API).

    Raises ValueError when that wait ends in a Timeout.
    '''
    from calibre.web.jsbrowser.browser import Timeout

    login_page = 'http://subscription-assets.time.com/prod/assets/themes/magazines/SUBS/templates/velocity/site/td-pcslogin/login.html'
    browser.visit(login_page)

    credentials = browser.select_form('#sign-in-form')
    credentials['username'] = username
    credentials['password'] = password
    browser.submit('#sign-in-button')

    try:
        # The signed-in marker on <body> is what signals a successful login
        browser.wait_for_element('body.is-signed-in', timeout=180)
    except Timeout:
        raise ValueError(
            'Failed to login to time.com, check your username and password and try again in a little while.')
class Time(JavascriptRecipe): class TimeMagazine(BasicNewsRecipe):
title = u'Time'
__author__ = 'Kovid Goyal'
description = 'Weekly US magazine.'
language = 'en'
needs_subscription = True
requires_version = (0, 9, 35)
no_stylesheets = True title = 'TIME Magazine'
remove_javascript = True __author__ = 'Kovid Goyal'
time_initial_phase = True __author__ = 'Kovid Goyal'
description = 'Weekly US magazine.'
language = 'en'
needs_subscription = True
no_stylesheets = True
remove_javascript = True
def do_login(self, browser, username, password): def get_browser(self):
do_login(browser, username, password) br = BasicNewsRecipe.get_browser(self)
# self.time_magazine_page = open('/t/raw.html').read()
def print_version(self, url): # return br
return self.turl_map[url] base = 'http://subscription-assets.time.com/prod/assets/themes/magazines/SUBS/templates/velocity/site/td-pcslogin/'
url = base + 'login.html'
def get_publication_data(self, browser): br.open(url)
# raw = open('/t/time.html', 'rb').read().decode('utf-8') br.select_form(nr=0)
browser.visit('http://time.com/magazine') br.form.action = 'https://auth.time.com/login.php?rurl={}&turl={}'.format(
raw = browser.html url, 'http://time.com/magazine')
br['username'] = self.username
root = html.fromstring(raw) br['password'] = self.password
self.time_initial_phase = False r = br.submit()
dates = ''.join(root.xpath('//*[@class="rail-article-magazine-issue"]/date/text()')) # print(111111, r.geturl())
if dates: self.time_magazine_page = r.read()
self.timefmt = ' [%s]'%dates return br
def parse_index(self):
import html5lib
root = html5lib.parse(
self.time_magazine_page, treebuilder='lxml', namespaceHTMLElements=False).getroot()
for script in root.iterdescendants('script'): for script in root.iterdescendants('script'):
if script.text and script.text.startswith('Time.bootstrap ='): if script.text and script.text.startswith('Time.bootstrap ='):
data = json.loads(script.text.partition('=')[2].lstrip()) data = json.loads(script.text.partition('=')[2].lstrip())
break break
else: else:
raise ValueError('The time website has changed, this recipe needs to be rewritten') raise ValueError(
'The TIME website has changed, this recipe needs to be rewritten')
data = data['magazine']['us'][0] data = data['magazine']['us'][0]
self.turl_map = {} self.timefmt = ' [%s]' % data['title'].split('|')[0].strip()
ans = {} self.cover_url = data['hero']['src']['large']
articles = [] articles = []
self.turl_map = {}
for article in data['articles']: for article in data['articles']:
title = article.get('friendly_title') or article.get('short_title') title = article.get('friendly_title') or article.get('short_title')
if title == 'In the Latest Issue' or 'content' not in article: if title == 'In the Latest Issue' or 'content' not in article:
@ -83,32 +69,21 @@ class Time(JavascriptRecipe):
authors = '' authors = ''
for aut in article.get('authors') or (): for aut in article.get('authors') or ():
authors += '<p>' + aut.get('bio') + '</p>' authors += '<p>' + aut.get('bio') + '</p>'
articles.append({'title':title, 'url':url, 'desc':desc}) articles.append({'title': title, 'url': url, 'desc': desc})
text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src={}></div><div>{}</div></body></html>'.format( text = '<html><head><meta charset="utf-8"></head><body><h1>{}</h1>{}<div><img src="{}"></div><div>{}</div></body></html>'.format(
title, authors, cover_url, article['content']) title, authors, cover_url, article['content'])
with PersistentTemporaryFile('-time-recipe.html') as f: with PersistentTemporaryFile('-time-recipe.html') as f:
f.write(text.encode('utf-8')) f.write(text.encode('utf-8'))
self.turl_map[url] = 'file:///' + f.name name = ('/' if iswindows else '') + f.name
# from pprint import pprint self.turl_map[url] = 'file://' + name
# pprint(data['hero']) return [('Articles', articles)]
ans['cover'] = browser.get_resource(data['hero']['src']['large']) def print_version(self, url):
ans['index'] = [('Articles', articles)] return self.turl_map[url]
return ans
def preprocess_stage1(self, article, browser, url, recursion_level): def preprocess_html(self, soup):
for img in browser.css_select('img[data-lazy-src]', all=True): for img in soup.findAll('img', attrs={'data-lazy-src':True}):
img.setAttribute('src', img.attribute('data-lazy-src')) img['src'] = img['data-lazy-src']
for img in soup.findAll('img', src=lambda x: not x):
def load_complete(self, browser, url, rl): img.extract()
if self.time_initial_phase: return soup
browser.wait_for_element('footer.article-footer')
return True
if __name__ == '__main__':
    # Manual check of the login flow: the last two command line arguments
    # are handed to do_login() as the username and the password.
    import sys
    from calibre import jsbrowser

    login_browser = jsbrowser(default_timeout=120)
    user, secret = sys.argv[-2], sys.argv[-1]
    do_login(login_browser, user, secret)
    # Bring up the browser window so the result can be inspected
    login_browser.show_browser()