mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
109 lines
3.8 KiB
Python
109 lines
3.8 KiB
Python
#!/usr/bin/env python
|
|
from __future__ import (unicode_literals, division, absolute_import,
|
|
print_function)
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
|
|
'''
|
|
time.com
|
|
'''
|
|
|
|
from calibre.web.feeds.jsnews import JavascriptRecipe
|
|
from lxml import html
|
|
|
|
# Keep the login method as standalone, so it can be easily tested
|
|
def do_login(browser, username, password):
    '''
    Sign in to time.com with the supplied browser.

    Opens the magazine page, fills in and submits the sign-in form, then
    waits for the page body to be flagged as signed in.

    :param browser: browser object used to load the page and drive the form
    :param username: value entered into the form's ``username`` field
    :param password: value entered into the form's ``password`` field
    :raises ValueError: if the signed-in marker does not appear before the
        wait times out
    '''
    from calibre.web.jsbrowser.browser import Timeout

    browser.visit('http://time.com/magazine')
    login_form = browser.select_form('#sign-in-form')
    for field, value in (('username', username), ('password', password)):
        login_form[field] = value
    browser.submit('#Sign_In')

    # The site adds the is-signed-in class to <body> after a successful
    # login, so a timeout here is reported as a failed login.
    try:
        browser.wait_for_element('body.is-signed-in', timeout=180)
    except Timeout:
        raise ValueError('Failed to login to time.com, check your username and password and try again in a little while.')
|
|
|
|
def evaljs(elem, js):
    '''
    Evaluate the JavaScript snippet ``js`` on ``elem`` and return the result
    as a unicode string.
    '''
    result = elem.evaluateJavaScript(js)
    # Needed to work with both PyQt4 and PyQt5: when the result offers a
    # toString() method it is used, otherwise the result is converted
    # directly.
    try:
        text = unicode(result.toString())
    except AttributeError:
        text = unicode(result)
    return text
|
|
|
|
class Time(JavascriptRecipe):
    '''
    Recipe for the weekly magazine at time.com.

    Requires a subscription login. The article list and the cover are both
    taken from the magazine page that the browser is on after logging in.
    '''

    # Recipe metadata
    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    needs_subscription = True
    requires_version = (0, 9, 35)

    # Article clean-up: CSS selectors of what to keep and what to drop
    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = ['.article-viewport .full-article']
    remove_tags = [
        '.read-more-list',
        '.read-more-inline',
        '.article-footer',
        '.subscribe',
        '.tooltip',
        '#first-visit',
    ]

    def do_login(self, browser, username, password):
        '''Log in by delegating to the standalone module-level do_login().'''
        do_login(browser, username, password)

    def get_time_cover(self, browser):
        '''
        Fetch the cover of the issue shown in the articles rail.

        Reads the ``src`` of the magazine thumbnail, replaces its query
        string with ``?w=814`` to ask for the large cover, and returns
        whatever ``browser.get_resource()`` gives back for that URL.
        '''
        thumbnail = browser.css_select('#rail-articles img.magazine-thumb')
        thumbnail_src = evaljs(thumbnail, 'this.src')
        # URL for large cover: drop any existing query string first
        base_url, _sep, _query = thumbnail_src.partition('?')
        return browser.get_resource(base_url + '?w=814')

    def get_publication_data(self, browser):
        '''
        Build the article index and cover from the current browser page.

        Sets ``self.timefmt`` when an issue date is present on the page.

        :return: dict with ``'index'`` (a single ``'Articles'`` section
            holding the article dicts) and ``'cover'``
        '''
        # We are already at the magazine page thanks to the do_login() method
        root = html.fromstring(browser.html)

        issue_date = ''.join(root.xpath('//*[@class="rail-article-magazine-issue"]/date/text()'))
        if issue_date:
            self.timefmt = ' [%s]'%issue_date

        rail = root.xpath('//section[@id="rail-articles"]')[0]
        articles = []
        for heading in rail.xpath('descendant::h3[contains(@class, "rail-article-title")]'):
            title = html.tostring(heading[0], encoding=unicode, method='text').strip()
            url = heading.xpath('descendant::a[@href]')[0].get('href')
            excerpts = heading.xpath('following-sibling::h2[@class="rail-article-excerpt"]')
            if excerpts:
                desc = html.tostring(excerpts[0], encoding=unicode, method='text').strip()
            else:
                desc = ''
            # This heading is a section label in the rail, not an article
            # (title has already been stripped above)
            if title == 'In the Latest Issue':
                continue
            self.log('\nFound article:', title)
            self.log('\t' + desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        return {
            'index': [('Articles', articles)],
            'cover': self.get_time_cover(browser),
        }

    def load_complete(self, browser, url, rl):
        '''Wait for the article footer element to appear, then return True.'''
        browser.wait_for_element('footer.article-footer')
        return True

    def postprocess_html(self, article, root, url, recursion_level):
        '''
        Strip every element with id ``first-visit`` from the article tree
        and return the tree.
        '''
        # remove_tags lists #first-visit as well, but for some reason does
        # not remove it, so it is done by hand here
        for leftover in root.xpath('//*[@id="first-visit"]'):
            container = leftover.getparent()
            container.remove(leftover)
        return root
|
|
|
|
if __name__ == '__main__':
    # Manual test of the login: the last two command line arguments are the
    # username and the password, in that order.
    import sys
    from calibre import jsbrowser

    test_browser = jsbrowser(default_timeout=120)
    credentials = (sys.argv[-2], sys.argv[-1])
    do_login(test_browser, *credentials)
    # Display the browser so the logged-in page can be inspected
    test_browser.show_browser()
|