#!/usr/bin/env python2
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '

'''
time.com
'''

from calibre.web.feeds.jsnews import JavascriptRecipe
from lxml import html


# Keep the login method as standalone, so it can be easily tested
def do_login(browser, username, password):
    """Sign in to time.com through the subscription login page.

    Visits the login page, fills in the ``#sign-in-form`` form with the
    given credentials, submits it and waits (up to 180 seconds) for the
    ``body.is-signed-in`` element to appear.

    Raises:
        ValueError: if the signed-in marker does not appear before the
            wait times out.
    """
    # Imported locally so the function stays usable on its own.
    from calibre.web.jsbrowser.browser import Timeout
    browser.visit(
        'http://subscription-assets.time.com/prod/assets/themes/magazines/SUBS/templates/velocity/site/td-pcslogin/login.html')
    form = browser.select_form('#sign-in-form')
    form['username'] = username
    form['password'] = password
    browser.submit('#sign-in-button')
    try:
        browser.wait_for_element('body.is-signed-in', timeout=180)
    except Timeout:
        raise ValueError(
            'Failed to login to time.com, check your username and password and try again in a little while.')


def evaljs(elem, js):
    """Evaluate ``js`` on ``elem`` and return the result as unicode text."""
    # Need this to work with both PyQt4 and PyQt5
    ret = elem.evaluateJavaScript(js)
    try:
        # Result objects exposing toString() are converted through it ...
        return unicode(ret.toString())
    except AttributeError:
        # ... anything else is converted directly.
        return unicode(ret)


class Time(JavascriptRecipe):
    """Recipe that downloads the current issue of Time magazine."""

    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    needs_subscription = True
    requires_version = (0, 9, 35)

    no_stylesheets = True
    remove_javascript = True
    keep_only_tags = ['.article-viewport .full-article']
    remove_tags = ['.read-more-list', '.read-more-inline', '.article-footer',
                   '.subscribe', '.tooltip', '#first-visit']

    def do_login(self, browser, username, password):
        """Log in by delegating to the module-level ``do_login``."""
        do_login(browser, username, password)

    def get_time_cover(self, browser):
        """Return the resource fetched for the current issue's cover image.

        The thumbnail's ``src`` is read from the page, its query string is
        replaced with ``?w=814`` and the resulting URL is fetched with
        ``browser.get_resource``.
        """
        selector = '#rail-articles img.magazine-thumb'
        cover = browser.css_select(selector)
        # URL for large cover
        cover_url = evaljs(cover, 'this.src')
        cover_url = cover_url.partition('?')[0] + '?w=814'
        return browser.get_resource(cover_url)

    def get_publication_data(self, browser):
        """Build the article index and cover from http://time.com/magazine.

        Returns a dict with ``'index'`` (a single ``('Articles', articles)``
        section) and ``'cover'`` (the value of ``get_time_cover``). When an
        issue date is found on the page, ``self.timefmt`` is set from it.
        """
        browser.visit('http://time.com/magazine')
        ans = {}
        raw = browser.html
        root = html.fromstring(raw)

        dates = ''.join(root.xpath('//*[@class="rail-article-magazine-issue"]/date/text()'))
        if dates:
            self.timefmt = ' [%s]'%dates

        parent = root.xpath('//section[@id="rail-articles"]')[0]
        articles = []
        for h3 in parent.xpath(
                'descendant::h3[contains(@class, "rail-article-title")]'):
            # The title is read from the heading's first child element; a
            # heading without children cannot provide one, so skip it rather
            # than abort the whole download with an IndexError.
            if not len(h3):
                self.log('\nSkipping heading without child elements')
                continue
            title = html.tostring(h3[0], encoding=unicode, method='text').strip()
            # Check for the section header before looking for a link, so a
            # header without a link is skipped instead of raising.
            if title == 'In the Latest Issue':
                continue
            links = h3.xpath('descendant::a[@href]')
            if not links:
                self.log('\nSkipping heading without a link:', title)
                continue
            url = links[0].get('href')
            h2 = h3.xpath('following-sibling::h2[@class="rail-article-excerpt"]')
            desc = ''
            if h2:
                desc = html.tostring(h2[0], encoding=unicode, method='text').strip()
            self.log('\nFound article:', title)
            self.log('\t' + desc)
            articles.append({'title':title, 'url':url, 'date':'', 'description':desc})

        ans['index'] = [('Articles', articles)]
        ans['cover'] = self.get_time_cover(browser)
        return ans

    def load_complete(self, browser, url, rl):
        """Wait for the article footer element to appear, then return True."""
        browser.wait_for_element('footer.article-footer')
        return True

    def postprocess_html(self, article, root, url, recursion_level):
        """Strip every element with id ``first-visit`` and return ``root``."""
        # get rid of the first visit div which for some reason remove_tags is
        # not removing
        for div in root.xpath('//*[@id="first-visit"]'):
            div.getparent().remove(div)
        return root


if __name__ == '__main__':
    # Test the login
    import sys
    from calibre import jsbrowser
    # The credentials are taken from the last two arguments; without both of
    # them the script name would be used as a credential or indexing would
    # fail, so stop with a usage message instead.
    if len(sys.argv) < 3:
        raise SystemExit('Usage: <script> username password')
    br = jsbrowser(default_timeout=120)
    do_login(br, sys.argv[-2], sys.argv[-1])
    br.show_browser()