#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal '

'''
time.com
'''

from calibre.web.feeds.jsnews import JavascriptRecipe
from lxml import html


def wait_for_load(browser):
    """Block until the page's login bar element has appeared.

    Calls ``browser.wait_for_element`` with a timeout of 180 (presumably
    seconds -- confirm against the jsbrowser API). Whatever that call raises
    on expiry propagates to the caller.
    """
    # This element is present in the black login bar at the top
    browser.wait_for_element('#site-header p.constrain', timeout=180)


# Keep the login method as standalone, so it can be easily tested
def do_login(browser, username, password):
    """Log in to time.com through the magazine sign-up form.

    Visits the magazine page, fills the ``#magazine-signup`` form with the
    given credentials, submits it and then waits for the page to load.

    Raises:
        ValueError: if waiting for the page to load times out.
    """
    # Imported here rather than at module level, as in the original recipe.
    from calibre.web.jsbrowser.browser import Timeout
    browser.visit('http://www.time.com/time/magazine')
    form = browser.select_form('#magazine-signup')
    form['username'] = username
    form['password'] = password
    browser.submit('#paid-wall-submit')
    try:
        wait_for_load(browser)
    except Timeout:
        raise ValueError('Failed to login to time.com, check your username and password and try again in a little while.')


class Time(JavascriptRecipe):
    """Recipe that downloads the current issue of Time from time.com.

    A subscription login is required (``needs_subscription`` is True).
    """

    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    language = 'en'
    needs_subscription = True
    requires_version = (0, 9, 35)

    no_stylesheets = True
    remove_javascript = True
    # Selectors for the article container to keep and the page furniture to
    # drop (presumably interpreted as CSS selectors by JavascriptRecipe).
    keep_only_tags = ['article.post']
    remove_tags = ['meta', '.entry-sharing', '.entry-footer', '.wp-paginate',
                   '.post-rail', '.entry-comments', '.entry-tools',
                   '#paid-wall-cm-ad']

    # Follow the pagination links of multi-page articles; the follow-up pages
    # are the ones postprocess_html() sees with recursion_level > 0.
    recursions = 1
    links_from_selectors = ['.wp-paginate a.page[href]']

    extra_css = '.entry-date { padding-left: 2ex }'

    def do_login(self, browser, username, password):
        """Delegate to the module-level do_login() helper."""
        do_login(browser, username, password)

    def get_publication_data(self, browser):
        """Collect the cover image and the article index of the current issue.

        Returns a dict with two keys:
          'cover': the result of ``browser.get_resource`` for the cover URL
          'index': a list of ``(section title, [article dict, ...])`` tuples,
                   one per section that has a heading and at least one article

        Also sets ``self.timefmt`` when the page carries an issue date.
        """
        selector = 'section.sec-mag-showcase ul.ul-mag-showcase img[src]'
        cover = browser.css_select(selector)
        # URL for large cover: read the image src and swap the _400 size
        # marker for _600
        cover_url = unicode(cover.evaluateJavaScript('this.src').toString()).replace('_400.', '_600.')
        raw = browser.html
        ans = {'cover': browser.get_resource(cover_url)}
        # We are already at the magazine page thanks to the do_login() method
        root = html.fromstring(raw)

        dates = ''.join(root.xpath('//time[@class="updated"]/text()'))
        if dates:
            self.timefmt = ' [%s]' % dates

        feeds = []
        # The [0] raises IndexError if the page has no such container
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            # Sections without a heading are skipped entirely: the heading
            # text is the only source of the section title.
            if h3:
                section = html.tostring(h3[0], encoding=unicode, method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        ans['index'] = feeds
        return ans

    def find_articles(self, sec):
        """Yield one dict per usable article directly under section ``sec``.

        An article is skipped when it has no ``entry-title`` child, no link
        with an href inside that title, or an empty title text. Each yielded
        dict has the keys 'title', 'url', 'date' (always empty) and
        'description'.
        """
        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2:
                continue
            a = h2[0].xpath('./a[@href]')
            if not a:
                continue
            title = html.tostring(a[0], encoding=unicode, method='text').strip()
            if not title:
                continue
            url = a[0].get('href')
            # Site-relative links are made absolute
            if url.startswith('/'):
                url = 'http://www.time.com' + url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding=unicode, method='text')
            self.log('\t', title, ':\n\t\t', url)
            yield {
                'title': title,
                'url': url,
                'date': '',
                'description': desc,
            }

    def load_complete(self, browser, url, recursion_level):
        """Wait for the login bar element before reporting the page loaded."""
        # This is needed as without it, subscriber content is blank. time.com
        # appears to be using some crazy iframe+js callback for loading content
        wait_for_load(browser)
        return True

    def postprocess_html(self, article, root, url, recursion_level):
        """Clean up a downloaded article page and return its root element.

        On follow-up pages (``recursion_level > 0``) the repeated header,
        "page n of m" markers and figures are removed. On the first page each
        figure is replaced in place by the images it contains.
        """
        # Remove the header and page n of m messages from pages after the first
        # page
        if recursion_level > 0:
            for h in root.xpath('//header[@class="entry-header"]|//span[@class="page"]'):
                h.getparent().remove(h)
        # Unfloat the article images and also remove them from pages after the
        # first page as they are repeated on every page.
        for fig in root.xpath('//figure'):
            parent = fig.getparent()
            if recursion_level > 0:
                parent.remove(fig)
            else:
                idx = parent.index(fig)
                # Inserting at the same index in reverse order keeps the
                # images in their original order ahead of the figure.
                for img in reversed(fig.xpath('descendant::img')):
                    parent.insert(idx, img)
                # Drop the now image-less figure once, after all images have
                # been hoisted out of it.
                parent.remove(fig)
        return root


if __name__ == '__main__':
    # Test the login
    import sys
    from calibre import jsbrowser

    # The credentials are read from the last two arguments; without this
    # check a missing argument would either raise IndexError or silently use
    # the script path as the username.
    if len(sys.argv) < 3:
        raise SystemExit('Usage: %s username password' % sys.argv[0])
    br = jsbrowser(default_timeout=120)
    do_login(br, sys.argv[-2], sys.argv[-1])
    br.show_browser()