#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal'

'''
time.com
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html


class Time(BasicNewsRecipe):
    # recipe_disabled = ('This recipe has been disabled as TIME no longer'
    #                    ' publishes complete articles on the web.')
    title = u'Time'
    __author__ = 'Kovid Goyal'
    description = 'Weekly US magazine.'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'en'
    remove_javascript = True
    # needs_subscription = 'optional'

    keep_only_tags = [
        {'class': ['artHd', 'articleContent', 'entry-title', 'entry-meta',
                   'entry-content', 'thumbnail']},
    ]
    remove_tags = [
        {'class': ['content-tools', 'quigo', 'see', 'first-tier-social-tools',
                   'navigation', 'enlarge lightbox']},
        {'id': ['share-tools']},
        {'rel': 'lightbox'},
    ]

    # Follow "page 2" style links up to 10 levels deep so multi-page
    # articles are downloaded in full.
    recursions = 10
    match_regexps = [
        r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',
        r'http://www.time.com/time/specials/packages/article/.*',
    ]

    # NOTE: the original pattern here was lost (stripped as an HTML tag);
    # r'<meta .+/>' is a best-guess reconstruction that removes stray
    # self-closing <meta> tags from the raw HTML before parsing.
    preprocess_regexps = [(re.compile(r'<meta .+/>'), lambda m: '')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if False and self.username and self.password:
            # Login is disabled: this site uses javascript in its login
            # process, which mechanize cannot drive.
            res = br.open('http://www.time.com/time/magazine')
            br.select_form(nr=1)
            br['username'] = self.username
            br['password'] = self.password
            res = br.submit()
            raw = res.read()
            if '>Log Out<' not in raw:
                raise ValueError('Failed to login to time.com, check'
                                 ' your username and password')
        return br

    def parse_index(self):
        # Fetch the magazine index page and parse it with lxml, whose XPath
        # support makes the section/article queries below straightforward.
        raw = self.index_to_soup('http://www.time.com/time/magazine',
                                 raw=True)
        root = html.fromstring(raw)

        # Try to locate the full-size cover image.
        img = root.xpath('//a[.="View Large Cover" and @href]')
        if img:
            cover_url = 'http://www.time.com' + img[0].get('href')
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except Exception:
                self.log.exception('Failed to fetch cover')

        feeds = []
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            if h3:
                section = html.tostring(h3[0], encoding=unicode,
                                        method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        return feeds

    def find_articles(self, sec):
        # Yield one article dict per <article> element in the section.
        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2:
                continue
            a = h2[0].xpath('./a[@href]')
            if not a:
                continue
            title = html.tostring(a[0], encoding=unicode,
                                  method='text').strip()
            if not title:
                continue
            url = a[0].get('href')
            if url.startswith('/'):
                url = 'http://www.time.com' + url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding=unicode,
                                     method='text')
            self.log('\t', title, ':\n\t\t', desc)
            yield {
                'title': title,
                'url': url,
                'date': '',
                'description': desc,
            }

    def postprocess_html(self, soup, first):
        # Remove pagination controls left over in multi-page articles.
        for tag in soup.findAll(attrs={'class': ['artPag', 'pagination']}):
            tag.extract()
        return soup
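
# A quick way to exercise this recipe outside the calibre GUI (a sketch,
# assuming a working calibre install; 'time.recipe' is just this file saved
# under that name and 'time.epub' is an arbitrary output path):
#
#   ebook-convert time.recipe time.epub --test -vv
#
# --test limits the download to a couple of articles per feed, and -vv makes
# the self.log() output from parse_index()/find_articles() above visible,
# which is handy for checking that the section and article scraping still
# matches the site's markup.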