#!/usr/bin/env python

__copyright__ = '2008, Kovid Goyal '
__license__ = 'GPL v3'

'''
calibre recipe for slate.com
'''

import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class Slate(BasicNewsRecipe):
    '''
    Recipe that builds its index from the section pages of slate.com.

    Each section page is fetched, the links inside its ``most_read`` block
    are collected, and every article is downloaded through its
    ``.single.html`` variant (see :meth:`print_version`).
    '''

    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'Kovid Goyal'
    timefmt = ''
    no_stylesheets = True
    language = 'en'
    title = 'Slate'
    # Site root; relative section and article links are appended to it.
    INDEX = 'http://slate.com'
    encoding = 'utf-8'

    # NOTE(review): both patterns look truncated (one is empty, the other
    # starts in the middle of what appears to be a character class), as if
    # markup had been stripped from this file at some point. They are kept
    # unchanged here -- confirm the intended patterns against upstream.
    preprocess_regexps = [
        (re.compile(r'', re.DOTALL), lambda x: ''),
        (re.compile(r'^.*?]+?/>', re.DOTALL), lambda x:''),
    ]

    remove_tags = [
        {'name':['link', 'script']},
        {'class':['share-box-flank', 'sl-crumbs', 'sl-tbar', 'sl-chunky-tbar']},
    ]
    remove_tags_after = [{'class':'sl-art-creds-cntr'}]
    # A list of tag specs, in the same shape as remove_tags and
    # remove_tags_after above.
    keep_only_tags = [{'class':'sl-body-wrapper'}]
    remove_attributes = ['style']

    def print_version(self, url):
        '''
        Return the URL to download for the article at `url`.

        Every occurrence of ``.html`` in `url` is replaced with
        ``.single.html``.
        '''
        return url.replace('.html', '.single.html')

    def parse_index(self):
        '''
        Build the list of sections and their articles.

        :return: A list of ``(section title, articles)`` tuples, where
                 ``articles`` is the list produced by
                 :meth:`slate_section_articles`. Sections for which no
                 articles were found are left out.
        '''
        sections = (
            ('News & Politics', '/articles/news_and_politics.html'),
            ('Technology', '/articles/technology.html'),
            ('Business', '/articles/business.html'),
            ('Arts', '/articles/arts.html'),
            ('Life', '/articles/life.html'),
            ('Health & Science', '/articles/health_and_science.html'),
            ('Sports', '/articles/sports.html'),
            ('Double X', '/articles/double_x.html'),
        )
        ans = []
        for sectitle, path in sections:
            url = self.INDEX + path
            self.log('Found section:', sectitle)
            articles = self.slate_section_articles(self.index_to_soup(url))
            if articles:
                ans.append((sectitle, articles))
        return ans

    def slate_section_articles(self, soup):
        '''
        Collect the articles linked from the ``most_read`` block of a page.

        :param soup: Parsed section page.
        :return: A list of dicts with the keys ``title``, ``description``,
                 ``date`` (always empty) and ``url``, plus ``author`` when
                 an author link with non-empty text is present. Returns an
                 empty list when the page has no ``most_read`` block.
        '''
        cont = soup.find('div', id='most_read')
        if cont is None:
            # Without this guard a page lacking the block would raise
            # AttributeError below and abort the whole index.
            self.log('\tNo most_read block found, skipping section')
            return []

        seen = set()
        ans = []
        for h4 in cont.findAll('h4'):
            link = h4.find('a', href=True)
            if link is None:
                continue
            url = link['href']
            if url.startswith('/'):
                url = self.INDEX + url
            # The same article can be linked more than once; keep the first.
            if url in seen:
                continue
            seen.add(url)

            title = self.tag_to_string(link)
            parent = h4.parent

            # The description is taken from an <h3> under the same parent
            # as the headline, when there is one.
            h3 = parent.find('h3')
            desc = ''
            if h3 is not None:
                desc = self.tag_to_string(h3)

            author = parent.find('a', rel='author')
            if author is not None:
                author = self.tag_to_string(author)

            art = {'title':title, 'description':desc, 'date':'', 'url':url}
            if author:
                art['author'] = author
            self.log('\tFound article:', title, ' by ', author)
            ans.append(art)
        return ans

    def get_masthead_url(self):
        '''
        Return the masthead image URL, or None if it cannot be opened.
        '''
        masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
        # Go through the instance: this works whether get_browser is an
        # instance method or a classmethod, whereas calling it on the class
        # with no arguments fails when it is an instance method.
        br = self.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Best effort only: a missing masthead must not fail the
            # download, but KeyboardInterrupt/SystemExit still propagate.
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead