#!/usr/bin/env python
"""
calibre recipe for Slate Star Codex.

The recipe builds a single feed from the links found on the blog's archive
page (see ``SlateStarCodex.get_archived_posts``).
"""
import re

from calibre.web.feeds.news import BasicNewsRecipe

__license__ = 'GPL v3'
__copyright__ = '2015, Ned Letcher '


class SlateStarCodex(BasicNewsRecipe):
    """calibre recipe that fetches posts linked from the archive page."""

    title = u'Slate Star Codex'
    description = 'IN A MAD WORLD, ALL BLOGGING IS PSYCHIATRY BLOGGING'
    __author__ = 'Ned Letcher'
    max_articles_per_feed = 20
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True

    # alternative candidate for keep_only_tags:
    # [{'attrs':{'class':['pjgm-posttitle', 'pjgm-postmeta', 'pjgm-postcontent']}}]
    # Match <div> elements whose class attribute contains the word "post".
    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile(r'\bpost\b')})]

    # Match <div> elements whose class attribute contains "sharedaddy".
    remove_tags = [
        dict(name='div', attrs={'class': re.compile(r'\bsharedaddy\b')})]

    def get_archived_posts(self):
        """Scrape the archive page and return one article dict per post.

        Every element with class ``sya_postcontent`` is inspected and its
        first ``<a>`` is taken as the link to the post.

        Returns a list of dicts with the keys:

        * ``title`` -- text of the link
        * ``url``   -- the link's ``href``
        * ``date``  -- the three URL path segments preceding the last one,
          joined with ``-`` (presumably year-month-day for this blog's
          permalink scheme; verify against the live site)
        """
        soup = self.index_to_soup('http://slatestarcodex.com/archives/')
        entries = soup.findAll(attrs={'class': 'sya_postcontent'})

        posts = []
        for entry in entries:
            atag = entry.find('a')
            # An entry without a usable link cannot become an article; skip
            # it rather than aborting the whole download with a TypeError.
            if atag is None or not atag.get('href'):
                continue
            url = atag['href']
            posts.append({
                # tag_to_string copes with empty anchors and with link text
                # wrapped in nested markup, where contents[0] would raise
                # IndexError or yield a Tag instead of a string.
                'title': self.tag_to_string(atag),
                'url': url,
                'date': "-".join(url.strip('/').split('/')[-4:-1]),
            })
        return posts

    def parse_index(self):
        """Return the recipe index: a single feed holding the archive posts.

        The result is a one-element list whose item pairs the recipe title
        with the list produced by ``get_archived_posts``.
        """
        posts = self.get_archived_posts()
        return [[self.title, posts]]