#!/usr/bin/env python2
# vim:fileencoding=utf-8

import os
import tempfile

from calibre.web.feeds.recipes import BasicNewsRecipe


class Dilbert(BasicNewsRecipe):
    title = u'Dilbert'
    __author__ = 'TechnoCat'
    description = 'Dilbert, by Scott Adams. Includes the last three or so comics and blog entries.'
    cover_url = 'http://dilbert.com/assets/dilbert-logo-4152bd0c31f7de7443b4bc90abd818da.png'
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    recursions = 0
    max_articles_per_feed = 20
    debugMessages = True
    BASE_URL = 'http://dilbert.com'  # Note: no www.
    COMIC_DIV_TAG = 'img-comic-container'
    BLOG_DIV_TAG = 'media'

    tempfiles = []

    # Wraps the comic's <img> tag in a minimal HTML page, writes it to a
    # temp file and returns the file's path, so the comic can be handed to
    # calibre as a local article.
    def writeImage(self, title, imageTag):
        tempFile = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
        self.tempfiles.append(tempFile)
        # The alt text may be unicode, so encode before writing to the
        # binary temp file.
        tempFile.write(('<html><head><title>' + title +
                        '</title></head><body>').encode('utf-8'))
        tempFile.write(imageTag.prettify())
        tempFile.write('</body></html>')
        tempFile.flush()
        tempFile.close()
        return tempFile.name

    def cleanUpTempFiles(self):
        for tempFile in self.tempfiles:
            tempFile.close()
            os.unlink(tempFile.name)

    def cleanup(self):
        self.cleanUpTempFiles()

    # Extract comic links from the soup.
    # Returns a list of comics (articles) as:
    # {
    #  'title'       : article title,
    #  'url'         : URL of print version,
    #  'date'        : the publication date of the article as a string,
    #  'description' : a summary of the article,
    #  'content'     : the full article (can be an empty string); used by FullContentProfile
    # }
    def comicFeed(self, soup):
        feedset = []
        for comicContainer in soup.findAll('div', {'class': self.COMIC_DIV_TAG}):
            comic = comicContainer.find('img')
            if comic is not None:
                filelink = self.writeImage(comic['alt'], comic)
                feedset.append(
                    dict(title=comic['alt'], url='file://' + filelink,
                         description=comic['alt'], content=''))
        return feedset

    def blogFeed(self, soup):
        feedset = []
        for blogContainer in soup.findAll('div', {'class': self.BLOG_DIV_TAG}):
            blog = blogContainer.find('a', {'class': 'link-blended'})
            if blog is not None:
                feedset.append(
                    dict(title=blog['title'], url=blog['href'],
                         description=blog['title'], content=''))
        return feedset

    def parse_index(self):
        root = self.index_to_soup(self.BASE_URL)
        comics = self.comicFeed(root)
        blogs = self.blogFeed(root)
        return [('Comics', comics), ('Blog Entries', blogs)]
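
# A quick way to smoke-test this recipe outside the calibre GUI (assuming
# calibre's command-line tools are on PATH) is the standard recipe test run,
# which fetches only a couple of articles per feed:
#
#   ebook-convert dilbert.recipe dilbert.epub --test
#
# If the site's markup changes, COMIC_DIV_TAG and BLOG_DIV_TAG above are the
# first things to re-check against the live page, since both comicFeed() and
# blogFeed() key off those CSS class names.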