#!/usr/bin/env python
# vim:fileencoding=utf-8

import os
import tempfile

from calibre.web.feeds.recipes import BasicNewsRecipe


class Dilbert(BasicNewsRecipe):
    title = u'Dilbert'
    __author__ = 'TechnoCat'
    description = 'Dilbert, by Scott Adams. Includes the last three or so comics and blog entries.'
    cover_url = 'http://dilbert.com/assets/dilbert-logo-4152bd0c31f7de7443b4bc90abd818da.png'
    auto_cleanup = True
    encoding = 'utf8'
    language = 'en'
    needs_subscription = False
    no_stylesheets = True
    oldest_article = 7
    remove_javascript = True
    recursions = 0
    max_articles_per_feed = 20
    debugMessages = True
    BASE_URL = 'http://dilbert.com'  # Note: no www.
    COMIC_DIV_TAG = 'img-comic-container'
    BLOG_DIV_TAG = 'media'
    tempfiles = []

    # Creates a temp file containing a minimal HTML page that wraps the
    # comic <img> tag, so calibre can fetch it through a file:// URL.
    def writeImage(self, title, imageURL):
        # Open in text mode so str can be written under Python 3.
        tempFile = tempfile.NamedTemporaryFile(mode='w', delete=False)
        self.tempfiles.append(tempFile)
        tempFile.write('<html><head><title>' + title + '</title></head><body>')
        tempFile.write(imageURL.prettify())
        tempFile.write('</body></html>')
        tempFile.flush()
        tempFile.close()
        return tempFile.name

    # Close and remove every temporary file created by writeImage.
    def cleanUpTempFiles(self):
        for tempFile in self.tempfiles:
            tempFile.close()
            os.unlink(tempFile.name)

    # Called by calibre after the download has finished.
    def cleanup(self):
        self.cleanUpTempFiles()

    # Extract comic links from the soup.
    # Returns a list of comics (articles) as:
    # {
    #   'title': article title,
    #   'url': URL of print version,
    #   'date': the publication date of the article as a string,
    #   'description': a summary of the article,
    #   'content': the full article (can be an empty string). This is used by FullContentProfile.
    # }
    def comicFeed(self, soup):
        feedset = []
        for comicContainer in soup.findAll('div', {'class': self.COMIC_DIV_TAG}):
            comic = comicContainer.find('img')
            if comic is not None:
                filelink = self.writeImage(comic['alt'], comic)
                feedset.append(
                    dict(title=comic['alt'], url='file://' + filelink, description=comic['alt'], content=''))
        return feedset

    # Extract blog entry links from the soup, using the same article dict
    # format as comicFeed.
    def blogFeed(self, soup):
        feedset = []
        for blogContainer in soup.findAll('div', {'class': self.BLOG_DIV_TAG}):
            blog = blogContainer.find('a', {'class': 'link-blended'})
            if blog is not None:
                feedset.append(
                    dict(title=blog['title'], url=blog['href'], description=blog['title'], content=''))
        return feedset

    # parse_index is calibre's entry point for recipes that scrape a site
    # instead of parsing RSS feeds; it returns (section title, article list) pairs.
    def parse_index(self):
        root = self.index_to_soup(self.BASE_URL)
        comics = self.comicFeed(root)
        blogs = self.blogFeed(root)
        return [('Comics', comics), ('Blog Entries', blogs)]
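
# A quick way to test a recipe like this outside the calibre GUI is calibre's
# ebook-convert tool; the --test flag limits the number of feeds and articles
# fetched. The filename dilbert.recipe below is just an example, so adjust it
# to wherever this file is saved:
#
#   ebook-convert dilbert.recipe dilbert.epub --test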