# calibre/recipes/smilezilla.recipe

from __future__ import absolute_import, division, print_function, unicode_literals

import os
import re

from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.web.feeds.news import BasicNewsRecipe


class SmileZilla(BasicNewsRecipe):
    """Recipe that gathers the jokes and stories listed on SmileZilla.

    The site has no per-article pages to download: every joke/story lives
    inline on a single index page.  ``parse_index`` therefore cuts each index
    page into fragments, writes every fragment to its own HTML file inside a
    temporary directory and returns ``file:///`` URLs pointing at those files.
    """

    title = 'SmileZilla'
    language = 'en'
    __author__ = "Will"
    JOKES_INDEX = 'http://www.smilezilla.com/joke.do'
    STORIES_INDEX = 'http://www.smilezilla.com/story.do'
    description = 'Daily Jokes and funny stories'
    oldest_article = 1
    no_stylesheets = True
    encoding = 'utf-8'

    remove_tags = [dict(name='table')]

    # An item is whatever sits between an <hr ...> tag and the next "<table".
    # DOTALL lets a single item span several lines.
    _ENTRY_PATTERN = re.compile(r'<hr.*?>(.+?)<table', flags=re.DOTALL)

    def _get_entry(self, soup):
        """Return the ``<form name="contentForm">`` element of *soup*.

        The result is whatever ``soup.find`` yields when nothing matches
        (``None`` for BeautifulSoup) if the page has no such form.
        """
        return soup.find('form', attrs={'name': 'contentForm'})

    def _get_section_title(self, soup):
        """Return the stripped text of the first ``<div class="title">``."""
        title_div = soup.find('div', attrs={'class': 'title'})
        return self.tag_to_string(title_div).strip()

    def _as_soup(self, url):
        """Fetch *url* as soup and make site-relative image URLs absolute.

        Only ``src`` values starting with ``/`` are rewritten; they are
        prefixed with the site's scheme and host so that the images still
        resolve once the markup is saved to a local file.
        """
        soup = self.index_to_soup(url)
        for img in soup.findAll('img', src=True):
            if img['src'].startswith('/'):
                img['src'] = 'http://www.smilezilla.com' + img['src']
        return soup

    def _build_section(self, index_url, label, file_prefix):
        """Turn one index page into a ``(section_title, articles)`` tuple.

        :param index_url: URL of the index page to scrape.
        :param label: word used in article titles, e.g. ``'Joke'`` gives
            ``'Joke 1'``, ``'Joke 2'``, ... (numbering starts at 1).
        :param file_prefix: prefix of the generated file names; item *i*
            (0-based) is written to ``<file_prefix><i>.html`` in ``self.tdir``.
        :return: tuple of the page's section title and a list of
            ``{'title': ..., 'url': ...}`` dicts, one per extracted item.

        ``self.tdir`` must already be set (``parse_index`` does this).
        """
        soup = self._as_soup(index_url)
        entry = self._get_entry(soup)
        section_title = self._get_section_title(soup)

        # No content form on the page means there is nothing to extract.
        # Stringify only after _as_soup has rewritten the image URLs so the
        # saved fragments carry absolute links.
        fragments = [] if entry is None else self._ENTRY_PATTERN.findall(type(u'')(entry))

        section_articles = []
        for i, text in enumerate(fragments):
            path = os.path.join(self.tdir, '{}{}.html'.format(file_prefix, i))
            with open(path, 'wb') as f:
                f.write(b'<html><body>')
                f.write(text.encode('utf-8'))
            section_articles.append({
                'title': '{} {}'.format(label, i + 1),
                'url': 'file:///' + path,
            })
        return section_title, section_articles

    def parse_index(self):
        """Build the recipe index: a jokes section followed by a stories section.

        :return: list of ``(section_title, articles)`` tuples where each
            article is a dict with ``'title'`` and ``'url'`` keys.
        """
        # The generated HTML files are referenced by URL from the returned
        # index, so they are kept in a persistent temporary directory.
        self.tdir = PersistentTemporaryDirectory()

        return [
            self._build_section(self.JOKES_INDEX, 'Joke', 'j'),
            self._build_section(self.STORIES_INDEX, 'Story', 's'),
        ]