# calibre/recipes/smilezilla.recipe


from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ptempfile import PersistentTemporaryFile


class SmileZilla(BasicNewsRecipe):
    """Recipe that turns the SmileZilla joke and story pages into articles.

    Each of the two index pages carries all of its entries inside one
    ``contentForm`` element, separated by ``<hr>`` tags.  ``parse_index``
    lists one article per separator, every article of a section points at the
    same locally cached copy of the page, and ``preprocess_raw_html`` cuts the
    N-th entry out of that copy each time it is called for the section.
    """

    title = 'SmileZilla'
    language = 'en'
    __author__ = "Will"
    JOKES_INDEX = 'http://www.smilezilla.com/joke.do'
    STORIES_INDEX = 'http://www.smilezilla.com/story.do'
    description = 'Daily Jokes and funny stories'
    oldest_article = 1
    keep_only_tags = []
    no_stylesheets = True
    # NOTE(review): preprocess_raw_html picks entries by call order, which
    # presumably is why downloads are limited to one at a time -- confirm.
    simultaneous_downloads = 1
    articles_are_obfuscated = True
    encoding = 'utf-8'

    remove_tags = [dict(name='table')]

    # Both dicts are class attributes and are therefore shared by every
    # instance of the recipe.
    # counter: index URL -> number of entries already handed out by
    # preprocess_raw_html for that section.
    counter = {JOKES_INDEX: 0, STORIES_INDEX: 0}
    # cache: index URL -> path of the temporary file holding the fetched page.
    cache = {}

    def cached_fetch(self, url):
        """Return the parsed page for ``url``, downloading it at most once.

        On the first call the page is fetched, root-relative ``<img>`` sources
        are made absolute, and the ``<html>`` element is written to a
        persistent temporary file whose path is stored in ``self.cache``.
        Later calls for the same URL parse that file instead of hitting the
        network.

        :param url: address of the page to fetch.
        :return: the parsed document.
        """
        cache = self.cache

        if url in cache:
            # The copy was written as bytes in self.encoding; read it back as
            # bytes so decoding does not depend on the platform default.
            with open(cache[url], 'rb') as f:
                html = f.read()
            return BeautifulSoup(html, fromEncoding=self.encoding)

        br = BasicNewsRecipe.get_browser(self)
        response = br.open(url)
        try:
            html = response.read()
        finally:
            response.close()
        soup = BeautifulSoup(html, fromEncoding=self.encoding)
        for img in soup.findAll('img', src=True):
            if img['src'].startswith('/'):
                img['src'] = 'http://www.smilezilla.com' + img['src']
        pt = PersistentTemporaryFile('.html')
        try:
            pt.write(str(soup.html).encode(self.encoding))
        finally:
            pt.close()
        cache[url] = pt.name
        return soup

    def _get_entry(self, soup):
        """Return the ``contentForm`` form element of ``soup`` (or ``None``)."""
        return soup.find('form', attrs={'name': 'contentForm'})

    def _get_section_title(self, soup):
        """Return the stripped text of the first ``div`` with class ``title``."""
        title_div = soup.find('div', attrs={'class': 'title'})
        return self.tag_to_string(title_div).strip()

    def _get_separators(self, entry, index_url):
        """Return the ``<hr>`` tags inside ``entry``; empty if it is missing."""
        if entry is None:
            self.log.warn('No contentForm element found on ' + index_url)
            return []
        return entry.findAll('hr')

    def _story_title(self, number, hr):
        """Build the listing title for the story that ends at ``hr``.

        Walks backwards from ``hr`` and appends the text of the nearest
        ``<b>`` element, stopping at the previous ``<hr>`` or at the start of
        the document if no ``<b>`` is found first.
        """
        title = 'Story ' + str(number)
        current = hr
        while True:
            current = current.findPrevious()
            if current is None or current.name == 'hr':
                break
            if current.name == 'b':
                title = title + ': ' + self.tag_to_string(current)
                break
        return title

    def parse_index(self):
        """Build the article listing for the jokes and the stories sections.

        :return: a list of ``(section_title, articles)`` tuples, jokes first,
            where every article is a dict with ``title``, ``url``,
            ``description`` and ``date`` keys.  All articles of a section
            share the section's index URL.
        """
        articles = []

        soup = self.cached_fetch(self.JOKES_INDEX)
        separators = self._get_separators(
            self._get_entry(soup), self.JOKES_INDEX)
        section_title = self._get_section_title(soup)
        todays_jokes = [
            {'title': 'Joke ' + str(number), 'url': self.JOKES_INDEX,
             'description': '', 'date': ''}
            for number, _hr in enumerate(separators, 1)
        ]
        articles.append((section_title, todays_jokes))

        soup = self.cached_fetch(self.STORIES_INDEX)
        separators = self._get_separators(
            self._get_entry(soup), self.STORIES_INDEX)
        section_title = self._get_section_title(soup)
        todays_stories = [
            {'title': self._story_title(number, hr),
             'url': self.STORIES_INDEX,
             'description': '', 'date': ''}
            for number, hr in enumerate(separators, 1)
        ]
        articles.append((section_title, todays_stories))

        return articles

    def get_obfuscated_article(self, url):
        """Return the path of the cached local copy of index page ``url``."""
        return self.cache[url]

    def preprocess_raw_html(self, raw_html, url):
        """Reduce a cached index page to the next not-yet-used entry.

        The section is recognised by checking whether the jokes cache file
        path occurs in ``url``; anything else is treated as the stories page.
        The per-section counter is advanced on every call and selects which
        ``<hr>``-delimited entry is returned.

        :param raw_html: markup of the cached index page.
        :param url: location the markup was loaded from.
        :return: a minimal HTML document containing only the selected entry,
            or ``raw_html`` unchanged when the entry container is missing.
        """
        import re

        index_url = self.JOKES_INDEX if (
            self.cache[self.JOKES_INDEX] in url) else self.STORIES_INDEX
        count = self.counter[index_url] + 1
        self.counter[index_url] = count
        soup = self.index_to_soup(raw_html)
        entry = self._get_entry(soup)
        if entry is None:
            return raw_html

        markup = str(entry)
        # Split on any spelling of the tag (<hr>, <hr/>, <hr />, with or
        # without attributes) so the pieces line up with the <hr> elements
        # counted in parse_index however the parser serialises them.
        entries = re.split(r'<hr\b[^>]*>', markup)
        if count <= len(entries):
            fragment = entries[count - 1]
        else:
            # More calls than entries: show the whole section rather than
            # failing the article with an IndexError.
            self.log.warn('No entry number ' + str(count) + ' on ' + index_url)
            fragment = markup

        # Assemble the page as text so the fragment stays markup.
        # NOTE(review): inserting it into a soup as a plain string makes it a
        # text node, which newer BeautifulSoup releases appear to escape on
        # output -- confirm against the version calibre bundles.
        return '<html><head></head><body>' + fragment + '</body></html>'