diff --git a/recipes/smilezilla.recipe b/recipes/smilezilla.recipe
index 18678bc770..f483715515 100644
--- a/recipes/smilezilla.recipe
+++ b/recipes/smilezilla.recipe
@@ -1,7 +1,10 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import os
+import re
+
+from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ptempfile import PersistentTemporaryFile
 
 
 class SmileZilla(BasicNewsRecipe):
@@ -13,41 +16,11 @@ class SmileZilla(BasicNewsRecipe):
     STORIES_INDEX = 'http://www.smilezilla.com/story.do'
     description = 'Daily Jokes and funny stoires'
     oldest_article = 1
-    remove_tags = [
-    ]
-    keep_only_tags = []
     no_stylesheets = True
-    simultaneous_downloads = 1
-    articles_are_obfuscated = True
     encoding = 'utf-8'
     remove_tags = [dict(name='table')]
-    counter = {JOKES_INDEX: 0, STORIES_INDEX: 0}
-    cache = {}
-
-    def cached_fetch(self, url):
-        cache = self.cache
-
-        if url in cache:
-            f = open(cache[url])
-            html = f.read()
-            f.close()
-            return BeautifulSoup(html, fromEncoding=self.encoding)
-
-        br = BasicNewsRecipe.get_browser(self)
-        response = br.open(url)
-        html = response.read()
-        soup = BeautifulSoup(html, fromEncoding=self.encoding)
-        for img in soup.findAll('img', src=True):
-            if img['src'].startswith('/'):
-                img['src'] = 'http://www.smilezilla.com' + img['src']
-        pt = PersistentTemporaryFile('.html')
-        pt.write(type(u'')(soup.html).encode(self.encoding))
-        pt.close()
-        cache[url] = pt.name
-        return soup
-
     def _get_entry(self, soup):
         return soup.find('form', attrs={'name': 'contentForm'})
 
@@ -56,56 +29,40 @@ class SmileZilla(BasicNewsRecipe):
         return self.tag_to_string(title_div).strip()
 
     def parse_index(self):
+        self.tdir = PersistentTemporaryDirectory()
+
+        def as_soup(url):
+            soup = self.index_to_soup(url)
+            for img in soup.findAll('img', src=True):
+                if img['src'].startswith('/'):
+                    img['src'] = 'http://www.smilezilla.com' + img['src']
+            return soup
+
         articles = []
-        soup = self.cached_fetch(self.JOKES_INDEX)
+        soup = as_soup(self.JOKES_INDEX)
         jokes_entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
        todays_jokes = []
-        for hr in enumerate(jokes_entry.findAll('hr')):
-            title = 'Joke ' + type(u'')(hr[0] + 1)
-            url = self.JOKES_INDEX
-            todays_jokes.append({'title': title, 'url': url,
-                                 'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr/>(.+?)(?=<hr/>|</form>)', type(u'')(jokes_entry), re.DOTALL)):
+            title = 'Joke ' + type(u'')(i + 1)
+            with open(os.path.join(self.tdir, 'joke-%d.html' % i), 'wb') as f:
+                f.write(text.encode('utf-8'))
+            todays_jokes.append({'title': title, 'url': 'file:///' + f.name})
         articles.append((section_title, todays_jokes))
 
-        soup = self.cached_fetch(self.STORIES_INDEX)
+        soup = as_soup(self.STORIES_INDEX)
         entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
         todays_stories = []
-        for hr in enumerate(entry.findAll('hr')):
-            title = 'Story ' + type(u'')(hr[0] + 1)
-            current = hr[1]
-            while True:
-                current = current.findPrevious()
-                if current is None:
-                    break
-                elif current.name == 'hr':
-                    break
-                elif current.name == 'b':
-                    title = title + ': ' + self.tag_to_string(current)
-                    break
-            url = self.STORIES_INDEX
-            todays_stories.append({'title': title, 'url': url,
-                                   'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr/>(.+?)(?=<hr/>|</form>)', type(u'')(entry), re.DOTALL)):
+            title = 'Story ' + type(u'')(i + 1)
+            with open(os.path.join(self.tdir, 'story-%d.html' % i), 'wb') as f:
+                f.write(text.encode('utf-8'))
+            todays_stories.append({'title': title, 'url': 'file:///' + f.name})
         articles.append((section_title, todays_stories))
 
         return articles
-
-    def get_obfuscated_article(self, url):
-        return self.cache[url]
-
-    def preprocess_raw_html(self, raw_html, url):
-        url = self.JOKES_INDEX if (
-            self.cache[self.JOKES_INDEX] in url) else self.STORIES_INDEX
-        count = self.counter[url] + 1
-        self.counter[url] = count
-        soup = self.index_to_soup(raw_html)
-        entry = self._get_entry(soup)
-        soup2 = BeautifulSoup('<html><head></head><body></body></html>')
-        body = soup2.find('body')
-        entries = type(u'')(entry).split('<hr/>')
-        body.insert(0, entries[count - 1])
-
-        return type(u'')(soup2)
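For reference, a minimal standalone sketch of the pattern this patch moves the recipe to: extract the fragments between the page's <hr/> separators, write each fragment to its own temporary file, and return file:/// URLs from parse_index, so the old obfuscated-article and caching machinery is no longer needed. The sample markup, regex, and file names here are illustrative assumptions, not the site's exact strings:

import os
import re
import tempfile

# Hypothetical stand-in for smilezilla.com's contentForm markup.
html = '<form name="contentForm"><hr/>First joke<hr/>Second joke<hr/></form>'

tdir = tempfile.mkdtemp()
articles = []
# Pull the text between <hr/> separators, write each fragment to its own
# file, and hand back a local file:/// URL per article entry.
for i, text in enumerate(re.findall(r'<hr/>(.+?)(?=<hr/>|</form>)', html, re.DOTALL)):
    path = os.path.join(tdir, 'joke-%d.html' % i)
    with open(path, 'wb') as f:
        f.write(('<html><body>%s</body></html>' % text).encode('utf-8'))
    articles.append({'title': 'Joke %d' % (i + 1), 'url': 'file:///' + path})

print(articles)

Because the lookahead (?=...) consumes nothing, consecutive fragments that share an <hr/> boundary are both captured; re.DOTALL lets a joke or story span multiple lines of markup.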