Update Smilezilla

Fixes #1819960 [recipe broken Smilezilla](https://bugs.launchpad.net/calibre/+bug/1819960)
Kovid Goyal 2019-09-20 10:38:58 +05:30
parent 8c54867bbd
commit 46f32de20b


```diff
@@ -1,7 +1,10 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import os
+import re
+
+from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ptempfile import PersistentTemporaryFile
 
 
 class SmileZilla(BasicNewsRecipe):
@@ -13,41 +16,11 @@ class SmileZilla(BasicNewsRecipe):
     STORIES_INDEX = 'http://www.smilezilla.com/story.do'
     description = 'Daily Jokes and funny stoires'
     oldest_article = 1
-    remove_tags = [
-    ]
-    keep_only_tags = []
     no_stylesheets = True
-    simultaneous_downloads = 1
-    articles_are_obfuscated = True
     encoding = 'utf-8'
+    remove_tags = [dict(name='table')]
-    counter = {JOKES_INDEX: 0, STORIES_INDEX: 0}
-    cache = {}
-
-    def cached_fetch(self, url):
-        cache = self.cache
-        if url in cache:
-            f = open(cache[url])
-            html = f.read()
-            f.close()
-            return BeautifulSoup(html, fromEncoding=self.encoding)
-        br = BasicNewsRecipe.get_browser(self)
-        response = br.open(url)
-        html = response.read()
-        soup = BeautifulSoup(html, fromEncoding=self.encoding)
-        for img in soup.findAll('img', src=True):
-            if img['src'].startswith('/'):
-                img['src'] = 'http://www.smilezilla.com' + img['src']
-        pt = PersistentTemporaryFile('.html')
-        pt.write(type(u'')(soup.html).encode(self.encoding))
-        pt.close()
-        cache[url] = pt.name
-        return soup
 
     def _get_entry(self, soup):
         return soup.find('form', attrs={'name': 'contentForm'})
@@ -56,56 +29,40 @@ class SmileZilla(BasicNewsRecipe):
         return self.tag_to_string(title_div).strip()
 
     def parse_index(self):
+        self.tdir = PersistentTemporaryDirectory()
+
+        def as_soup(url):
+            soup = self.index_to_soup(url)
+            for img in soup.findAll('img', src=True):
+                if img['src'].startswith('/'):
+                    img['src'] = 'http://www.smilezilla.com' + img['src']
+            return soup
+
         articles = []
-        soup = self.cached_fetch(self.JOKES_INDEX)
+        soup = as_soup(self.JOKES_INDEX)
         jokes_entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
         todays_jokes = []
-        for hr in enumerate(jokes_entry.findAll('hr')):
-            title = 'Joke ' + type(u'')(hr[0] + 1)
-            url = self.JOKES_INDEX
-            todays_jokes.append({'title': title, 'url': url,
-                                 'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', type(u'')(jokes_entry), flags=re.DOTALL)):
+            title = 'Joke {}'.format(i + 1)
+            with open(os.path.join(self.tdir, 'j{}.html'.format(i)), 'wb') as f:
+                f.write(b'<html><body>')
+                f.write(text.encode('utf-8'))
+            todays_jokes.append({'title': title, 'url': 'file:///' + f.name})
         articles.append((section_title, todays_jokes))
 
-        soup = self.cached_fetch(self.STORIES_INDEX)
+        soup = as_soup(self.STORIES_INDEX)
         entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
         todays_stories = []
-        for hr in enumerate(entry.findAll('hr')):
-            title = 'Story ' + type(u'')(hr[0] + 1)
-            current = hr[1]
-            while True:
-                current = current.findPrevious()
-                if current is None:
-                    break
-                elif current.name == 'hr':
-                    break
-                elif current.name == 'b':
-                    title = title + ': ' + self.tag_to_string(current)
-                    break
-            url = self.STORIES_INDEX
-            todays_stories.append({'title': title, 'url': url,
-                                   'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', type(u'')(entry), flags=re.DOTALL)):
+            title = 'Story {}'.format(i)
+            with open(os.path.join(self.tdir, 's{}.html'.format(i)), 'wb') as f:
+                f.write(b'<html><body>')
+                f.write(text.encode('utf-8'))
+            todays_stories.append({'title': title, 'url': 'file:///' + f.name})
        articles.append((section_title, todays_stories))
 
         return articles
-
-    def get_obfuscated_article(self, url):
-        return self.cache[url]
-
-    def preprocess_raw_html(self, raw_html, url):
-        url = self.JOKES_INDEX if (
-            self.cache[self.JOKES_INDEX] in url) else self.STORIES_INDEX
-        count = self.counter[url] + 1
-        self.counter[url] = count
-        soup = self.index_to_soup(raw_html)
-        entry = self._get_entry(soup)
-        soup2 = BeautifulSoup('<html><head></head><body></body></html>')
-        body = soup2.find('body')
-        entries = type(u'')(entry).split('<hr />')
-        body.insert(0, entries[count - 1])
-        return type(u'')(soup2)
```
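
The rewrite drops the `articles_are_obfuscated`/`cached_fetch`/`preprocess_raw_html` machinery and instead splits each index page into per-item HTML files up front, inside `parse_index`. A minimal standalone sketch of that splitting step, using hypothetical sample markup and `tempfile.mkdtemp()` in place of calibre's `PersistentTemporaryDirectory`:

```python
import os
import re
import tempfile

# Hypothetical stand-in for a smilezilla.com contentForm: items are separated
# by <hr> rules, and each item is followed by a voting <table> to discard.
entry_html = '''<form name="contentForm">
<hr /> First joke. <table><tr><td>vote</td></tr></table>
<hr /> Second joke. <table><tr><td>vote</td></tr></table>
</form>'''

tdir = tempfile.mkdtemp()  # the recipe uses PersistentTemporaryDirectory here
articles = []
# Same pattern as the recipe: capture everything between an <hr> and the
# next <table>, i.e. the body of one item.
for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', entry_html, flags=re.DOTALL)):
    path = os.path.join(tdir, 'j{}.html'.format(i))
    with open(path, 'wb') as f:
        f.write(b'<html><body>')
        f.write(text.encode('utf-8'))
    # Each item becomes an ordinary article pointing at a local file.
    articles.append({'title': 'Joke {}'.format(i + 1), 'url': 'file:///' + path})

print(articles)  # [{'title': 'Joke 1', 'url': 'file:///...j0.html'}, ...]
```

Because every extracted item gets its own `file:///` URL, calibre fetches each joke or story as a regular local article, which is what lets the recipe drop the per-URL `counter`, the cache, and the obfuscation hooks entirely.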