Update Smilezilla

Fixes #1819960 [recipe broken Smilezilla](https://bugs.launchpad.net/calibre/+bug/1819960)
Kovid Goyal 2019-09-20 10:38:58 +05:30
parent 8c54867bbd
commit 46f32de20b


```diff
@@ -1,7 +1,10 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import os
+import re
+
+from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.ptempfile import PersistentTemporaryFile
 
 
 class SmileZilla(BasicNewsRecipe):
@@ -13,41 +16,11 @@ class SmileZilla(BasicNewsRecipe):
     STORIES_INDEX = 'http://www.smilezilla.com/story.do'
     description = 'Daily Jokes and funny stoires'
     oldest_article = 1
-    remove_tags = [
-    ]
-    keep_only_tags = []
     no_stylesheets = True
-    simultaneous_downloads = 1
-    articles_are_obfuscated = True
     encoding = 'utf-8'
+    remove_tags = [dict(name='table')]
-    counter = {JOKES_INDEX: 0, STORIES_INDEX: 0}
-    cache = {}
-
-    def cached_fetch(self, url):
-        cache = self.cache
-        if url in cache:
-            f = open(cache[url])
-            html = f.read()
-            f.close()
-            return BeautifulSoup(html, fromEncoding=self.encoding)
-        br = BasicNewsRecipe.get_browser(self)
-        response = br.open(url)
-        html = response.read()
-        soup = BeautifulSoup(html, fromEncoding=self.encoding)
-        for img in soup.findAll('img', src=True):
-            if img['src'].startswith('/'):
-                img['src'] = 'http://www.smilezilla.com' + img['src']
-        pt = PersistentTemporaryFile('.html')
-        pt.write(type(u'')(soup.html).encode(self.encoding))
-        pt.close()
-        cache[url] = pt.name
-        return soup
 
     def _get_entry(self, soup):
         return soup.find('form', attrs={'name': 'contentForm'})
@@ -56,56 +29,40 @@ class SmileZilla(BasicNewsRecipe):
         return self.tag_to_string(title_div).strip()
 
     def parse_index(self):
+        self.tdir = PersistentTemporaryDirectory()
+
+        def as_soup(url):
+            soup = self.index_to_soup(url)
+            for img in soup.findAll('img', src=True):
+                if img['src'].startswith('/'):
+                    img['src'] = 'http://www.smilezilla.com' + img['src']
+            return soup
+
         articles = []
-        soup = self.cached_fetch(self.JOKES_INDEX)
+        soup = as_soup(self.JOKES_INDEX)
         jokes_entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
         todays_jokes = []
-        for hr in enumerate(jokes_entry.findAll('hr')):
-            title = 'Joke ' + type(u'')(hr[0] + 1)
-            url = self.JOKES_INDEX
-            todays_jokes.append({'title': title, 'url': url,
-                                 'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', type(u'')(jokes_entry), flags=re.DOTALL)):
+            title = 'Joke {}'.format(i + 1)
+            with open(os.path.join(self.tdir, 'j{}.html'.format(i)), 'wb') as f:
+                f.write(b'<html><body>')
+                f.write(text.encode('utf-8'))
+            todays_jokes.append({'title': title, 'url': 'file:///' + f.name})
         articles.append((section_title, todays_jokes))
 
-        soup = self.cached_fetch(self.STORIES_INDEX)
+        soup = as_soup(self.STORIES_INDEX)
         entry = self._get_entry(soup)
         section_title = self._get_section_title(soup)
         todays_stories = []
-        for hr in enumerate(entry.findAll('hr')):
-            title = 'Story ' + type(u'')(hr[0] + 1)
-            current = hr[1]
-            while True:
-                current = current.findPrevious()
-                if current is None:
-                    break
-                elif current.name == 'hr':
-                    break
-                elif current.name == 'b':
-                    title = title + ': ' + self.tag_to_string(current)
-                    break
-            url = self.STORIES_INDEX
-            todays_stories.append({'title': title, 'url': url,
-                                   'description': '', 'date': ''})
+        for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', type(u'')(entry), flags=re.DOTALL)):
+            title = 'Story {}'.format(i)
+            with open(os.path.join(self.tdir, 's{}.html'.format(i)), 'wb') as f:
+                f.write(b'<html><body>')
+                f.write(text.encode('utf-8'))
+            todays_stories.append({'title': title, 'url': 'file:///' + f.name})
        articles.append((section_title, todays_stories))
 
         return articles
-
-    def get_obfuscated_article(self, url):
-        return self.cache[url]
-
-    def preprocess_raw_html(self, raw_html, url):
-        url = self.JOKES_INDEX if (
-            self.cache[self.JOKES_INDEX] in url) else self.STORIES_INDEX
-        count = self.counter[url] + 1
-        self.counter[url] = count
-        soup = self.index_to_soup(raw_html)
-        entry = self._get_entry(soup)
-        soup2 = BeautifulSoup('<html><head></head><body></body></html>')
-        body = soup2.find('body')
-        entries = type(u'')(entry).split('<hr />')
-        body.insert(0, entries[count - 1])
-        return type(u'')(soup2)
```
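
The rewrite drops the `articles_are_obfuscated`/`cached_fetch`/`preprocess_raw_html` machinery and instead splits each index page into per-item HTML files up front, inside `parse_index`. A minimal standalone sketch of that splitting step, using hypothetical sample markup and `tempfile.mkdtemp()` in place of calibre's `PersistentTemporaryDirectory`:

```python
import os
import re
import tempfile

# Hypothetical stand-in for a smilezilla.com contentForm: items are separated
# by <hr> rules, and each item is followed by a voting <table> to discard.
entry_html = '''<form name="contentForm">
<hr /> First joke. <table><tr><td>vote</td></tr></table>
<hr /> Second joke. <table><tr><td>vote</td></tr></table>
</form>'''

tdir = tempfile.mkdtemp()  # the recipe uses PersistentTemporaryDirectory here
articles = []
# Same pattern as the recipe: capture everything between an <hr> and the
# next <table>, i.e. the body of one item.
for i, text in enumerate(re.findall(r'<hr.*?>(.+?)<table', entry_html, flags=re.DOTALL)):
    path = os.path.join(tdir, 'j{}.html'.format(i))
    with open(path, 'wb') as f:
        f.write(b'<html><body>')
        f.write(text.encode('utf-8'))
    # Each item becomes an ordinary article pointing at a local file.
    articles.append({'title': 'Joke {}'.format(i + 1), 'url': 'file:///' + path})

print(articles)  # [{'title': 'Joke 1', 'url': 'file:///...j0.html'}, ...]
```

Because every extracted item gets its own `file:///` URL, calibre fetches each joke or story as a regular local article, which is what lets the recipe drop the per-URL `counter`, the cache, and the obfuscation hooks entirely.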