Fix #4649 (Sports Illustrated Recipe)

This commit is contained in:
Kovid Goyal 2010-01-24 09:06:56 -07:00
parent 839b5618cb
commit abf95b3511

View File

@ -1,6 +1,5 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
#from random import randint
from urllib import quote
class SportsIllustratedRecipe(BasicNewsRecipe) :
@ -9,12 +8,11 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
__license__ = 'GPL v3'
language = 'en'
description = 'Sports Illustrated'
version = 1
version = 3
title = u'Sports Illustrated'
no_stylesheets = True
remove_javascript = True
#template_css = ''
use_embedded_content = False
INDEX = 'http://sportsillustrated.cnn.com/'
@ -22,13 +20,39 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
def parse_index(self):
answer = []
soup = self.index_to_soup(self.INDEX)
# Find the link to the current issue on the front page.
# Find the link to the current issue on the front page. SI Cover
cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'})
if cover:
currentIssue = cover.parent['href']
if currentIssue:
# Open the index of current issue
index = self.index_to_soup(currentIssue)
self.log('\tLooking for current issue in: ' + currentIssue)
# Now let us see if they updated their frontpage
nav = index.find('div', attrs = {'class': 'siv_trav_top'})
if nav:
img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_next_v2.jpg'})
if img:
parent = img.parent
if parent.name == 'a':
# They didn't update their frontpage; Load the next issue from here
href = self.INDEX + parent['href']
index = self.index_to_soup(href)
self.log('\tLooking for current issue in: ' + href)
if index.find('div', 'siv_noArticleMessage'):
nav = index.find('div', attrs = {'class': 'siv_trav_top'})
if nav:
# Their frontpage points to an issue without any articles; Use the previous issue
img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_previous_v2.jpg'})
if img:
parent = img.parent
if parent.name == 'a':
href = self.INDEX + parent['href']
index = self.index_to_soup(href)
self.log('\tLooking for current issue in: ' + href)
# Find all articles.
list = index.find('div', attrs = {'class' : 'siv_artList'})
@ -69,31 +93,26 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
def preprocess_html(self, soup):
header = soup.find('div', attrs = {'class' : 'siv_artheader'})
if header:
# It's an article, prepare a container for the content
homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
body = homeMadeSoup.find('body')
homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
body = homeMadeSoup.body
# Find the date, title and byline
temp = header.find('td', attrs = {'class' : 'title'})
if temp :
date = temp.find('div', attrs = {'class' : 'date'})
if date:
body.append(date)
if temp.h1:
body.append(temp.h1)
if temp.h2 :
body.append(temp.h2)
byline = temp.find('div', attrs = {'class' : 'byline'})
if byline:
body.append(byline)
# Find the date, title and byline
temp = header.find('td', attrs = {'class' : 'title'})
if temp :
date = temp.find('div', attrs = {'class' : 'date'})
if date:
body.append(date)
if temp.h1:
body.append(temp.h1)
if temp.h2 :
body.append(temp.h2)
byline = temp.find('div', attrs = {'class' : 'byline'})
if byline:
body.append(byline)
# Find the content
for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
body.append(para)
# Find the content
for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
body.append(para)
return homeMadeSoup
else :
# It's a TOC, just return the whole lot
return soup
return homeMadeSoup