diff --git a/resources/recipes/sportsillustrated.recipe b/resources/recipes/sportsillustrated.recipe index 0dbae1ebc0..dd1df16ac7 100644 --- a/resources/recipes/sportsillustrated.recipe +++ b/resources/recipes/sportsillustrated.recipe @@ -1,6 +1,5 @@ from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup -#from random import randint from urllib import quote class SportsIllustratedRecipe(BasicNewsRecipe) : @@ -9,12 +8,11 @@ class SportsIllustratedRecipe(BasicNewsRecipe) : __license__ = 'GPL v3' language = 'en' description = 'Sports Illustrated' - version = 1 + version = 3 title = u'Sports Illustrated' no_stylesheets = True remove_javascript = True - #template_css = '' use_embedded_content = False INDEX = 'http://sportsillustrated.cnn.com/' @@ -22,13 +20,39 @@ class SportsIllustratedRecipe(BasicNewsRecipe) : def parse_index(self): answer = [] soup = self.index_to_soup(self.INDEX) - # Find the link to the current issue on the front page. + # Find the link to the current issue on the front page. SI Cover cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'}) if cover: currentIssue = cover.parent['href'] if currentIssue: # Open the index of current issue + index = self.index_to_soup(currentIssue) + self.log('\tLooking for current issue in: ' + currentIssue) + # Now let us see if they updated their frontpage + nav = index.find('div', attrs = {'class': 'siv_trav_top'}) + if nav: + img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_next_v2.jpg'}) + if img: + parent = img.parent + if parent.name == 'a': + # They didn't update their frontpage; Load the next issue from here + href = self.INDEX + parent['href'] + index = self.index_to_soup(href) + self.log('\tLooking for current issue in: ' + href) + + if index.find('div', 'siv_noArticleMessage'): + nav = index.find('div', attrs = {'class': 'siv_trav_top'}) + if nav: + # Their frontpage points to an issue without any articles; Use the previous issue + img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_previous_v2.jpg'}) + if img: + parent = img.parent + if parent.name == 'a': + href = self.INDEX + parent['href'] + index = self.index_to_soup(href) + self.log('\tLooking for current issue in: ' + href) + # Find all articles. list = index.find('div', attrs = {'class' : 'siv_artList'}) @@ -69,31 +93,26 @@ class SportsIllustratedRecipe(BasicNewsRecipe) : def preprocess_html(self, soup): header = soup.find('div', attrs = {'class' : 'siv_artheader'}) - if header: - # It's an article, prepare a container for the content - homeMadeSoup = BeautifulSoup('') - body = homeMadeSoup.find('body') + homeMadeSoup = BeautifulSoup('') + body = homeMadeSoup.body - # Find the date, title and byline - temp = header.find('td', attrs = {'class' : 'title'}) - if temp : - date = temp.find('div', attrs = {'class' : 'date'}) - if date: - body.append(date) - if temp.h1: - body.append(temp.h1) - if temp.h2 : - body.append(temp.h2) - byline = temp.find('div', attrs = {'class' : 'byline'}) - if byline: - body.append(byline) + # Find the date, title and byline + temp = header.find('td', attrs = {'class' : 'title'}) + if temp : + date = temp.find('div', attrs = {'class' : 'date'}) + if date: + body.append(date) + if temp.h1: + body.append(temp.h1) + if temp.h2 : + body.append(temp.h2) + byline = temp.find('div', attrs = {'class' : 'byline'}) + if byline: + body.append(byline) - # Find the content - for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) : - body.append(para) + # Find the content + for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) : + body.append(para) - return homeMadeSoup - else : - # It's a TOC, just return the whole lot - return soup + return homeMadeSoup