Fix #4649 (Sports Illustrated Recipe)

2025-07-09 03:04:10 -04:00 · 2010-01-24 09:06:56 -07:00 · 2010-01-24 09:06:56 -07:00 · abf95b3511
commit abf95b3511
parent 839b5618cb
1 changed files with 47 additions and 28 deletions
--- a/resources/recipes/sportsillustrated.recipe
+++ b/resources/recipes/sportsillustrated.recipe
@@ -1,6 +1,5 @@
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
-#from random import randint
 from urllib import quote

 class SportsIllustratedRecipe(BasicNewsRecipe) :
@@ -9,12 +8,11 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
    __license__ = 'GPL v3'
    language = 'en'
    description = 'Sports Illustrated'
-    version = 1
+    version = 3
    title          = u'Sports Illustrated'

    no_stylesheets = True
    remove_javascript = True
-    #template_css = ''
    use_embedded_content   = False

    INDEX = 'http://sportsillustrated.cnn.com/'
@@ -22,13 +20,39 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :
    def parse_index(self):
        answer = []
        soup = self.index_to_soup(self.INDEX)
-        # Find the link to the current issue on the front page.
+        # Find the link to the current issue on the front page. SI Cover
        cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'})
        if cover:
            currentIssue = cover.parent['href']
            if currentIssue:
                # Open the index of current issue
+
                index = self.index_to_soup(currentIssue)
+                self.log('\tLooking for current issue in: ' + currentIssue)
+                # Now let us see if they updated their frontpage
+                nav = index.find('div', attrs = {'class': 'siv_trav_top'})
+                if nav:
+                    img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_next_v2.jpg'})
+                    if img:
+                        parent = img.parent
+                        if parent.name == 'a':
+                            # They didn't update their frontpage; Load the next issue from here
+                            href = self.INDEX + parent['href']
+                            index = self.index_to_soup(href)
+                            self.log('\tLooking for current issue in: ' + href)
+
+                if index.find('div', 'siv_noArticleMessage'):
+                    nav = index.find('div', attrs = {'class': 'siv_trav_top'})
+                    if nav:
+                    # Their frontpage points to an issue without any articles; Use the previous issue
+                        img = nav.find('img', attrs = {'src': 'http://i.cdn.turner.com/sivault/.element/img/1.0/btn_previous_v2.jpg'})
+                        if img:
+                            parent = img.parent
+                            if parent.name == 'a':
+                                href = self.INDEX + parent['href']
+                                index = self.index_to_soup(href)
+                                self.log('\tLooking for current issue in: ' + href)
+

                # Find all articles.
                list = index.find('div', attrs = {'class' : 'siv_artList'})
@@ -69,31 +93,26 @@ class SportsIllustratedRecipe(BasicNewsRecipe) :

    def preprocess_html(self, soup):
        header = soup.find('div', attrs = {'class' : 'siv_artheader'})
-        if header:
-            # It's an article, prepare a container for the content
-            homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
-            body = homeMadeSoup.find('body')
+        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+        body = homeMadeSoup.body

-            # Find the date, title and byline
-            temp = header.find('td', attrs = {'class' : 'title'})
-            if temp :
-                date = temp.find('div', attrs = {'class' : 'date'})
-                if date:
-                    body.append(date)
-                if temp.h1:
-                    body.append(temp.h1)
-                if temp.h2 :
-                    body.append(temp.h2)
-                byline = temp.find('div', attrs = {'class' : 'byline'})
-                if byline:
-                    body.append(byline)
+        # Find the date, title and byline; soup.find() gives None when the page has no 'siv_artheader' div, so guard the lookup
+        temp = header.find('td', attrs = {'class' : 'title'}) if header else None
+        if temp :
+            date = temp.find('div', attrs = {'class' : 'date'})
+            if date:
+                body.append(date)
+            if temp.h1:
+                body.append(temp.h1)
+            if temp.h2 :
+                body.append(temp.h2)
+            byline = temp.find('div', attrs = {'class' : 'byline'})
+            if byline:
+                body.append(byline)

-            # Find the content
-            for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
-                body.append(para)
+        # Find the content
+        for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
+            body.append(para)

-            return homeMadeSoup
-        else :
-            # It's a TOC, just return the whole lot
-            return soup
+        return homeMadeSoup