From 9729c554135893ad9eb81c380fb01d6035c68048 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 26 Nov 2009 12:58:59 -0700
Subject: [PATCH] New recipes for Sports Illustrated and Sports Illustrated Columnists by kwetal

---
 resources/recipes/sportsillustrated.recipe    | 103 +++++++++++++++++++
 .../sportsillustrated_columnists.recipe       |  63 ++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 resources/recipes/sportsillustrated.recipe
 create mode 100644 resources/recipes/sportsillustrated_columnists.recipe

diff --git a/resources/recipes/sportsillustrated.recipe b/resources/recipes/sportsillustrated.recipe
new file mode 100644
index 0000000000..0dbae1ebc0
--- /dev/null
+++ b/resources/recipes/sportsillustrated.recipe
@@ -0,0 +1,103 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from random import randint
+from urllib import quote
+
+class SportsIllustratedRecipe(BasicNewsRecipe) :
+    '''Build a single feed from the articles listed in the current Sports Illustrated issue.'''
+    __author__ = 'kwetal'
+    __copyright__ = 'kwetal'
+    __license__ = 'GPL v3'
+    language = 'en'
+    description = 'Sports Illustrated'
+    version = 1
+    title = u'Sports Illustrated'
+
+    no_stylesheets = True
+    remove_javascript = True
+    #template_css = ''
+    use_embedded_content = False
+
+    INDEX = 'http://sportsillustrated.cnn.com/'
+
+    def parse_index(self):
+        '''Return [[feed title, articles]] for the current issue, or [] when it cannot be located.'''
+        answer = []
+        soup = self.index_to_soup(self.INDEX)
+        # Find the link to the current issue on the front page.
+        cover = soup.find('img', attrs = {'alt' : 'Read All Articles', 'style' : 'vertical-align:bottom;'})
+        if cover:
+            currentIssue = cover.parent.get('href')  # None (not KeyError) when the parent has no href
+            if currentIssue:
+                # Open the index of current issue
+                index = self.index_to_soup(currentIssue)
+
+                # Find all articles.
+                articleList = index.find('div', attrs = {'class' : 'siv_artList'})
+                if articleList:
+                    articles = []
+                    # Get all the articles ready for calibre.
+                    for headline in articleList.findAll('div', attrs = {'class' : 'headline'}):
+                        title = self.tag_to_string(headline.a) + '\n' + self.tag_to_string(headline.findNextSibling('div', attrs = {'class' : 'info'}))
+                        url = self.INDEX + headline.a['href']
+                        description = self.tag_to_string(headline.findNextSibling('a').div)
+                        article = {'title' : title, 'date' : u'', 'url' : url, 'description' : description}
+
+                        articles.append(article)
+
+                    # See if we can find a meaningful title
+                    feedTitle = 'Current Issue'
+                    hasTitle = index.find('div', attrs = {'class' : 'siv_imageText_head'})
+                    if hasTitle :
+                        feedTitle = self.tag_to_string(hasTitle.h1)
+
+                    answer.append([feedTitle, articles])
+
+        return answer
+
+
+    def print_version(self, url) :
+        '''Return the clickability print-view address for the article at url.'''
+        # This is the url and the parameters that work to get the print version.
+        printUrl = 'http://si.printthis.clickability.com/pt/printThis?clickMap=printThis'
+        printUrl += '&fb=Y&partnerID=2356&url=' + quote(url)
+
+        return printUrl
+
+        # However the original javascript also uses the following parameters, but they can be left out:
+        # title : can be some random string
+        # random : some random number, but I think the number of digits is important
+        # expire : no idea what value to use
+        # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js
+
+    def preprocess_html(self, soup):
+        '''Return a new document holding only the article header and paragraphs; other pages are returned unchanged.'''
+        header = soup.find('div', attrs = {'class' : 'siv_artheader'})
+        if header:
+            # It's an article, prepare a container for the content
+            homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+            body = homeMadeSoup.find('body')
+
+            # Find the date, title and byline
+            temp = header.find('td', attrs = {'class' : 'title'})
+            if temp :
+                date = temp.find('div', attrs = {'class' : 'date'})
+                if date:
+                    body.append(date)
+                if temp.h1:
+                    body.append(temp.h1)
+                if temp.h2 :
+                    body.append(temp.h2)
+                byline = temp.find('div', attrs = {'class' : 'byline'})
+                if byline:
+                    body.append(byline)
+
+            # Find the content
+            for para in soup.findAll('div', attrs = {'class' : 'siv_artpara'}) :
+                body.append(para)
+
+            return homeMadeSoup
+        else :
+            # It's a TOC, just return the whole lot
+            return soup
+
diff --git a/resources/recipes/sportsillustrated_columnists.recipe b/resources/recipes/sportsillustrated_columnists.recipe
new file mode 100644
index 0000000000..2f6b18c992
--- /dev/null
+++ b/resources/recipes/sportsillustrated_columnists.recipe
@@ -0,0 +1,63 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+#from random import randint
+from urllib import quote
+
+class SportsIllustratedColumnistsRecipe(BasicNewsRecipe) :
+    '''Fetch the Sports Illustrated columnist RSS feeds and reduce each story to headline and paragraphs.'''
+    title = u'Sports Illustrated Columnists'
+    __author__ = u'kwetal'
+    __license__ = u'GPL v3'
+    language = 'en'
+    version = 2
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = []
+    # RSS sources found at http://sportsillustrated.cnn.com/services/rss/
+    feeds.append((u'Jon Heyman', u'http://rss.cnn.com/rss/si_jon_heyman.rss'))
+    feeds.append((u'Austin Murphy', u'http://rss.cnn.com/rss/si_austin_murphy.rss'))
+    feeds.append((u'Lars Anderson', u'http://rss.cnn.com/rss/si_lars_anderson.rss'))
+    feeds.append((u'Melissa Segura', u'http://rss.cnn.com/rss/si_melissa_segura.rss'))
+    feeds.append((u'Peter King', u'http://rss.cnn.com/rss/si_peter_king.rss'))
+    feeds.append((u'Scott Wraight', u'http://rss.cnn.com/rss/si_scott_wraight.rss'))
+
+    def print_version(self, url) :
+        '''Return the clickability print-view address for the article at url.'''
+        # This is the url and the parameters that work to get the print version.
+        printUrl = 'http://si.printthis.clickability.com/pt/printThis?clickMap=printThis'
+        printUrl += '&fb=Y&partnerID=2356&url=' + quote(url)
+
+        return printUrl
+
+        # However the original javascript also uses the following parameters, but they can be left out:
+        # title : can be some random string
+        # random : some random number, but I think the number of digits is important
+        # expire : no idea what value to use
+        # All this comes from the Javascript function that redirects to the print version. It's called PT() and is defined in the file 48.js
+
+    def preprocess_html(self, soup) :
+        '''Return a new document holding only the story headline and paragraphs; other pages are returned unchanged.'''
+        temp = soup.find('div', attrs = {'class' : 'cnnstoryheadline'})
+        if temp :
+            # It's an article, make a valid content container
+            homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
+            body = homeMadeSoup.find('body')
+
+            headline = temp.find('h1')
+            if headline :
+                body.append(headline)
+
+            for td in soup.findAll('td', attrs = {'class' : 'cnnstorycontentarea'}) :
+                for p in td.findAll('p') :
+                    body.append(p)
+
+            return homeMadeSoup
+        else :
+            # It's a TOC, just return the whole lot
+            return soup
+
+