from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2


class SportsIllustratedColumnistsRecipe(BasicNewsRecipe):
    """Recipe that collects the Sports Illustrated columnist RSS feeds.

    Articles are fetched through the print-friendly URL built by
    ``print_version`` and reduced by ``preprocess_html`` to their headline
    and paragraphs.
    """

    title = u'Sports Illustrated Columnists'
    __author__ = u'kwetal'
    __license__ = u'GPL v3'
    language = 'en'
    version = 2

    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True

    # RSS sources found at http://sportsillustrated.cnn.com/services/rss/
    feeds = [
        (u'Jon Heyman', u'http://rss.cnn.com/rss/si_jon_heyman.rss'),
        (u'Austin Murphy', u'http://rss.cnn.com/rss/si_austin_murphy.rss'),
        (u'Lars Anderson', u'http://rss.cnn.com/rss/si_lars_anderson.rss'),
        (u'Melissa Segura', u'http://rss.cnn.com/rss/si_melissa_segura.rss'),
        (u'Peter King', u'http://rss.cnn.com/rss/si_peter_king.rss'),
        (u'Scott Wraight', u'http://rss.cnn.com/rss/si_scott_wraight.rss'),
    ]

    def print_version(self, url):
        """Return the print-friendly URL for the article at *url*.

        The article URL is percent-encoded and passed as the ``url`` query
        parameter of the clickability "printThis" service.
        """
        # This is the url and the parameters that work to get the print
        # version. The original javascript also uses the following
        # parameters, but they can be left out:
        #   title  : can be some random string
        #   random : some random number, but I think the number of digits
        #            is important
        #   expire : no idea what value to use
        # All this comes from the Javascript function that redirects to the
        # print version. It's called PT() and is defined in the file 48.js
        base = 'http://si.printthis.clickability.com/pt/printThis?clickMap=printThis'
        return base + '&fb=Y&partnerID=2356&url=' + quote(url)

    def preprocess_html(self, soup):
        """Reduce an article page to its headline and paragraphs.

        When *soup* contains a ``div`` with class ``cnnstoryheadline`` it is
        treated as an article: a new document is returned whose body holds
        the headline ``h1`` (if present) followed by every ``p`` found inside
        ``td`` elements with class ``cnnstorycontentarea``. Otherwise *soup*
        is returned unchanged.
        """
        headline_box = soup.find('div', attrs={'class': 'cnnstoryheadline'})
        if headline_box is None:
            # It's a TOC, just return the whole lot
            return soup

        # It's an article, make a valid content container. The skeleton must
        # contain a <body> explicitly: parsing an empty string gives no body
        # element to append to, so find('body') would return None.
        container = BeautifulSoup('<html><head></head><body></body></html>')
        body = container.find('body')

        headline = headline_box.find('h1')
        if headline is not None:
            body.append(headline)

        content_cells = soup.findAll('td', attrs={'class': 'cnnstorycontentarea'})
        for cell in content_cells:
            for paragraph in cell.findAll('p'):
                body.append(paragraph)

        return container