"""
Calibre recipe for Roger Ebert's movie reviews.

Help needed:
    Some text still comes out with strange characters, especially the
    Great Movies descriptions in the TOC. Can anyone help figure out why?

Change log:
    2011-02-19: Version 2: Added "Oscars" section and fixed date problem
"""
import re
import urllib2
import time

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
from calibre import strftime


class Ebert(BasicNewsRecipe):
    """Recipe that builds its article index by scraping section pages.

    Each entry in ``feeds`` is a section index page on
    rogerebert.suntimes.com; ``parse_index`` downloads the page and pulls
    the article links and blurbs out of it with the ``pattern*`` regexes.
    """

    title = 'Roger Ebert'
    __author__ = 'Shane Erstad'
    version = 2
    description = 'Roger Ebert Movie Reviews'
    publisher = 'Chicago Sun Times'
    category = 'movies'
    oldest_article = 8
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'UTF-8'
    masthead_url = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language = 'en'
    remove_empty_feeds = False

    # Prepended to the href of every scraped link to build the article URL.
    PREFIX = 'http://rogerebert.suntimes.com'

    # NOTE(review): the patterns below look damaged. They contain no HTML
    # markup and embed literal newlines, so the tags they were presumably
    # anchored on appear to have been lost somewhere along the way. Their
    # values are kept as found (newlines spelled as \n escapes); restore
    # them from a known-good copy of the recipe before relying on them.
    #
    # Reviews / Great Movies: group 1 = movie title, group 2 = markup holding
    # the article link(s), group 3 = description.
    patternReviews = '(.*?).*?\n(.*?)\n(.*?)'
    # Other sections: group 1 = markup holding the link(s), group 2 =
    # description.
    patternCommentary = '\n.*?(.*?).*?\n(.*?)\n'
    patternPeople = '\n.*?(.*?).*?\n(.*?)\n'
    patternOscars = '\n.*?(.*?).*?\n(.*?)\n'
    patternGlossary = '\n.*?(.*?).*?\n(.*?)\n'

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    feeds = [
        (u'Reviews', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews'),
        (u'Commentary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY'),
        (u'Great Movies', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08'),
        (u'People', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE'),
        (u'Oscars', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=OSCARS'),
        (u'Glossary', u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY'),
    ]

    # Text matched by this pattern is replaced with the empty string.
    # NOTE(review): like the index patterns above, this one seems to have
    # lost the markup surrounding the "printer friendly" notice.
    preprocess_regexps = [
        (re.compile('.*?This is a printer friendly.*?.*?\n', re.DOTALL|re.IGNORECASE),
         lambda m: ''),
    ]

    # Feed title -> name of the class attribute holding its index regex.
    # Looked up by name so that overriding a pattern* attribute still works.
    _FEED_PATTERN_ATTRS = {
        'Reviews': 'patternReviews',
        'Great Movies': 'patternReviews',
        'Commentary': 'patternCommentary',
        'People': 'patternPeople',
        'Glossary': 'patternGlossary',
        'Oscars': 'patternOscars',
    }

    # Feeds whose regex captures the movie title separately (three groups);
    # every other feed takes its title from the link text (two groups).
    _MOVIE_TITLE_FEEDS = ('Reviews', 'Great Movies')

    # Extracts the path segment following "AID=/" from an article URL; that
    # segment is then parsed as a YYYYMMDD date.
    _AID_RE = re.compile(r'AID=\/(.*?)\/', re.IGNORECASE|re.DOTALL)

    _DATE_FORMAT = "%A, %B %d, %Y"

    def print_version(self, url):
        """Return *url* with ``&template=printart`` appended."""
        return url + '&template=printart'

    def _fetch_page(self, url):
        """Download *url* and return the raw response body.

        The connection is always closed, even if reading fails.
        """
        # NOTE(review): the body is returned as undecoded bytes and the
        # descriptions cut from it go straight into the index. That is
        # presumably related to the "strange characters" problem described
        # in the module docstring -- confirm the page's real encoding
        # before adding a decode step here.
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _article_date(self, url):
        """Return the display date for the article at *url*.

        The date is taken from the ``AID=/YYYYMMDD/`` part of the URL. When
        that part is missing or is not a valid date, today's date is used.
        """
        found = self._AID_RE.search(url)
        if found:
            try:
                parsed = time.strptime(found.group(1), "%Y%m%d")
            except ValueError:
                # An unparsable AID should not abort the whole index build.
                self.log('\tUnparsable article date in: ', url)
            else:
                return strftime(self._DATE_FORMAT, parsed)
        return strftime(self._DATE_FORMAT)

    def parse_index(self):
        """Scrape every section page and return the article index.

        Returns a list of ``(feed title, articles)`` tuples, where each
        article is a dict with ``title``, ``date``, ``url`` and
        ``description`` keys. Feeds whose title has no associated pattern
        are logged and skipped.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.log('\tFeedurl: ', feedurl)
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))

            pattern_attr = self._FEED_PATTERN_ATTRS.get(feedtitle)
            if pattern_attr is None:
                # Without this guard an unlisted title would either raise a
                # NameError or silently reuse the previous feed's pattern.
                self.log('\tNo index pattern for feed, skipping: ', feedtitle)
                continue

            page = self._fetch_page(feedurl)
            regex = re.compile(getattr(self, pattern_attr), re.IGNORECASE|re.DOTALL)
            has_movie_title = feedtitle in self._MOVIE_TITLE_FEEDS

            articles = []
            for match in regex.finditer(page):
                if has_movie_title:
                    movietitle, thislink, description = match.group(1, 2, 3)
                else:
                    movietitle = None
                    thislink, description = match.group(1, 2)
                self.log(thislink)

                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
                    try:
                        href = link['href']
                    except KeyError:
                        # Anchor without an href: nothing to link to.
                        continue
                    thisurl = self.PREFIX + href

                    if has_movie_title:
                        thistitle = movietitle
                    else:
                        thistitle = self.tag_to_string(link)
                    if thistitle == '':
                        continue

                    mydate = self._article_date(thisurl)
                    self.log(mydate)
                    articles.append({
                        'title': thistitle,
                        'date': ' [' + mydate + ']',
                        'url': thisurl,
                        'description': description,
                    })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds