From e42664da72abf03af7c86d6b8ded585389e61b65 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 22 Jan 2011 11:08:13 -0700 Subject: [PATCH] Roger Ebert by Shane Erstad --- resources/recipes/roger_ebert.recipe | 120 +++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 resources/recipes/roger_ebert.recipe diff --git a/resources/recipes/roger_ebert.recipe b/resources/recipes/roger_ebert.recipe new file mode 100644 index 0000000000..2ea5b52a45 --- /dev/null +++ b/resources/recipes/roger_ebert.recipe @@ -0,0 +1,120 @@ +import re +import urllib2 +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer + +class Ebert(BasicNewsRecipe): + title = 'Roger Ebert' + __author__ = 'Shane Erstad' + description = 'Roger Ebert Movie Reviews' + publisher = 'Chicago Sun Times' + category = 'movies' + oldest_article = 8 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + encoding = 'utf-8' + masthead_url = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg' + language = 'en' + remove_empty_feeds = False + PREFIX = 'http://rogerebert.suntimes.com' + patternReviews = r'(.*?).*?
(.*?)
(.*?)' + patternCommentary = r'
.*?(.*?).*?
(.*?)
' + patternPeople = r'
.*?(.*?).*?
(.*?)
' + patternGlossary = r'
.*?(.*?).*?
(.*?)
' + + + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + + + feeds = [ + (u'Reviews' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews' ) + ,(u'Commentary' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY') + ,(u'Great Movies' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08') + ,(u'People' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE') + ,(u'Glossary' , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY') + + ] + + preprocess_regexps = [ + (re.compile(r'.*?This is a printer friendly.*?.*?
', re.DOTALL|re.IGNORECASE), + lambda m: '') + ] + + + + def print_version(self, url): + return url + '&template=printart' + + def parse_index(self): + totalfeeds = [] + lfeeds = self.get_feeds() + for feedobj in lfeeds: + feedtitle, feedurl = feedobj + self.log('\tFeedurl: ', feedurl) + self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl)) + articles = [] + page = urllib2.urlopen(feedurl).read() + + if feedtitle == 'Reviews' or feedtitle == 'Great Movies': + pattern = self.patternReviews + elif feedtitle == 'Commentary': + pattern = self.patternCommentary + elif feedtitle == 'People': + pattern = self.patternPeople + elif feedtitle == 'Glossary': + pattern = self.patternGlossary + + + regex = re.compile(pattern, re.IGNORECASE|re.DOTALL) + + for match in regex.finditer(page): + if feedtitle == 'Reviews' or feedtitle == 'Great Movies': + movietitle = match.group(1) + thislink = match.group(2) + description = match.group(3) + elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary': + thislink = match.group(1) + description = match.group(2) + + self.log(thislink) + + for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')): + thisurl = self.PREFIX + link['href'] + thislinktext = self.tag_to_string(link) + + if feedtitle == 'Reviews' or feedtitle == 'Great Movies': + thistitle = movietitle + elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary': + thistitle = thislinktext + + if thistitle == '': + thistitle = 'Ebert Journal Post' + + """ + pattern2 = r'AID=\/(.*?)\/' + reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL) + match2 = reg2.search(thisurl) + date = match2.group(1) + c = time.strptime(match2.group(1),"%Y%m%d") + date=time.strftime("%a, %b %d, %Y", c) + self.log(date) + """ + + articles.append({ + 'title' :thistitle + ,'date' :'' + ,'url' :thisurl + ,'description':description + }) + totalfeeds.append((feedtitle, articles)) + + return totalfeeds +