Roger Ebert by Shane Erstad

2025-07-09 03:04:10 -04:00 · 2011-01-22 11:08:13 -07:00 · 2011-01-22 11:08:13 -07:00 · e42664da72
commit e42664da72
parent 1a1af35af1
1 changed files with 120 additions and 0 deletions
--- a/resources/recipes/roger_ebert.recipe
+++ b/resources/recipes/roger_ebert.recipe
@@ -0,0 +1,120 @@
+import re
+import urllib2
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
+
+class Ebert(BasicNewsRecipe):
+    title                 = 'Roger Ebert'
+    __author__            = 'Shane Erstad'
+    description           = 'Roger Ebert Movie Reviews'
+    publisher             = 'Chicago Sun Times'
+    category              = 'movies'
+    oldest_article        = 8
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    encoding              = 'utf-8'
+    masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
+    language              = 'en'
+    remove_empty_feeds    = False
+    PREFIX                  = 'http://rogerebert.suntimes.com'
+    patternReviews                = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
+    patternCommentary       = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
+    patternPeople           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
+    patternGlossary           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
+
+
+
+    conversion_options = {
+                          'comment'          : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        , 'linearize_tables' : True
+                        }
+
+
+    feeds          = [
+                        (u'Reviews'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews' )
+                        ,(u'Commentary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY')
+                        ,(u'Great Movies'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08')
+                        ,(u'People'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE')
+                        ,(u'Glossary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')
+
+                     ]
+
+    preprocess_regexps = [
+        (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
+            lambda m: '')
+    ]
+
+
+
+    def print_version(self, url):
+        return url + '&template=printart'
+
+    def parse_index(self):
+        totalfeeds = []
+        lfeeds = self.get_feeds()
+        for feedobj in lfeeds:
+            feedtitle, feedurl = feedobj
+            self.log('\tFeedurl: ', feedurl)
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            articles = []
+            page = urllib2.urlopen(feedurl).read()
+
+            if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
+                    pattern = self.patternReviews
+            elif feedtitle == 'Commentary':
+                    pattern = self.patternCommentary
+            elif feedtitle == 'People':
+                    pattern = self.patternPeople
+            elif feedtitle == 'Glossary':
+                    pattern = self.patternGlossary
+
+
+            regex = re.compile(pattern, re.IGNORECASE|re.DOTALL)
+
+            for match in regex.finditer(page):
+                if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
+                    movietitle = match.group(1)
+                    thislink = match.group(2)
+                    description = match.group(3)
+                elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
+                    thislink = match.group(1)
+                    description = match.group(2)
+
+                self.log(thislink)
+
+                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
+                    thisurl = self.PREFIX + link['href']
+                    thislinktext = self.tag_to_string(link)
+
+                    if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
+                        thistitle = movietitle
+                    elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
+                        thistitle = thislinktext
+
+                    if thistitle == '':
+                        thistitle = 'Ebert Journal Post'
+
+                    """
+                    pattern2 = r'AID=\/(.*?)\/'
+                    reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
+                    match2 = reg2.search(thisurl)
+                    date = match2.group(1)
+                    c = time.strptime(match2.group(1),"%Y%m%d")
+                    date=time.strftime("%a, %b %d, %Y", c)
+                    self.log(date)
+                    """
+
+                    articles.append({
+                                      'title'      :thistitle
+                                     ,'date'       :''
+                                     ,'url'        :thisurl
+                                     ,'description':description
+                                    })
+            totalfeeds.append((feedtitle, articles))
+
+        return totalfeeds
+