Roger Ebert Journal by Shane Erstad

2025-11-15 19:13:02 -05:00 · 2011-02-19 17:05:19 -07:00 · 2011-02-19 17:05:19 -07:00 · 9adc087ad5
commit 9adc087ad5
parent 152f569455
1 changed files with 102 additions and 0 deletions
--- a/resources/recipes/roger_ebert_blog.recipe
+++ b/resources/recipes/roger_ebert_blog.recipe
@@ -0,0 +1,102 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class EbertJournal(BasicNewsRecipe):
+    """Calibre news recipe for the Roger Ebert Journal blog.
+
+    The blog front page is scraped directly by parse_index(); every entry
+    found there becomes one article of a single feed.
+    """
+
+    title                 = 'Roger Ebert Journal'
+    __author__            = 'Shane Erstad'
+    description           = 'Roger Ebert Journal'
+    publisher             = 'Chicago Sun Times'
+    category              = 'movies'
+    oldest_article        = 8
+    max_articles_per_feed = 100
+    no_stylesheets        = True
+    use_embedded_content  = False
+    encoding              = 'ISO-8859-1'
+    masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
+    language              = 'en'
+    remove_empty_feeds    = False
+    PREFIX                = 'http://blogs.suntimes.com/ebert'
+
+    # Trim each article page to the region from the element with id
+    # 'content' through the element with id 'comments-open'.
+    remove_tags_before = dict(id='content')
+    remove_tags_after = dict(id='comments-open')
+
+    extra_css             = """
+                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+                                .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif}
+                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
+                                border-left: 1px solid #D00000; color: #D00000}
+                                img{margin-bottom: 0.8em} """
+
+    conversion_options = {
+                          'comment'          : description
+                        , 'tags'             : category
+                        , 'publisher'        : publisher
+                        , 'language'         : language
+                        , 'linearize_tables' : True
+                        }
+
+    # The single "feed" is the blog front page, scraped by parse_index().
+    feeds          = [
+                        (u'Roger Ebert Journal'   , u'http://blogs.suntimes.com/ebert/' )
+                     ]
+
+    # Regex clean-ups, as (pattern, replacement function) pairs, for the HTML.
+    preprocess_regexps = [
+        # Unwrap Roger Ebert's own byline; the next rule then no longer matches it.
+        (re.compile(r'<span class="vcard author">Roger Ebert</span>', re.DOTALL|re.IGNORECASE),
+            lambda m: 'Roger Ebert'),
+        # Put a horizontal rule in front of every remaining byline span.
+        (re.compile(r'<span class="vcard author">', re.DOTALL|re.IGNORECASE),
+            lambda m: '<hr width="80%"><span class="vcard author">'),
+        # Drop opening <blockquote> tags (closing tags are left alone).
+        (re.compile(r'<blockquote>', re.DOTALL|re.IGNORECASE),
+            lambda m: ''),
+        # Drop the a2a_dd anchor and the comment-form heading.
+        (re.compile(r'<a class="a2a_dd".*?</a>', re.DOTALL|re.IGNORECASE),
+            lambda m: ''),
+        (re.compile(r'<h2 class="comments-open-header">Leave a comment</h2>', re.DOTALL|re.IGNORECASE),
+            lambda m: ''),
+        # Drop "Reply" links; the leading '<' is matched so none is left behind.
+        (re.compile(r'<a title="Reply".*?</a>', re.DOTALL|re.IGNORECASE),
+            lambda m: ''),
+    ]
+
+    def parse_index(self):
+        """Scrape every URL in the feed list into (feed title, articles) pairs.
+
+        Each article is a dict with 'title', 'date', 'url' and 'description'
+        keys. Entries without a linked title are skipped.
+        """
+        totalfeeds = []
+        for feedtitle, feedurl in self.get_feeds():
+            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
+            articles = []
+            soup = self.index_to_soup(feedurl)
+            for item in soup.findAll(attrs={'class':['entry-asset asset hentry']}):
+                # Not every entry carries an image enclosure; drop it if present.
+                enclosure = item.find(attrs={'class':['mt-enclosure mt-enclosure-image']})
+                if enclosure is not None:
+                    enclosure.extract()
+                titlesection = item.find(attrs={'class':['asset-name entry-title']})
+                link = titlesection.find('a') if titlesection is not None else None
+                if link is None or not link.get('href'):
+                    continue  # nothing to download without a link
+                title = self.tag_to_string(link)
+                self.log(title, link['href'])
+                articles.append({
+                    'title'      : title,
+                    'date'       : ' [' + self.tag_to_string(item.find(attrs={'class':['published']})) + ']',
+                    'url'        : link['href'],
+                    'description': self.tag_to_string(item.find(attrs={'class':['asset-body']})),
+                })
+            totalfeeds.append((feedtitle, articles))
+        return totalfeeds
+