New recipe for Mother Jones by kwetal

2026-06-07 06:25:26 -04:00 · 2010-01-02 13:24:27 -07:00
parent 4378c69bcd
commit 96ae3c2086
2 changed files with 103 additions and 0 deletions
@@ -0,0 +1,103 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ptempfile import PersistentTemporaryFile
+
+class MotherJonesRecipe(BasicNewsRecipe):
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'en'
+    version = 1
+
+    title = u'Mother Jones'
+    publisher = u'Mother Jones'
+    category = u'News, Investigative journalism'
+    description = u'Independent investigative, political, and social justice reporting. Takes no prisoners, cleaves to no dogma, and tells it like it is.'
+
+    oldest_article = 14
+    max_articles_per_feed = 100
+    use_embedded_content = False
+
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_javascript = True
+    simultaneous_downloads = 3
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name = 'h1'))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'dek'}))
+    keep_only_tags.append(dict(name = 'p', attrs = {'class': 'submitted'}))
+    keep_only_tags.append(dict(name = 'div', attrs = {'class': 'print-content'}))
+    #keep_only_tags.append(dict(name = '', attrs = {'': ''}))
+
+    remove_tags = []
+    remove_tags.append(dict(name = 'base'))
+    #remove_tags.append(dict(name = '', attrs = {'': ''}))
+
+    remove_attributes = ['style']
+
+    # feeds from http://motherjones.com/about/rss
+    feeds = []
+    feeds.append((u'Latest News', u'http://feeds.feedburner.com/motherjones/main?format=xml'))
+    feeds.append((u'Politics & Current Affairs', u'http://motherjones.com/rss/sections/Politics/feed&format=xml'))
+    feeds.append((u'Environment & Health', u'http://motherjones.com/rss/sections/Environment/feed'))
+    feeds.append((u'Media & Culture', u'http://motherjones.com/rss/sections/Media/feed'))
+    feeds.append((u'Blog: Kevin Drum', u'http://motherjones.com/rss/blogs/Kevin+Drum/feed'))
+    feeds.append((u'Blog: MoJo Blog', u'http://motherjones.com/rss/blogs/mojo/feed'))
+    feeds.append((u'Blog: Blue Marble', u'http://motherjones.com/rss/blogs/Blue+Marble/feed'))
+    feeds.append((u'Blog: The Riff', u'http://motherjones.com/rss/blogs/Riff/feed'))
+    ##feeds.append((u'', u''))
+
+    extra_css = '''
+                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
+                img {float: left; margin-right: 0.5em;}
+                div.dek {font-style: italic;}
+                p.submitted {font-size: x-small; color: #696969;}
+                div.mj_support {font-size: x-small; color: #0666666; border: 1px solid black; padding: 0.5em}
+                a, a[href] {text-decoration: none; color: blue;}
+                '''
+
+    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
+                          'publisher': publisher}
+
+    temp_files = []
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        '''
+        The print version is sort of hard to get. I think they look at the referer header, and if
+        it is not right they serve the original. This method works around that.
+        '''
+        br = self.get_browser()
+        br.open(url)
+
+        response = br.follow_link(url_regex = r'/print/[0-9]+', nr = 0)
+        html = response.read()
+
+        self.temp_files.append(PersistentTemporaryFile('_motherjones.html'))
+        self.temp_files[-1].write(html)
+        self.temp_files[-1].close()
+
+        return self.temp_files[-1].name
+
+    def get_article_url(self, article):
+        '''
+        Some of the feeds are served by feedburner (grr). Then the workaround to get their
+        print version doesn't work anymore. This method provides a workaround.
+        '''
+        if hasattr(article, 'feedburner_origlink'):
+            return article.feedburner_origlink
+        else:
+            return article.link
+
+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', attrs = {'src': True}):
+            if not img['src'].startswith('http://'):
+                img['src'] = 'http://motherjones.com' + img['src']
+
+        div = Tag(soup, 'div', [('class', 'mj_support')])
+        div.append('''Your tax-deductible gifts help keep Mother Jones independent and uncompromised.
+                      To make a contribution, visit MotherJones.com or call 877-GIV-MOJO.
+                   ''')
+        soup.body.append(div)
+
+        return soup