From 603d01b5d1d3995a397b7e7e50147ff63014c8de Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 12 Jun 2009 20:11:30 -0700 Subject: [PATCH] New recipes for content from The Kellog School of Management --- src/calibre/web/feeds/recipes/__init__.py | 2 +- .../feeds/recipes/recipe_kellog_faculty.py | 69 +++++++++++++++++++ .../feeds/recipes/recipe_kellog_insight.py | 35 ++++++++++ .../web/feeds/recipes/recipe_new_yorker.py | 12 ++-- 4 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 src/calibre/web/feeds/recipes/recipe_kellog_faculty.py create mode 100644 src/calibre/web/feeds/recipes/recipe_kellog_insight.py diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py index 9b75b60656..57cd30874b 100644 --- a/src/calibre/web/feeds/recipes/__init__.py +++ b/src/calibre/web/feeds/recipes/__init__.py @@ -47,7 +47,7 @@ recipe_modules = ['recipe_' + r for r in ( 'climate_progress', 'carta', 'slashdot', 'publico', 'the_budget_fashionista', 'elperiodico_catalan', 'elperiodico_spanish', 'expansion_spanish', 'lavanguardia', - 'marca', + 'marca', 'kellog_faculty', 'kellog_insight', )] import re, imp, inspect, time, os diff --git a/src/calibre/web/feeds/recipes/recipe_kellog_faculty.py b/src/calibre/web/feeds/recipes/recipe_kellog_faculty.py new file mode 100644 index 0000000000..b66659382b --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_kellog_faculty.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import BeautifulSoup + +class KellogFaculty(BasicNewsRecipe): + + title = 'Kellog Faculty Blogs' + __author__ = 'Kovid Goyal' + description = 'Blogs of the Kellog School of Management Faculty' + no_stylesheets = True + encoding = 'utf-8' + language = _('English') + remove_tags_before = {'name':'h2'} + remove_tags_after = {'class':'col-two-text'} + + def parse_index(self): + soup = self.index_to_soup('http://www.kellogg.northwestern.edu/Faculty/Blogroll.aspx') + feeds, articles = [], [] + feed_title = None + main = soup.find(id='bodyCopy') + for tag in main.findAll(['h3', 'div']): + if tag.name == 'h3': + title = self.tag_to_string(tag).capitalize() + a = tag.find('a', href=True) + if articles and feed_title: + feeds.append((feed_title, articles)) + articles = [] + # Keep only blogs hosted on the Kellog servers + feed_title = title if a and 'insight.kellog' in a['href'] else None + elif tag.name == 'div' and tag.get('class', '') == 'rssfeed': + script = tag.find('script', src=True) + text = \ + self.browser.open(script['src']).read().replace('document.write(', + '')[:-2] + text = eval(text) + asoup = BeautifulSoup(text) + for tag in asoup.findAll('div', + attrs={'class':'rssincl-entry'}): + title = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemtitle'})) + try: + desc = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemdesc'})) + except: + desc = '' + url = tag.find('a', href=True)['href'] + + articles.append({ + 'title':title.strip(), 'url':url, 'description':desc.strip(), 'date':'' + }) + + return feeds + + def postprocess_html(self, soup, first_fetch): + for tag in soup.findAll(style=True): + del tag['style'] + head = soup.find('head') + if head is not None: + for p in head.findAll('p'): p.extract() + for meta in soup.findAll('meta', attrs={'name':'description'}): meta.extract() + for t in head.findAll(text=True): t.extract() + return soup + + diff --git a/src/calibre/web/feeds/recipes/recipe_kellog_insight.py b/src/calibre/web/feeds/recipes/recipe_kellog_insight.py new file mode 100644 index 0000000000..9dcf4c7039 --- /dev/null +++ b/src/calibre/web/feeds/recipes/recipe_kellog_insight.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.web.feeds.news import BasicNewsRecipe + +class KellogInsight(BasicNewsRecipe): + + title = 'Kellog Insight' + __author__ = 'Kovid Goyal' + description = 'Articles from the Kellog School of Management' + no_stylesheets = True + encoding = 'utf-8' + language = _('English') + oldest_article = 60 + remove_tags_before = {'name':'h1'} + remove_tags_after = {'class':'col-two-text'} + + + + feeds = [('Articles', + 'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')] + + def get_article_url(self, article): + # Get only article not blog links + link = BasicNewsRecipe.get_article_url(self, article) + if link and '/article/' in link: + return link + self.log('Skipping non-article', link) + return None diff --git a/src/calibre/web/feeds/recipes/recipe_new_yorker.py b/src/calibre/web/feeds/recipes/recipe_new_yorker.py index a3c01df8e9..9a737b7aa5 100644 --- a/src/calibre/web/feeds/recipes/recipe_new_yorker.py +++ b/src/calibre/web/feeds/recipes/recipe_new_yorker.py @@ -7,13 +7,13 @@ newyorker.com ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag +from calibre.ebooks.BeautifulSoup import Tag class NewYorker(BasicNewsRecipe): title = 'The New Yorker' __author__ = 'Darko Miletic' - description = 'The best of US journalism' - oldest_article = 7 + description = 'The best of US journalism' + oldest_article = 15 language = _('English') max_articles_per_feed = 100 no_stylesheets = True @@ -21,14 +21,14 @@ class NewYorker(BasicNewsRecipe): publisher = 'Conde Nast Publications' category = 'news, politics, USA' encoding = 'cp1252' - + html2lrf_options = [ '--comment', description , '--category', category , '--publisher', publisher ] - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' + + html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' keep_only_tags = [dict(name='div', attrs={'id':'printbody'})] remove_tags_after = dict(name='div',attrs={'id':'articlebody'})