Sync to trunk

John Schember 2009-06-13 09:09:12 -04:00
commit 66b547979a
4 changed files with 111 additions and 7 deletions

View File

@@ -47,7 +47,7 @@ recipe_modules = ['recipe_' + r for r in (
     'climate_progress', 'carta', 'slashdot', 'publico',
     'the_budget_fashionista', 'elperiodico_catalan',
     'elperiodico_spanish', 'expansion_spanish', 'lavanguardia',
-    'marca',
+    'marca', 'kellog_faculty', 'kellog_insight',
 )]
 
 import re, imp, inspect, time, os
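
The first hunk only registers the two new recipes with the builtin recipe list. For context, a minimal sketch of how a registry like this is typically consumed: each 'recipe_*' name is imported as a module and scanned for BasicNewsRecipe subclasses. The package path and helper name below are assumptions for illustration, not calibre's actual loader code (which uses the imp/inspect imports visible above):

    # Sketch only: assumes each entry in recipe_modules is importable from the
    # recipes package; calibre's real loader differs in the details.
    import importlib, inspect
    from calibre.web.feeds.news import BasicNewsRecipe

    def load_recipe_classes(recipe_modules):
        classes = []
        for name in recipe_modules:  # e.g. 'recipe_kellog_faculty'
            mod = importlib.import_module('calibre.web.feeds.recipes.' + name)
            for _, obj in inspect.getmembers(mod, inspect.isclass):
                if issubclass(obj, BasicNewsRecipe) and obj is not BasicNewsRecipe:
                    classes.append(obj)
        return classes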

View File

@@ -0,0 +1,69 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class KellogFaculty(BasicNewsRecipe):

    title = 'Kellog Faculty Blogs'
    __author__ = 'Kovid Goyal'
    description = 'Blogs of the Kellog School of Management Faculty'
    no_stylesheets = True
    encoding = 'utf-8'
    language = _('English')
    remove_tags_before = {'name':'h2'}
    remove_tags_after = {'class':'col-two-text'}

    def parse_index(self):
        soup = self.index_to_soup('http://www.kellogg.northwestern.edu/Faculty/Blogroll.aspx')
        feeds, articles = [], []
        feed_title = None
        main = soup.find(id='bodyCopy')
        for tag in main.findAll(['h3', 'div']):
            if tag.name == 'h3':
                title = self.tag_to_string(tag).capitalize()
                a = tag.find('a', href=True)
                if articles and feed_title:
                    feeds.append((feed_title, articles))
                    articles = []
                # Keep only blogs hosted on the Kellog servers
                feed_title = title if a and 'insight.kellog' in a['href'] else None
            elif tag.name == 'div' and tag.get('class', '') == 'rssfeed':
                # The embedded script emits document.write('...'); strip the
                # wrapper and eval the remaining string literal to get the markup
                script = tag.find('script', src=True)
                text = \
                    self.browser.open(script['src']).read().replace('document.write(',
                            '')[:-2]
                text = eval(text)
                asoup = BeautifulSoup(text)
                for tag in asoup.findAll('div',
                        attrs={'class':'rssincl-entry'}):
                    title = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemtitle'}))
                    try:
                        desc = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemdesc'}))
                    except:
                        desc = ''
                    url = tag.find('a', href=True)['href']
                    articles.append({
                        'title':title.strip(), 'url':url, 'description':desc.strip(), 'date':''
                    })
        return feeds

    def postprocess_html(self, soup, first_fetch):
        # Drop inline styles and stray content from the fetched page's head
        for tag in soup.findAll(style=True):
            del tag['style']
        head = soup.find('head')
        if head is not None:
            for p in head.findAll('p'): p.extract()
            for meta in soup.findAll('meta', attrs={'name':'description'}): meta.extract()
            for t in head.findAll(text=True): t.extract()
        return soup
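
For reference, parse_index above has to hand back a list of (feed title, list of article dicts) tuples, which is exactly what the loop builds. A hand-written example of the expected shape, with made-up values:

    # Placeholder data showing the structure parse_index() returns;
    # the feed title, article title and URL here are invented.
    feeds = [
        ('Some faculty blog', [
            {'title': 'First post',
             'url': 'http://insight.kellogg.northwestern.edu/example',
             'description': 'Summary text pulled from the RSS include.',
             'date': ''},
        ]),
    ]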

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

class KellogInsight(BasicNewsRecipe):

    title = 'Kellog Insight'
    __author__ = 'Kovid Goyal'
    description = 'Articles from the Kellog School of Management'
    no_stylesheets = True
    encoding = 'utf-8'
    language = _('English')
    oldest_article = 60

    remove_tags_before = {'name':'h1'}
    remove_tags_after = {'class':'col-two-text'}

    feeds = [('Articles',
        'http://insight.kellogg.northwestern.edu/index.php/Kellogg/RSS')]

    def get_article_url(self, article):
        # Keep only article links, not blog links
        link = BasicNewsRecipe.get_article_url(self, article)
        if link and '/article/' in link:
            return link
        self.log('Skipping non-article', link)
        return None
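
The '/article/' filter above is easy to sanity-check on its own; a small standalone sketch with invented URLs (not real feed entries):

    # Standalone check of the '/article/' filter used in get_article_url;
    # both URLs below are made up for illustration.
    def keep_article_links(links):
        return [l for l in links if l and '/article/' in l]

    sample = [
        'http://insight.kellogg.northwestern.edu/index.php/Kellogg/article/some_piece',
        'http://insight.kellogg.northwestern.edu/index.php/Kellogg/blogs/some_post',
    ]
    print(keep_article_links(sample))  # only the /article/ link is kept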

View File

@@ -7,13 +7,13 @@ newyorker.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import Tag
 
 class NewYorker(BasicNewsRecipe):
     title = 'The New Yorker'
     __author__ = 'Darko Miletic'
     description = 'The best of US journalism'
-    oldest_article = 7
+    oldest_article = 15
     language = _('English')
     max_articles_per_feed = 100
     no_stylesheets = True
@@ -21,14 +21,14 @@ class NewYorker(BasicNewsRecipe):
     publisher = 'Conde Nast Publications'
     category = 'news, politics, USA'
     encoding = 'cp1252'
 
     html2lrf_options = [
                           '--comment', description
                         , '--category', category
                         , '--publisher', publisher
                         ]
 
     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
 
     keep_only_tags = [dict(name='div', attrs={'id':'printbody'})]
     remove_tags_after = dict(name='div',attrs={'id':'articlebody'})