# Origin: commit 556d8971d2246c9661138907b962f3cc42178ebf
# Author: Kovid Goyal, Wed, 3 Mar 2010 18:31:23 -0700
# Subject: Smithsonian Magazine by Krittika Goyal
# Added as resources/recipes/smith.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class SmithsonianMagazine(BasicNewsRecipe):
    """Recipe that builds an e-book from the Smithsonian Magazine feeds.

    Each article is reduced to its ``div#article-left`` container, and
    continuation pages (``&page=2`` .. ``&page=9``) are followed one link
    level deep.
    """

    title = u'Smithsonian Magazine'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 50
    #encoding = 'latin1'

    # Follow links one level deep, but only those whose URL ends in a
    # page number between 2 and 9.
    recursions = 1
    match_regexps = ['&page=[2-9]$']

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after = dict(name='p', attrs={'id': 'articlePaginationWrapper'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': 'article_sidebar_border'}),
        dict(name='div', attrs={'id': ['article_sidebar_border',
                                       'most-popular_large']}),
        #dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'class': 'cat-breadcrumb col three last'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        ('History and Archeology',
         'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
        ('People and Places',
         'http://feeds.feedburner.com/smithsonianmag/people-places'),
        ('Science and Nature',
         'http://feeds.feedburner.com/smithsonianmag/science-nature'),
        ('Arts and Culture',
         'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
        ('Travel',
         'http://feeds.feedburner.com/smithsonianmag/travel'),
    ]

    def preprocess_html(self, soup):
        """Return a minimal document containing only the article body.

        :param soup: parsed page as downloaded.
        :return: a fresh soup whose ``<body>`` holds the page's
            ``div#article-left`` element, or ``soup`` unchanged when the
            page has no such element.
        """
        story = soup.find(name='div', attrs={'id': 'article-left'})
        if story is None:
            # Unexpected page layout: keep the page as-is rather than
            # failing while trying to insert None into the new document.
            return soup

        # The skeleton must contain a <body> element; otherwise the
        # lookup below returns None and the insert raises AttributeError.
        skeleton = BeautifulSoup(
            '<html><head><title>t</title></head><body></body></html>')
        body = skeleton.find(name='body')
        body.insert(0, story)
        return skeleton

    def postprocess_html(self, soup, first):
        """Strip pagination controls and repeated article headers.

        :param soup: parsed page after download processing.
        :param first: true for the first page of an article; when false,
            every element with id ``article-head`` is removed as well.
        :return: the same ``soup`` object, modified in place.
        """
        for pager in soup.findAll(id='articlePaginationWrapper'):
            pager.extract()
        if not first:
            for head in soup.findAll(id='article-head'):
                head.extract()
        return soup