Smithsonian Magazine by Krittika Goyal

2025-07-09 03:04:10 -04:00 · 2010-03-03 18:31:23 -07:00 · 2010-03-03 18:31:23 -07:00 · 556d8971d2
commit 556d8971d2
parent 0d0932a4e2
1 changed files with 52 additions and 0 deletions
--- a/resources/recipes/smith.recipe
+++ b/resources/recipes/smith.recipe
@ -0,0 +1,52 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class SmithsonianMagazine(BasicNewsRecipe):
+    title          = u'Smithsonian Magazine'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 31#days
+    max_articles_per_feed = 50
+    #encoding = 'latin1'
+    recursions = 1
+    match_regexps = ['&page=[2-9]$']
+
+    remove_stylesheets = True
+    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
+    remove_tags_after  = dict(name='p', attrs={'id':'articlePaginationWrapper'})
+    remove_tags = [
+       dict(name='iframe'),
+       dict(name='div', attrs={'class':'article_sidebar_border'}),
+       dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
+       #dict(name='ul', attrs={'class':'article-tools'}),
+       dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
+    ]
+
+
+    feeds          = [
+('History and Archeology',
+ 'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
+('People and Places',
+ 'http://feeds.feedburner.com/smithsonianmag/people-places'),
+('Science and Nature',
+ 'http://feeds.feedburner.com/smithsonianmag/science-nature'),
+('Arts and Culture',
+ 'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
+('Travel',
+ 'http://feeds.feedburner.com/smithsonianmag/travel'),
+]
+
+    def preprocess_html(self, soup):
+        story = soup.find(name='div', attrs={'id':'article-left'})
+        #td = heading.findParent(name='td')
+        #td.extract()
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup
+
+    def postprocess_html(self, soup, first):
+        for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
+        if not first:
+             for div in soup.findAll(id='article-head'): div.extract()
+        return soup