Smithsonian Magazine by Krittika Goyal

2025-07-09 03:04:10 -04:00 · 2010-03-03 18:31:23 -07:00 · 2010-03-03 18:31:23 -07:00 · 556d8971d2
commit 556d8971d2
parent 0d0932a4e2
1 changed files with 52 additions and 0 deletions
--- a/resources/recipes/smith.recipe
+++ b/resources/recipes/smith.recipe
@ -0,0 +1,52 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 class SmithsonianMagazine(BasicNewsRecipe):
     """Calibre news recipe for Smithsonian Magazine.

     Articles are taken from the magazine's FeedBurner feeds. Every
     downloaded page is reduced to its ``<div id="article-left">``
     container, and links ending in ``&page=2`` .. ``&page=9`` are followed
     one level deep so that multi-page articles are fetched as well.
     """

     title = u'Smithsonian Magazine'
     language = 'en'
     __author__ = 'Krittika Goyal'
     oldest_article = 31  # days
     max_articles_per_feed = 50

     # Follow links one level deep, restricted to article pagination URLs.
     recursions = 1
     match_regexps = [r'&page=[2-9]$']

     remove_stylesheets = True
     remove_tags_after = dict(name='p', attrs={'id':'articlePaginationWrapper'})
     remove_tags = [
         dict(name='iframe'),
         dict(name='div', attrs={'class':'article_sidebar_border'}),
         dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large']}),
         dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
     ]

     # (section title, feed URL) pairs.
     feeds = [
         ('History and Archeology',
          'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
         ('People and Places',
          'http://feeds.feedburner.com/smithsonianmag/people-places'),
         ('Science and Nature',
          'http://feeds.feedburner.com/smithsonianmag/science-nature'),
         ('Arts and Culture',
          'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
         ('Travel',
          'http://feeds.feedburner.com/smithsonianmag/travel'),
     ]

     def preprocess_html(self, soup):
         """Reduce a downloaded page to its article container.

         Builds a fresh minimal document whose body holds only the
         ``<div id="article-left">`` element taken from ``soup`` and returns
         it. When the page contains no such element, ``soup`` is returned
         unchanged instead of failing on a missing node.
         """
         story = soup.find(name='div', attrs={'id':'article-left'})
         if story is None:
             # Unexpected page layout: keep the page as downloaded rather
             # than trying to insert None into the new document.
             return soup
         skeleton = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = skeleton.find(name='body')
         body.insert(0, story)
         return skeleton

     def postprocess_html(self, soup, first):
         """Strip pagination controls and repeated article headers.

         Every element with id ``articlePaginationWrapper`` is removed. When
         ``first`` is false (a page other than the first page of an article,
         per the calibre recipe API) elements with id ``article-head`` are
         removed too. Returns the same ``soup`` object, modified in place.
         """
         for pager in soup.findAll(id='articlePaginationWrapper'):
             pager.extract()
         if not first:
             for header in soup.findAll(id='article-head'):
                 header.extract()
         return soup