# calibre/recipes/smith.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class SmithsonianMagazine(BasicNewsRecipe):
    """calibre news recipe for Smithsonian Magazine.

    Articles are discovered through the FeedBurner feeds listed in
    ``feeds``; each downloaded page is then reduced to its
    ``<div id="article-body">`` element by :meth:`preprocess_html`.
    """

    title          = u'Smithsonian Magazine'
    language       = 'en'
    __author__     = 'Krittika Goyal'
    oldest_article = 31  # days
    max_articles_per_feed = 50
    # The feeds only link to the articles; the full text is fetched from
    # the linked pages.
    use_embedded_content = False
    #encoding = 'latin1'
    # Follow links one level deep from each article page, but only links
    # whose URL ends in "&page=2" .. "&page=9" (presumably the
    # continuation pages of a multi-page article -- confirm against the
    # live site).
    recursions = 1
    match_regexps = ['&page=[2-9]$']

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after  = dict(name='p', attrs={'id':'articlePaginationWrapper'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':'article_sidebar_border'}),
        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
        ##dict(name='ul', attrs={'class':'article-tools'}),
        dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
    ]

    # (section title, feed URL) pairs.
    feeds = [
        ('History and Archeology',
         'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
        ('People and Places',
         'http://feeds.feedburner.com/smithsonianmag/people-places'),
        ('Science and Nature',
         'http://feeds.feedburner.com/smithsonianmag/science-nature'),
        ('Arts and Culture',
         'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
        ('Travel',
         'http://feeds.feedburner.com/smithsonianmag/travel'),
    ]

    def preprocess_html(self, soup):
        """Reduce a downloaded page to just its article body.

        :param soup: the parsed page.
        :return: a new minimal document whose ``<body>`` contains only the
            page's ``<div id="article-body">`` element, or ``soup``
            unchanged when the page has no such element.
        """
        story = soup.find(name='div', attrs={'id':'article-body'})
        if story is None:
            # Page without the expected article container: there is
            # nothing to transplant and None cannot be inserted into the
            # new body, so keep the page as downloaded instead of failing
            # the whole article.
            return soup
        wrapper = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = wrapper.find(name='body')
        body.insert(0, story)
        return wrapper

    #def postprocess_html(self, soup, first):
        #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
        #if not first:
             #for div in soup.findAll(id='article-head'): div.extract()
        #return soup