# calibre/recipes/scientific_american.recipe

#!/usr/bin/env  python
__license__   = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe

class ScientificAmerican(BasicNewsRecipe):
    """Recipe that builds feeds from the Scientific American magazine index page.

    ``parse_index`` scrapes the index page into a "Features" feed plus one
    feed per department; ``postprocess_html`` strips topic links from the
    downloaded article pages.
    """

    title                 = u'Scientific American'
    description           = u'Popular Science. Monthly magazine.'
    category              = 'science'
    __author__            = 'Starson17'
    no_stylesheets        = True
    use_embedded_content  = False
    language              = 'en'
    publisher             = 'Nature Publishing Group'
    remove_empty_feeds    = True
    remove_javascript     = True
    oldest_article        = 30
    max_articles_per_feed = 100

    # Metadata handed to the conversion step, mirroring the attributes above.
    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only these parts of an article page are kept.
    keep_only_tags = [
        dict(name='h2', attrs={'class':'articleTitle'}),
        dict(name='p', attrs={'id':'articleDek'}),
        dict(name='p', attrs={'class':'articleInfo'}),
        dict(name='div', attrs={'id':['articleContent']}),
        dict(name='img', attrs={'src':re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}),
    ]

    # Elements dropped from whatever keep_only_tags retained.
    remove_tags = [
        dict(name='a', attrs={'class':'tinyCommentCount'}),
        dict(name='div', attrs={'id':'bigCoverModule'}),
        dict(name='div', attrs={'class':'addInfo'}),
    ]

    def _make_article(self, link, description):
        """Build an article dict from an anchor tag that carries an ``href``."""
        return {
            'url': link['href'],
            'title': self.tag_to_string(link),
            'date': '',
            'description': description,
        }

    def parse_index(self):
        """Scrape the magazine index page into a list of feeds.

        Returns a list of ``(feed_title, articles)`` tuples, where each
        article is a dict with ``url``, ``title``, ``date`` and
        ``description`` keys.  The first feed is always "Features"; it is
        followed by one feed per department that has at least one article.

        As side effects, ``self.timefmt`` is set from the ``articleDek``
        paragraph and ``self.cover_url`` from the cover image, when present.
        Missing page sections are skipped instead of raising.
        """
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')

        issuetag = soup.find('p',attrs={'id':'articleDek'})
        self.timefmt = ' [%s]'%(self.tag_to_string(issuetag))

        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']

        feeds = []

        # --- Feature articles -------------------------------------------
        features = []
        wide = soup.find(attrs={'class':'doubleWide'})
        primary = None
        if wide is not None:
            primary = wide.find(attrs={'class':'primaryCol'})
        if primary is not None:
            for a in primary.findAll('a',attrs={'title':'Feature'}):
                if not a.get('href'):
                    # An anchor without a target cannot be downloaded.
                    continue
                # The anchor is nested inside primaryCol, which is itself
                # nested inside doubleWide, so two parent hops always exist.
                dek = a.parent.parent.find(attrs={'class':'dek'})
                features.append(self._make_article(a, self.tag_to_string(dek)))
        feeds.append(('Features', features))

        # --- Department articles ----------------------------------------
        department = []
        title = None
        secondary = soup.find(attrs={'class':'secondaryCol'})
        items = secondary.findAll('li') if secondary is not None else []
        for li in items:
            # A link to department.cfm starts a new department: flush the
            # one collected so far and begin a fresh list.
            heading = li.a
            if heading is not None and 'department.cfm' in heading.get('href', ''):
                if department:
                    feeds.append((title or 'Departments', department))
                title = self.tag_to_string(heading)
                department = []

            # Checked independently of the heading test above, so a single
            # <li> can both open a department and contribute an article.
            link = li.h3.a if li.h3 is not None else None
            if link is not None and 'article.cfm' in link.get('href', ''):
                department.append(
                    self._make_article(link, self.tag_to_string(li.p)))

        if department:
            # Articles seen before any department heading have no title yet;
            # give that feed a generic name rather than None.
            feeds.append((title or 'Departments', department))
        return feeds

    def postprocess_html(self, soup, first_fetch):
        """Replace links to topic pages with their plain text.

        Anchors without an ``href`` are ignored.  Returns the same ``soup``
        object, modified in place.
        """
        for item in soup.findAll('a', href=True):
            if 'topic.cfm' not in item['href']:
                continue
            text = item.string
            if text is None:
                # ``.string`` is None when the anchor does not hold a single
                # text node; fall back to its flattened text.
                text = self.tag_to_string(item)
            item.replaceWith(text)
        return soup

    extra_css = '''
                p{font-weight: normal; font-size:small}
                li{font-weight: normal; font-size:small}
                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''