# calibre/resources/recipes/new_scientist.recipe


__license__   = 'GPL v3'
__copyright__ = '2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
'''
newscientist.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class NewScientist(BasicNewsRecipe):
    """Recipe for the New Scientist online news feeds.

    Article links are taken from each feed entry's ``guid`` and get the
    ``?full=true&print=true`` query string appended before download.
    """

    # --- Publication metadata -------------------------------------------
    title = 'New Scientist - Online News'
    __author__ = 'Darko Miletic'
    description = 'Science news and science articles from New Scientist.'
    language = 'en'
    publisher = 'New Scientist'
    category = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'

    # --- Fetch settings --------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    cover_url = 'http://www.newscientist.com/currentcover.jpg'
    masthead_url = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding = 'utf-8'
    extra_css = ' body{font-family: Arial,sans-serif} img{margin-bottom: 0.8em} '

    # Reuses the metadata declared above, so this must come after it.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Replaces everything between </title> and </head> (case-insensitive,
    # spanning newlines) with nothing, leaving only the title in the head.
    preprocess_regexps = [
        (
            re.compile(r'</title>.*?</head>', re.DOTALL | re.IGNORECASE),
            lambda match: '</title></head>',
        ),
    ]

    # --- Markup selection ------------------------------------------------
    keep_only_tags = [
        dict(
            name='div',
            attrs={'id': ['pgtop', 'maincol', 'blgmaincol', 'nsblgposts', 'hldgalcols']},
        ),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['hldBd', 'adline', 'pnl', 'infotext']}),
        dict(name='div', attrs={'id': ['compnl', 'artIssueInfo', 'artTools', 'comments', 'blgsocial']}),
        dict(name='p', attrs={'class': ['marker', 'infotext']}),
        dict(name='meta', attrs={'name': 'description'}),
        dict(name='a', attrs={'rel': 'tag'}),
    ]
    remove_tags_after = dict(attrs={'class': ['nbpcopy', 'comments']})
    remove_attributes = ['height', 'width']

    # --- Feeds: (section title, feed URL) --------------------------------
    feeds = [
        (u'Latest Headlines', u'http://feeds.newscientist.com/science-news'),
        (u'Magazine', u'http://www.newscientist.com/feed/magazine'),
        (u'Health', u'http://www.newscientist.com/feed/view?id=2&type=channel'),
        (u'Life', u'http://www.newscientist.com/feed/view?id=3&type=channel'),
        (u'Space', u'http://www.newscientist.com/feed/view?id=6&type=channel'),
        (u'Physics and Mathematics', u'http://www.newscientist.com/feed/view?id=4&type=channel'),
        (u'Environment', u'http://www.newscientist.com/feed/view?id=1&type=channel'),
        (u'Science in Society', u'http://www.newscientist.com/feed/view?id=5&type=channel'),
        (u'Tech', u'http://www.newscientist.com/feed/view?id=7&type=channel'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` value, or None when it has none."""
        guid = article.get('guid', None)
        return guid

    def print_version(self, url):
        """Return ``url`` with the ``?full=true&print=true`` query appended."""
        query = '?full=true&print=true'
        return url + query

    def preprocess_html(self, soup):
        """Drop the container of the first 'Home' link, then adeify images.

        Only the first anchor whose string is exactly 'Home' is handled;
        its parent element is removed from the tree. The soup is then
        passed through ``self.adeify_images`` and that result is returned.
        """
        for anchor in soup.findAll('a'):
            if anchor.string == 'Home':
                anchor.parent.extract()
                # Stop after the first match; later 'Home' links are kept.
                break
        return self.adeify_images(soup)