calibre/recipes/new_scientist.recipe

##
## Title:        New Scientist RSS recipe
## Contact:      AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright:    2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## Written:      2008
## Last Edited:  Jan 2012
##

'''
01-19-2012: Added grayscale image conversion and duplicate article removal
'''

# Module metadata: license, copyright holders, recipe version, date and author.
__license__   = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
__version__     = 'v0.5.0'
__date__        = '2012-01-19'
__author__      = 'Darko Miletic'

'''
newscientist.com
'''

import re
import urllib
from calibre.utils.magick import Image
from calibre.web.feeds.news import BasicNewsRecipe

class NewScientist(BasicNewsRecipe):
    """Recipe that downloads New Scientist news from the site's RSS feeds.

    Logging in is optional: credentials are only submitted when both a
    username and a password have been supplied. Articles are fetched through
    their print view and cleaned up before conversion.
    """

    title                 = 'New Scientist - Online News w. subscription'
    description           = 'Science news and science articles from New Scientist.'
    language              = 'en'
    publisher             = 'Reed Business Information Ltd.'
    category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    cover_url             = 'http://www.newscientist.com/currentcover.jpg'
    masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding              = 'utf-8'
    needs_subscription    = 'optional'
    extra_css             = """
                                 body{font-family: Arial,sans-serif}
                                 img{margin-bottom: 0.8em; display: block}
                                 .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em}
                            """

    # Metadata handed to the conversion step; mirrors the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Drop everything between </title> and </head> before the page is parsed.
    preprocess_regexps = [
        (
            re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),
            lambda match: '</title></head>',
        ),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': ['pgtop', 'maincol', 'blgmaincol', 'nsblgposts', 'hldgalcols']}),
    ]

    # Whether to omit duplicates of articles (typically arising when articles are
    # indexed in more than one section). If True, only the first occurrence will
    # be downloaded.
    filterDuplicates = True

    # Whether to convert images to grayscale for eInk readers.
    Convert_Grayscale = False

    # URLs already handed out by print_version(). This class-level list is only
    # a fallback; __init__ gives every instance its own list so that duplicate
    # filtering state is not shared between recipe instances.
    url_list = []

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe and start with an empty seen-URL list.

        All arguments are passed through unchanged to BasicNewsRecipe.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        self.url_list = []

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        The home page is always opened first. The login form is requested and
        then posted only when both username and password are set.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.newscientist.com/')
        if self.username is not None and self.password is not None:
            login_url = 'https://www.newscientist.com/user/login'
            br.open(login_url)
            data = urllib.urlencode({
                'source': 'form',
                'redirectURL': '',
                'loginId': self.username,
                'password': self.password,
            })
            br.open(login_url, data)
        return br

    remove_tags = [
        dict(name='div', attrs={'class': ['hldBd', 'adline', 'pnl', 'infotext']}),
        dict(name='div', attrs={'id': ['compnl', 'artIssueInfo', 'artTools', 'comments', 'blgsocial', 'sharebtns']}),
        dict(name='p', attrs={'class': ['marker', 'infotext']}),
        dict(name='meta', attrs={'name': 'description'}),
        dict(name='a', attrs={'rel': 'tag'}),
        dict(name='ul', attrs={'class': 'markerlist'}),
        dict(name=['link', 'base', 'meta', 'iframe', 'object', 'embed']),
    ]
    remove_tags_after = dict(attrs={'class': ['nbpcopy', 'comments']})
    remove_attributes = ['height', 'width', 'lang', 'onclick']

    # (section title, feed URL) pairs.
    feeds = [
        (u'Latest Headlines', u'http://feeds.newscientist.com/science-news'),
        (u'Magazine', u'http://feeds.newscientist.com/magazine'),
        (u'Health', u'http://feeds.newscientist.com/health'),
        (u'Life', u'http://feeds.newscientist.com/life'),
        (u'Space', u'http://feeds.newscientist.com/space'),
        (u'Physics and Mathematics', u'http://feeds.newscientist.com/physics-math'),
        (u'Environment', u'http://feeds.newscientist.com/environment'),
        (u'Science in Society', u'http://feeds.newscientist.com/science-in-society'),
        (u'Tech', u'http://feeds.newscientist.com/tech'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's 'guid' value, or None when it has none."""
        return article.get('guid', None)

    def print_version(self, url):
        """Return the print-view URL for ``url``.

        When filterDuplicates is on and ``url`` was already seen, None is
        returned instead so the article is not fetched a second time.
        Every URL that is not rejected is recorded in ``url_list``.
        """
        if self.filterDuplicates and url in self.url_list:
            return None
        self.url_list.append(url)
        return url + '?full=true&print=true'

    def preprocess_html(self, soup):
        """Simplify the parsed page and return the same soup object.

        Removes the id of <html>, strips inline styles, turns <quote> and
        <quotetext> into paragraphs, and flattens <xref>, <figref> and <a>
        tags to their plain text. The parent of a link whose text is 'Home'
        is removed entirely.
        """
        html = soup.html
        # Guard against documents without an <html> element.
        if html is not None and html.has_key('id'):
            del html['id']
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(['quote', 'quotetext']):
            item.name = 'p'
        for item in soup.findAll(['xref', 'figref']):
            text = item.string
            # .string is None unless the tag wraps a single text node
            # (BeautifulSoup 3 behaviour); replacing a tag with None fails,
            # so such tags are left untouched, as is done for links below.
            if text is not None:
                item.replaceWith(text)
        for tg in soup.findAll('a'):
            if tg.string == 'Home':
                tg.parent.extract()
            elif tg.string is not None:
                tg.replaceWith(tg.string)
        return soup

    # Converts images to Gray Scale
    def postprocess_html(self, soup, first):
        """Optionally rewrite the article's images as grayscale.

        Does nothing unless Convert_Grayscale is true. Each <img> that has a
        'src' attribute is opened from that path, switched to grayscale and
        saved back to the same path. ``first`` is not used. Returns ``soup``.
        """
        if self.Convert_Grayscale:
            # process all the images
            for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                # NOTE(review): comparing the Image object with 0 looks like a
                # leftover from a return-code style API - confirm whether this
                # can ever be true.
                if img < 0:
                    raise RuntimeError('Out of memory')
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup