# calibre/recipes/chr_mon.recipe

__license__   = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.csmonitor.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class CSMonitor(BasicNewsRecipe):
    """Recipe that builds a daily edition from the csmonitor.com RSS feeds.

    Besides the declarative settings (feeds, tags to keep/remove, CSS), the
    recipe stitches multi-page articles together (``append_page``) and
    flattens links / tidies images (``preprocess_html``).
    """

    title                 = 'The Christian Science Monitor - daily'
    __author__            = 'Darko Miletic'
    description           = 'The Christian Science Monitor is an international news organization that delivers thoughtful, global coverage via its website, weekly magazine, daily news briefing, and email newsletters.'
    publisher             = 'The Christian Science Monitor'
    category              = 'news, politics, USA'
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'en'
    remove_empty_feeds    = True
    publication_type      = 'newspaper'
    masthead_url          = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
    extra_css             = """
                               body{font-family: Arial,Tahoma,Verdana,Helvetica,sans-serif }
                               img{margin-bottom: 0.4em; display:block}
                               .head {font-family: Georgia,"Times New Roman",Times,serif}
                               .sByline,.caption{font-size: x-small}
                               .hide{display: none}
                               .sLoc{font-weight: bold}
                               ul{list-style-type: none}
                            """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Match a class name as a whole, space-delimited token of the ``class``
    # attribute.  The earlier patterns were written as '(^|| )name($|| )';
    # the doubled '|' adds an empty alternative, so both boundary groups could
    # match nothing and the pattern degenerated into a substring search.
    _pod_story_rel_class = re.compile(r'(^| )podStoryRel($| )')
    _sbody_class = re.compile(r'(^| )sBody($| )')

    # Base address prepended to site-relative pager links.
    _site_root = 'http://www.csmonitor.com'

    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'object', 'embed']),
        dict(attrs={'class': _pod_story_rel_class}),
        dict(attrs={'class': ['bottom-rel', 'hide']}),
        dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']}),
    ]
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'head'}),
        dict(name='h2', attrs={'class': 'subhead'}),
        dict(attrs={'class': ['sByline', 'thePhoto', 'ui-body-header']}),
        dict(attrs={'class': _sbody_class}),
    ]
    remove_attributes = ['xmlns:fb']

    feeds = [
        (u'USA'           , u'http://rss.csmonitor.com/feeds/usa'        ),
        (u'World'         , u'http://rss.csmonitor.com/feeds/world'      ),
        (u'Politics'      , u'http://rss.csmonitor.com/feeds/politics'   ),
        (u'Business'      , u'http://rss.csmonitor.com/feeds/wam'        ),
        (u'Commentary'    , u'http://rss.csmonitor.com/feeds/commentary' ),
        (u'Books'         , u'http://rss.csmonitor.com/feeds/books'      ),
        (u'Arts'          , u'http://rss.csmonitor.com/feeds/arts'       ),
        (u'Environment'   , u'http://rss.csmonitor.com/feeds/environment'),
        (u'Innovation'    , u'http://rss.csmonitor.com/feeds/scitech'    ),
        (u'Living'        , u'http://rss.csmonitor.com/feeds/living'     ),
        (u'Science'       , u'http://rss.csmonitor.com/feeds/science'    ),
        (u'The Culture'   , u'http://rss.csmonitor.com/feeds/theculture' ),
        (u'The Home Forum', u'http://rss.csmonitor.com/feeds/homeforum'  ),
        (u'Articles'      , u'http://rss.csmonitor.com/feeds/csarticles' ),
    ]

    def append_page(self, soup):
        """Merge the following pages of a multi-page article into *soup*.

        Looks for a ``div.navigation`` pager containing an element with id
        ``next-button``, downloads the page it links to, strips the related
        story / hidden blocks from that page's ``sBody`` container, recurses
        so that any further pages are folded in first, and finally appends
        the downloaded body to the ``sBody`` container of *soup*.  The pager
        of *soup* is removed once the next page has been merged.

        The method modifies *soup* in place and returns ``None``.  It does
        nothing when there is no pager, no usable next link, or when either
        page lacks an ``sBody`` container.
        """
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager is None:
            return
        nexttag = pager.find(attrs={'id': 'next-button'})
        if nexttag is None:
            return
        # A next button without a link target cannot be followed.
        href = nexttag.get('href')
        if not href:
            return
        # Only prefix the host for site-relative links; an absolute link
        # would otherwise end up with the host prepended a second time.
        if href.startswith(('http://', 'https://')):
            nurl = href
        else:
            nurl = self._site_root + href

        soup2 = self.index_to_soup(nurl)
        texttag = soup2.find(attrs={'class': self._sbody_class})
        if texttag is None:
            return
        # Without a body container in the current page there is nowhere to
        # attach the next page, so stop before fetching any further pages.
        appendtag = soup.find(attrs={'class': self._sbody_class})
        if appendtag is None:
            return

        # The pattern is queried separately from the plain class names so it
        # is applied as a regular expression and does not depend on how the
        # parser treats a pattern placed inside a list of strings.
        for citem in texttag.findAll(attrs={'class': self._pod_story_rel_class}):
            citem.extract()
        for citem in texttag.findAll(attrs={'class': ['bottom-rel', 'hide']}):
            citem.extract()

        # Fold pages 3..n into page 2 before page 2 is moved into this page.
        self.append_page(soup2)
        texttag.extract()
        pager.extract()
        appendtag.append(texttag)

    def preprocess_html(self, soup):
        """Assemble the full article and simplify its links and images.

        Steps, all performed in place on *soup*:

        * append the follow-up pages via ``append_page``;
        * drop any remaining ``div.navigation`` pager;
        * replace every ``<a>`` by its text, except anchors that wrap an
          image, which are turned into attribute-less ``<div>`` elements;
        * remove ``<img>`` elements whose source mentions
          ``scorecardresearch`` and give the other images a default ``alt``
          text when they have none.

        Returns the modified *soup*.
        """
        self.append_page(soup)
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager is not None:
            pager.extract()

        for item in soup.findAll('a'):
            if item.string is not None:
                # Anchor with a single text child: keep just the text.
                item.replaceWith(item.string)
            elif item.find('img') is not None:
                # Keep the wrapped image but stop the element being a link.
                item.name = 'div'
                # Remove attributes one by one; this works whether ``attrs``
                # is a list of (name, value) pairs or a mapping.
                for attr_name in list(dict(item.attrs)):
                    del item[attr_name]
            else:
                item.replaceWith(self.tag_to_string(item))

        for item in soup.findAll('img'):
            # ``get`` avoids a KeyError on images without a ``src``.
            if 'scorecardresearch' in (item.get('src') or ''):
                item.extract()
            elif item.get('alt') is None:
                item['alt'] = 'image'
        return soup