# calibre/recipes/chr_mon.recipe

__license__ = 'GPL v3'
__copyright__ = '2012, Darko Miletic <darko.miletic at gmail.com>'
'''
www.csmonitor.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class CSMonitor(BasicNewsRecipe):
    '''
    Recipe for the daily edition of The Christian Science Monitor.

    Articles are taken from the RSS feeds listed in ``feeds``. Stories that
    are split over several web pages are stitched into a single document by
    ``append_page`` and the resulting markup is simplified in
    ``preprocess_html``.
    '''

    title = 'The Christian Science Monitor - daily'
    __author__ = 'Darko Miletic'
    description = 'The Christian Science Monitor is an international news organization that delivers thoughtful, global coverage via its website, weekly magazine, daily news briefing, and email newsletters.'  # noqa
    publisher = 'The Christian Science Monitor'
    category = 'news, politics, USA'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    masthead_url = 'http://www.csmonitor.com/extension/csm_base/design/csm_design/images/csmlogo_179x46.gif'
    extra_css             = """
                               body{font-family: Arial,Tahoma,Verdana,Helvetica,sans-serif }
                               img{margin-bottom: 0.4em; display:block}
                               .head {font-family: Georgia,"Times New Roman",Times,serif}
                               .sByline,.caption{font-size: x-small}
                               .hide{display: none}
                               .sLoc{font-weight: bold}
                               ul{list-style-type: none}
                            """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Class matchers shared by the tag filters and by append_page. Each one
    # matches the name only as a complete, space-delimited token of the
    # ``class`` attribute (start-or-space before it, end-or-space after it),
    # so a longer class name that merely contains it is not selected.
    _body_class = re.compile(r'(^| )sBody($| )')
    _related_class = re.compile(r'(^| )podStoryRel($| )')

    # Elements dropped from every article.
    remove_tags = [
        dict(name=['meta', 'link', 'iframe', 'object', 'embed']),
        dict(attrs={'class': _related_class}),
        dict(attrs={'class': ['bottom-rel', 'hide']}),
        dict(attrs={'id': ['pgallerycarousel_enlarge', 'pgallerycarousel_related']}),
    ]
    # Elements that make up the article: headline, sub-headline, byline,
    # photo, body header and the body text itself.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'head'}),
        dict(name='h2', attrs={'class': 'subhead'}),
        dict(attrs={'class': ['sByline', 'thePhoto', 'ui-body-header']}),
        dict(attrs={'class': _body_class}),
    ]
    remove_attributes = ['xmlns:fb']

    feeds = [
        (u'USA', u'http://rss.csmonitor.com/feeds/usa'),
        (u'World', u'http://rss.csmonitor.com/feeds/world'),
        (u'Politics', u'http://rss.csmonitor.com/feeds/politics'),
        (u'Business', u'http://rss.csmonitor.com/feeds/wam'),
        (u'Commentary', u'http://rss.csmonitor.com/feeds/commentary'),
        (u'Books', u'http://rss.csmonitor.com/feeds/books'),
        (u'Arts', u'http://rss.csmonitor.com/feeds/arts'),
        (u'Environment', u'http://rss.csmonitor.com/feeds/environment'),
        (u'Innovation', u'http://rss.csmonitor.com/feeds/scitech'),
        (u'Living', u'http://rss.csmonitor.com/feeds/living'),
        (u'Science', u'http://rss.csmonitor.com/feeds/science'),
        (u'The Culture', u'http://rss.csmonitor.com/feeds/theculture'),
        (u'The Home Forum', u'http://rss.csmonitor.com/feeds/homeforum'),
        (u'Articles', u'http://rss.csmonitor.com/feeds/csarticles'),
    ]

    def append_page(self, soup):
        '''
        Merge the following pages of a paginated article into ``soup``.

        Looks for the ``navigation`` div and its ``next-button`` link,
        downloads the linked page, strips related-story clutter from that
        page's body, recursively merges any further pages into it, and
        finally moves the body to the end of the body container of ``soup``.

        ``soup`` is modified in place; nothing is returned. When any of the
        required elements (pager, next link, link target, either body
        container) is missing, the method returns without merging anything.
        '''
        pager = soup.find('div', attrs={'class': 'navigation'})
        if not pager:
            return
        nexttag = pager.find(attrs={'id': 'next-button'})
        if not nexttag:
            return
        # A next button without a target gives us nothing to fetch.
        href = nexttag.get('href')
        if not href:
            return

        next_soup = self.index_to_soup('http://www.csmonitor.com' + href)
        next_body = next_soup.find(attrs={'class': self._body_class})
        if not next_body:
            return
        # Check for the destination before touching either tree, so a page
        # without a body container is left intact instead of half-edited.
        body = soup.find(attrs={'class': self._body_class})
        if not body:
            return

        clutter = next_body.findAll(
            attrs={'class': [self._related_class, 'bottom-rel', 'hide']})
        for node in clutter:
            node.extract()
        # Pull pages 3..n into the next page first, so that they travel
        # along when its body is moved below.
        self.append_page(next_soup)
        next_body.extract()
        pager.extract()
        body.append(next_body)

    def preprocess_html(self, soup):
        '''
        Clean up a downloaded article before conversion.

        Merges continuation pages, removes the pager, unwraps hyperlinks
        (text-only links become plain text, links around an image become a
        bare ``div``) and drops ``scorecardresearch`` images.

        Returns the same ``soup`` object, modified in place.
        '''
        self.append_page(soup)
        pager = soup.find('div', attrs={'class': 'navigation'})
        if pager:
            pager.extract()

        for link in soup.findAll('a'):
            if link.string is not None:
                # The link holds nothing but a single string: keep the text.
                link.replaceWith(link.string)
            elif link.find('img'):
                # Keep the image but neutralise the anchor around it.
                link.name = 'div'
                # NOTE(review): an empty list suits BeautifulSoup 3, where
                # attrs is a list of pairs; bs4 keeps attrs in a dict.
                # Confirm which parser this recipe runs under.
                link.attrs = []
            else:
                link.replaceWith(self.tag_to_string(link))

        for img in soup.findAll('img', src=True):
            if 'scorecardresearch' in img['src']:
                img.extract()
        return soup