# calibre/recipes/scmp.recipe

'''
scmp.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a string of class names separated by single spaces.
    The result is a dict of the form ``{'attrs': {'class': predicate}}``.
    The predicate takes an element's ``class`` attribute value and returns
    the (possibly empty) frozenset of names it shares with ``classes``;
    a falsy value (``None`` or ``''``) is returned unchanged.
    """
    wanted = frozenset(classes.split(' '))

    def shared_names(value):
        # Elements without a class attribute (or with an empty one) can
        # never match; hand the falsy value straight back.
        if not value:
            return value
        return frozenset(value.split()) & wanted

    return {'attrs': {'class': shared_names}}

class SCMP(BasicNewsRecipe):
    """Recipe that builds a South China Morning Post issue from scmp.com RSS feeds.

    Login is optional: when both a username and a password have been
    supplied, ``get_browser`` submits them through the site's login form
    before returning the browser used for downloading.
    """

    # --- Publication metadata -------------------------------------------
    title = 'South China Morning Post'
    __author__ = 'llam'
    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
    publisher = 'South China Morning Post Publishers Ltd.'
    language = 'en_CN'
    publication_type = 'newspaper'

    # --- Download behaviour (see the BasicNewsRecipe API docs for the
    # exact meaning and units of each setting) ---------------------------
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    # Credentials may be supplied but are not required; get_browser only
    # logs in when both are present.
    needs_subscription = 'optional'

    # --- Article clean-up -----------------------------------------------
    # Keep the headline, the listed article/gallery/author containers and
    # the elements carrying a dateCreated/dateModified itemprop.
    keep_only_tags = [
        dict(name='h1'),
        classes('field-name-field-subheading scmp-gallery-swiper pane-node-body field-name-field-authors'),
        dict(itemprop='dateCreated dateModified'.split()),
    ]
    remove_tags = [
        dict(name='button')
    ]

    def get_browser(self, *args, **kwargs):
        """Return the download browser, logged in when credentials are set.

        Any positional or keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser`` so this override stays compatible
        with callers that pass options to the base implementation.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        # Uncomment to trace the HTTP exchange while debugging the login:
        # br.set_debug_http(True)
        # br.set_debug_responses(True)
        # br.set_debug_redirects(True)
        if self.username is not None and self.password is not None:
            br.open('https://www.scmp.com/user/login')
            # The login form is taken to be the first form on the page.
            br.select_form(nr=0)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Business', u'http://www.scmp.com/rss/business.xml'),
        (u'Hong Kong', u'http://www.scmp.com/rss/hong_kong.xml'),
        (u'China', u'http://www.scmp.com/rss/china.xml'),
        (u'Asia & World', u'http://www.scmp.com/rss/news_asia_world.xml'),
        (u'Opinion', u'http://www.scmp.com/rss/opinion.xml'),
        (u'LifeSTYLE', u'http://www.scmp.com/rss/lifestyle.xml'),
        (u'Sport', u'http://www.scmp.com/rss/sport.xml')
    ]

    def preprocess_html(self, soup):
        """Copy each image's ``data-original`` value into its ``src``.

        Only ``<img>`` tags that have a ``data-original`` attribute are
        touched (presumably lazy-loaded images whose real URL lives in
        that attribute -- confirm against the live markup). The same
        ``soup`` object is modified in place and returned.
        """
        for img in soup.findAll('img', attrs={'data-original': True}):
            img['src'] = img['data-original']
        return soup