# calibre/recipes/scmp.recipe

'''
scmp.com
'''

from mechanize import Request
import json
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag


def classes(classes):
    '''
    Build keyword arguments for a soup search that match on CSS class.

    ``classes`` is a space-separated string of class names. The returned
    dict has a single ``attrs`` key whose ``class`` entry is a predicate:
    given a tag's class attribute value it returns the (possibly empty)
    frozenset of names shared with ``classes``, or the value itself when
    that value is falsy (``None`` or an empty string).
    '''
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # Hand falsy values straight back so a missing class attribute
        # yields exactly what the caller passed in.
        if not value:
            return value
        present = frozenset(value.split())
        return present.intersection(wanted)

    return {'attrs': {'class': shares_wanted_class}}


def new_tag(soup, name, attrs=()):
    '''
    Create a new tag called ``name`` for ``soup``.

    When ``soup`` exposes its own ``new_tag`` method, that method is called
    with ``attrs`` converted to a dict. Otherwise the tag is built directly
    with ``Tag``, passing ``None`` instead of an empty ``attrs``.
    '''
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: construct the tag by hand.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))


class SCMP(BasicNewsRecipe):
    '''
    Recipe for the South China Morning Post (scmp.com).

    Articles are taken from the RSS feeds listed in ``feeds``. When both a
    username and a password are set, ``get_browser`` signs in to
    account.scmp.com and returns the signed-in browser.
    '''

    title = 'South China Morning Post'
    __author__ = 'llam'
    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
    publisher = 'South China Morning Post Publishers Ltd.'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_CN'
    remove_empty_feeds = True
    # Credentials are optional: get_browser only logs in when both are set.
    needs_subscription = 'optional'
    publication_type = 'newspaper'

    # Keep the headline plus any element carrying one of these classes.
    keep_only_tags = [
        dict(name='h1'),
        classes('info__subHeadline article-author main__right'),
    ]
    remove_tags = [
        dict(name='button')
    ]

    def get_browser(self):
        '''
        Return a browser, signed in to scmp.com when credentials are set.

        Signing in takes two requests: the username and password are sent
        as JSON to account.scmp.com/login, whose JSON reply is expected to
        carry a ``nonce``; that nonce is then passed to
        www.scmp.com/centralize/signin.

        Raises ValueError when either request does not answer with status
        200, or when the login reply has no readable ``nonce``. Raises
        Exception, with the response body included in the message, when
        opening the login request fails with an error that has a ``read``
        method. Any other error from opening the login request propagates
        unchanged.
        '''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            # No credentials configured: use the plain, anonymous browser.
            return br

        # Uncomment to trace the login exchange:
        # br.set_debug_http(True)
        # br.set_debug_responses(True)
        # br.set_debug_redirects(True)
        rq = Request('https://account.scmp.com/login', headers={
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/json;charset=UTF-8',
            'Referer': 'https://account.scmp.com/login',
            }, data=json.dumps({'username': self.username, 'password': self.password}))
        self.log('Sending login request...')
        try:
            res = br.open(rq)
        except Exception as err:
            # Errors that carry a body get it appended to the message so the
            # server's explanation is visible in the log.
            if hasattr(err, 'read'):
                raise Exception('Login request failed with error: {} and body: {}'.format(err, err.read().decode('utf-8', 'replace')))
            raise
        if res.code != 200:
            raise ValueError('Failed to login, check your username and password')

        try:
            nonce = json.loads(res.read())['nonce']
        except (KeyError, TypeError, ValueError):
            # A 200 reply that is not JSON, or is JSON without a "nonce"
            # entry, is reported as a login failure like the other failure
            # paths instead of leaking a bare KeyError/ValueError.
            raise ValueError(
                'Failed to login, no nonce in the login response; '
                'check your username and password')

        rq = Request('https://www.scmp.com/centralize/signin?nonce=' + nonce, headers={
            'referer': 'https://account.scmp.com/login',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-site',
            'sec-fetch-user': '?1'})
        res = br.open(rq)
        if res.code != 200:
            raise ValueError('Failed to login, check your username and password')
        return br

    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
    ]

    def preprocess_html(self, soup):
        '''
        Fix up image references in ``soup`` and return it.

        Every ``img`` that has a ``data-original`` attribute gets that value
        copied into ``src``. In addition, when the page has a
        ``twitter:image:src`` meta tag with content and an element with
        class ``image-wrapper__placeholder``, a new ``img`` pointing at the
        meta tag's content is appended to the placeholder's parent and the
        placeholder itself is removed.
        '''
        for img in soup.findAll("img", attrs={'data-original': True}):
            img['src'] = img['data-original']

        meta = soup.find('meta', attrs={'name': 'twitter:image:src'}, content=True)
        if meta is None:
            return soup
        wrapper = soup.find(**classes('image-wrapper__placeholder'))
        if wrapper is None:
            return soup

        # Swap the placeholder for a real image taken from the meta tag.
        p = wrapper.parent
        img = new_tag(soup, 'img')
        img['src'] = meta['content']
        p.append(img)
        wrapper.extract()
        return soup