calibre/recipes/scmp.recipe

#!/usr/bin/env python
'''
scmp.com
'''

import json
import time
from datetime import datetime

from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.web.feeds.news import BasicNewsRecipe, classes


def E(parent, name, text='', **attrs):
    '''
    Create an element called ``name``, attach it as the last child of
    ``parent`` and return it.

    ``text`` is stored as the new element's text. Any extra keyword arguments
    are forwarded unchanged to ``parent.makeelement`` (callers in this file
    use them for attributes such as ``style`` and ``src``).
    '''
    child = parent.makeelement(name, **attrs)
    parent.append(child)
    child.text = text
    return child


def process_node(node, html_parent):
    '''
    Convert one node of the article's JSON tree into markup under
    ``html_parent``.

    * ``text`` nodes contribute their ``data`` (entities replaced) to the
      running text of ``html_parent``; empty or missing data is ignored.
    * Tracking, ad-slot and widget nodes are dropped together with anything
      below them.
    * Every other node becomes an element named after its ``type``; its
      ``children`` are converted recursively into that new element.
    '''
    kind = node.get('type')

    if kind == 'text':
        content = node.get('data')
        if not content:
            return
        content = replace_entities(content)
        if len(html_parent) == 0:
            # No child elements yet: the text belongs to the parent itself.
            html_parent.text = (html_parent.text or '') + content
        else:
            # Text that follows a child element is stored as that child's tail.
            previous = html_parent[-1]
            previous.tail = (previous.tail or '') + content
        return

    if kind in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget'}:
        return

    element = html_parent.makeelement(kind)
    if kind != 'p':
        # Attributes are copied for everything except paragraphs; missing
        # (falsy) values are stored as empty strings.
        copied = {key: value or '' for key, value in node.get('attribs', {}).items()}
        element.attrib.update(copied)
    html_parent.append(element)
    for child_node in node.get('children', ()):
        process_node(child_node, element)


def ts_date(x):
    '''
    Format a Unix timestamp given in milliseconds as a UTC date string,
    e.g. ``Jan 01, 1970 at 12:00 AM``.
    '''
    # Imported here because the module only imports ``datetime`` itself.
    from datetime import timezone

    # Convert straight to UTC. Adding time.timezone to a local-time conversion
    # only cancels the local offset while daylight saving time is inactive
    # (time.timezone is the non-DST offset), which made the result one hour
    # off on machines observing DST.
    dt = datetime.fromtimestamp(x / 1000, tz=timezone.utc)
    return dt.strftime('%b %d, %Y at %I:%M %p')


def load_article_from_json(raw, root):
    '''
    Replace the contents of the ``<body>`` of ``root`` with an ``<article>``
    built from the page's embedded JSON.

    ``raw`` is JSON text; the article is read from
    ``props.pageProps.payload.data.article``. The generated article holds, in
    order: the first topic name, the headline, the sub-headline nodes, a
    byline (publication date, reading time, authors), the leading image with
    its title as caption, and the body nodes.

    No exception is handled here: malformed JSON or a missing required field
    propagates to the caller.
    '''
    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'div', replace_entities(data['firstTopic']['name']), style='color: gray; font-size:small; font-weight:bold;')
    E(article, 'h1', replace_entities(data['headline']))
    # The structured sub-headline is rendered rather than its plain-text form.
    for subh in data['subHeadline']['json']:
        process_node(subh, article)

    # Assemble the byline from the parts that are actually present, so a
    # missing or null reading time no longer yields " min read" or
    # "None min read", and an empty author list leaves no trailing separator.
    byline = [ts_date(data['publishedDate'])]
    reading_time = data.get('readingTime')
    if reading_time not in (None, ''):
        byline.append(str(reading_time) + ' min read')
    authors = ', '.join([a['name'] for a in data['authors']])
    if authors:
        byline.append(authors)
    E(article, 'p', ' | '.join(byline), style='color: #202020; font-size:small;')

    # When several images are marked as leading, the last one is used.
    main_image_url = sub_img = ''
    for image in data['images']:
        if image['type'] == 'leading':
            main_image_url = image['url']
            sub_img = image['title']
    # Truthiness also skips a null URL instead of passing it on as ``src``.
    if main_image_url:
        div = E(article, 'div')
        E(div, 'img', src=main_image_url)
        E(div, 'div', sub_img, style='text-align:center; font-size:small;')
    for node in data['body']['json']:
        process_node(node, article)


class SCMP(BasicNewsRecipe):
    '''
    calibre news recipe for the South China Morning Post (scmp.com).

    Articles are listed from the RSS feeds in ``feeds``. Each downloaded page
    is rebuilt from the JSON in its ``__NEXT_DATA__`` script when that can be
    parsed; otherwise the original HTML is kept.
    '''
    title = 'South China Morning Post'
    __author__ = 'unkn0wn'
    description = (
        'The South China Morning Post is a leading news media company that has reported on China and Asia '
        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
        'working together to connect with news consumers around the world. We are committed to informing '
        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
        'and our mission is to “Lead the global conversation about China”.'
    )
    publisher = 'South China Morning Post Publishers Ltd.'
    # Default age limit in days; can be overridden through the 'days' option.
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_HK'
    remove_empty_feeds = True
    resolve_internal_links = True
    publication_type = 'newspaper'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = 'blockquote, em { color: #202020; }'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

    def get_cover_url(self):
        '''
        Return the URL of the current front page image, taken from the
        ``giornale-img`` image on frontpages.com.
        '''
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']

    # Options the user can set for this recipe; they are read in __init__.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'enter yes',
            'default': 'no'
        },
        'rev': {
            'short': 'Reverse the order of articles in each feed?',
            'long': 'enter yes',
            'default': 'no'
        }
    }

    def __init__(self, *args, **kwargs):
        '''
        Initialise the recipe, then apply the 'days', 'rev' and 'comp'
        options.

        Only non-empty string values are honoured, so the description dicts
        declared on the class are never mistaken for user input.
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # NOTE(review): presumably the base initialiser replaces
        # recipe_specific_options with the user's string values - confirm.
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)
        r = self.recipe_specific_options.get('rev')
        if r and isinstance(r, str) and r.lower() == 'yes':
            self.reverse_article_order = True
        c = self.recipe_specific_options.get('comp')
        if c and isinstance(c, str) and c.lower() == 'yes':
            self.compress_news_images = True

    # used when unable to extract article from <script>, particularly in the Sports section
    remove_tags = [
        dict(
            classes(
                'sticky-wrap relative social-media social-media--extended__shares'
                ' article-body-comment scmp_button_comment_wrapper social-media--extended__in-site'
                ' footer scmp-advert-tile sidebar-col related-article share-widget'
            )
        ),
        dict(attrs={'addthis_title': True}),
        dict(name=['script', 'style']),
    ]

    # https://www.scmp.com/rss
    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
        ('News', 'https://www.scmp.com/rss/91/feed')
    ]

    def print_version(self, url):
        '''Return ``url`` with its query string (everything from the first ``?``) removed.'''
        return url.split('?')[0]

    def preprocess_raw_html(self, raw_html, url):
        '''
        Rebuild the article from the page's ``__NEXT_DATA__`` JSON.

        Returns the serialised rebuilt document, with ``<h2>``/``<h3>``
        demoted to ``<h4>``. If the script element is missing, or building
        the article raises, ``raw_html`` is returned unchanged (a failure is
        logged with the article URL).
        '''
        body = '<html><body><article></article></body></html>'
        b_root = parse(body)
        root = parse(raw_html)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            try:
                load_article_from_json(script[0].text, b_root)
            except Exception:
                # Best effort: fall back to the page as downloaded.
                self.log('** Failed parse: ', url)
                return raw_html
            head = b_root.xpath('//h2') + b_root.xpath('//h3')
            for h2 in head:
                h2.tag = 'h4'
            raw = etree.tostring(b_root, encoding='unicode')
            return raw
        return raw_html

    def preprocess_html(self, soup):
        '''
        Point every image at a 768px-wide CDN rendition and add a centred
        caption for each image that carries a ``title``.
        '''
        from urllib.parse import urlparse
        y = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
        for img in soup.findAll('img', attrs={'src':True}):
            # Only the path of the original URL is kept; host and query are dropped.
            img['src'] = y + urlparse(img['src']).path
        for img in soup.findAll('img', attrs={'title':True}):
            div = soup.new_tag('div', attrs={'style':'text-align:center; font-size:small;'})
            div.string = img.get('title', '')
            parent_div = img.find_parent('div')
            if parent_div is None:
                # No enclosing <div>: place the caption right after the image
                # rather than failing with AttributeError on None.
                img.insert_after(div)
            else:
                parent_div.append(div)
        return soup