#!/usr/bin/env python
'''
scmp.com
'''

import json
import time
from datetime import datetime, timezone

from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.web.feeds.news import BasicNewsRecipe, classes


def E(parent, name, text='', **attrs):
    '''Append a new ``name`` element to ``parent`` and return it.

    ``text`` becomes the element's text; ``attrs`` are passed to
    ``parent.makeelement`` as the element's attributes.
    '''
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
    '''Recursively convert one JSON content node into children of ``html_parent``.

    ``node`` is a dict read through the keys ``type``, ``attribs``,
    ``children`` and ``data``. Nodes typed ``track-viewed-percentage``,
    ``inline-ad-slot`` or ``inline-widget`` are dropped, ``text`` nodes are
    appended as character data, and every other type becomes an element whose
    tag name is the node type.
    '''
    ntype = node.get('type')
    if ntype not in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget', 'text'}:
        c = html_parent.makeelement(ntype)
        # <p> elements are emitted without the attributes carried by the node.
        if ntype != 'p':
            c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                # Text following an existing child belongs in that child's tail.
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text


def ts_date(x):
    '''Format a millisecond epoch timestamp as a UTC date string.

    The result looks like ``'Jan 01, 1970 at 12:00 AM'``.
    '''
    # Convert with an explicit UTC zone. Adding time.timezone to a local-time
    # conversion is off by one hour whenever local DST is in effect.
    dt = datetime.fromtimestamp(x / 1000, tz=timezone.utc)
    return dt.strftime('%b %d, %Y at %I:%M %p')


def load_article_from_json(raw, root):
    '''Rebuild the ``<body>`` of ``root`` from the article JSON in ``raw``.

    ``raw`` is a JSON string whose article lives under
    ``props.pageProps.payload.data.article``. Every existing child of the
    body is removed and replaced by a single ``<article>`` holding the topic,
    headline, sub-headline, byline, leading image and body content.
    '''
    # open('/t/raw.json', 'w').write(raw)
    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
    body = root.xpath('//body')[0]
    # Iterate over a snapshot because the body is mutated while looping.
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'div', replace_entities(data['firstTopic']['name']), style='color: gray; font-size:small; font-weight:bold;')
    E(article, 'h1', replace_entities(data['headline']))
    # E(article, 'p', replace_entities(data['subHeadline']['text']), style='font-style: italic; color:#202020;')
    for subh in data['subHeadline']['json']:
        process_node(subh, article)
    authors = ', '.join(a['name'] for a in data['authors'])
    auth = ts_date(data['publishedDate']) + ' | ' + str(data.get('readingTime', '')) + ' min read | ' + authors
    E(article, 'p', auth, style='color: #202020; font-size:small;')
    main_image_url = sub_img = ''
    # If several images are marked 'leading', the last one wins.
    for image in data['images']:
        if image['type'] == 'leading':
            main_image_url = image['url']
            sub_img = image['title']
    # Truthiness also skips a missing (None) URL rather than emitting <img src=None>.
    if main_image_url:
        div = E(article, 'div')
        E(div, 'img', src=main_image_url)
        E(div, 'div', sub_img, style='text-align:center; font-size:small;')
    for node in data['body']['json']:
        process_node(node, article)


class SCMP(BasicNewsRecipe):
    '''calibre recipe for the South China Morning Post (scmp.com).'''

    title = 'South China Morning Post'
    __author__ = 'unkn0wn'
    description = (
        'The South China Morning Post is a leading news media company that has reported on China and Asia '
        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
        'working together to connect with news consumers around the world. We are committed to informing '
        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
        'and our mission is to “Lead the global conversation about China”.'
    )
    publisher = 'South China Morning Post Publishers Ltd.'
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_HK'
    remove_empty_feeds = True
    resolve_internal_links = True
    publication_type = 'newspaper'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = 'blockquote, em { color: #202020; }'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

    def get_cover_url(self):
        '''Return the cover image URL scraped from the paper's frontpages.com page.'''
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        # The image src is concatenated onto the site root, so it is treated as site-relative.
        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']

    # Option definitions: 'days' overrides oldest_article, 'comp' turns on
    # image compression and 'rev' reverses the article order (see __init__).
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'enter yes',
            'default': 'no'
        },
        'rev': {
            'short': 'Reverse the order of articles in each feed?',
            'long': 'enter yes',
            'default': 'no'
        }
    }

    def __init__(self, *args, **kwargs):
        '''Initialise the recipe and apply any recipe-specific options.'''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        # NOTE(review): presumably calibre replaces recipe_specific_options on
        # the instance with user-supplied string values; the isinstance checks
        # skip the option-definition dicts declared on the class. Confirm.
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)
        r = self.recipe_specific_options.get('rev')
        if r and isinstance(r, str):
            if r.lower() == 'yes':
                self.reverse_article_order = True
        c = self.recipe_specific_options.get('comp')
        if c and isinstance(c, str):
            if c.lower() == 'yes':
                self.compress_news_images = True

    # used when unable to extract article from