#!/usr/bin/env python
"""
scmp.com
"""
# NOTE(review): reconstructed from a whitespace-collapsed diff of
# recipes/scmp.recipe. Only the module-level helpers that are fully visible
# are reproduced here; the SCMP recipe class is truncated in this view.

import json
import re
import time
from datetime import datetime, timezone

# Node types that are dropped outright: no element is created for them and
# their children are never visited.
_IGNORED_NODE_TYPES = frozenset({
    'track-viewed-percentage',
    'inline-ad-slot',
    'inline-widget',
})


def E(parent, name, text='', **attrs):
    """Append a new child element to *parent* and return it.

    The child is built with ``parent.makeelement(name, **attrs)``, so every
    extra keyword argument becomes an attribute of the new element, and
    *text* becomes its text content.
    """
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def _append_text(html_parent, text):
    """Append *text* to the running text content of *html_parent*.

    Element trees store text that follows a child in that child's ``tail``,
    so the text is attached to the last child when one exists and to the
    parent's own ``text`` otherwise. Empty or missing text is ignored.
    """
    if not text:
        return
    # Function-scope import: keeps this module importable without calibre
    # until there is actually text to decode.
    from calibre import replace_entities
    text = replace_entities(text)
    if len(html_parent):
        last = html_parent[-1]
        last.tail = (last.tail or '') + text
    else:
        html_parent.text = (html_parent.text or '') + text


def process_node(node, html_parent):
    """Recursively convert one JSON content node into markup under *html_parent*.

    *node* is a dict read through the keys ``type``, ``data``, ``attribs``
    and ``children``:

    * types listed in ``_IGNORED_NODE_TYPES`` are skipped entirely;
    * ``text`` nodes have their ``data`` appended as text;
    * any other type becomes an element named after the type, with the
      node's children processed inside it.
    """
    ntype = node.get('type')
    if ntype in _IGNORED_NODE_TYPES:
        return
    if ntype == 'text':
        _append_text(html_parent, node.get('data'))
        return

    child = html_parent.makeelement(ntype)
    if ntype != 'p':
        # Attributes are copied for every element except <p>; a null value
        # is stored as an empty string. ``or {}`` also covers an explicit
        # ``"attribs": null`` in the payload.
        attribs = node.get('attribs') or {}
        child.attrib.update({k: v or '' for k, v in attribs.items()})
    html_parent.append(child)
    for sub_node in node.get('children') or ():
        process_node(sub_node, child)


def ts_date(x):
    """Format *x*, a Unix timestamp in milliseconds, as a UTC date string.

    Returns text such as ``'Nov 14, 2023 at 10:13 PM'``.

    The conversion is done with an explicit UTC timezone. Adding
    ``time.timezone`` to a local-time conversion is only correct while
    daylight-saving time is not in effect, and is off by the DST shift
    otherwise.
    """
    dt = datetime.fromtimestamp(x / 1000, tz=timezone.utc)
    return dt.strftime('%b %d, %Y at %I:%M %p')


def auth(x):
    """Return the ``name`` of every author dict in *x*, joined by ``', '``."""
    return ', '.join(a['name'] for a in x)


def load_article_from_json(raw, root):
    """Rebuild the ``<body>`` of *root* from the article JSON in *raw*.

    *raw* is JSON text; the article is read from
    ``props.pageProps.payload.data.article``. Every existing child of the
    first ``<body>`` element is removed and replaced by a single
    ``<article>`` holding, in order: topic name, headline, sub-headline,
    a date/authors line, the leading image with its title (when the
    article has one), and the article body.

    *root* is modified in place and nothing is returned. Key access on the
    payload is deliberately strict: a payload with a different shape raises
    (``KeyError``, ``IndexError`` or ``TypeError``) instead of producing a
    partial article.
    """
    from calibre import replace_entities

    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
    body = root.xpath('//body')[0]
    # Iterate over a snapshot: removing children while iterating the live
    # element would skip siblings.
    for child in tuple(body):
        body.remove(child)

    article = E(body, 'article')
    E(
        article, 'div', replace_entities(data['firstTopic']['name']),
        style='color: gray; font-size:small; font-weight:bold;',
    )
    E(article, 'h1', replace_entities(data['headline']))
    for subh in data['subHeadline']['json']:
        process_node(subh, article)
    E(
        article, 'p',
        ts_date(data['publishedDate']) + ' | ' + auth(data['authors']),
        style='color: #202020; font-size:small;',
    )

    # When several images are tagged 'leading', the last one wins.
    main_image_url = sub_img = ''
    for image in data['images']:
        if image['type'] == 'leading':
            main_image_url = image['url']
            sub_img = image['title']
    if main_image_url:
        div = E(article, 'div')
        E(div, 'img', src=main_image_url)
        E(div, 'div', sub_img, style='text-align:center; font-size:small;')

    for node in data['body']['json']:
        process_node(node, article)