diff --git a/recipes/icons/scmp.png b/recipes/icons/scmp.png
index a8ce467738..f8663e55ab 100644
Binary files a/recipes/icons/scmp.png and b/recipes/icons/scmp.png differ
diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe
index a70d8a4762..fcaba61257 100644
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@@ -1,34 +1,127 @@
 #!/usr/bin/env python
-# vim:fileencoding=utf-8
 """
 scmp.com
 """
 
 import json
 import re
-from datetime import datetime, timedelta, timezone
+from datetime import datetime
+import time
+from html5_parser import parse
+from lxml import etree
+
+from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
+def E(parent, name, text='', **attrs):
+    """Create element ``name`` with ``text`` and ``attrs``, append it to ``parent`` and return it."""
+    ans = parent.makeelement(name, **attrs)
+    ans.text = text
+    parent.append(ans)
+    return ans
+
+def process_node(node, html_parent):
+    """Convert one node of the article JSON tree into HTML under ``html_parent``.
+
+    Tracking, ad-slot and widget nodes are skipped; text nodes are appended to
+    the text or tail of the surrounding element; any other node becomes an
+    element named after its type, with its children processed recursively.
+    """
+    ntype = node.get('type')
+
+    if ntype not in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget', 'text'}:
+        c = html_parent.makeelement(ntype)
+        # <p> elements are created without the attributes carried by the JSON node
+        if ntype != 'p':
+            c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
+        html_parent.append(c)
+        for nc in node.get('children', ()):
+            process_node(nc, c)
+    elif ntype == 'text':
+        text = node.get('data')
+        if text:
+            text = replace_entities(text)
+            if len(html_parent):
+                t = html_parent[-1]
+                t.tail = (t.tail or '') + text
+            else:
+                html_parent.text = (html_parent.text or '') + text
+
+
+def ts_date(x):
+    """Format a millisecond epoch timestamp as a UTC string such as 'Jan 05, 2024 at 03:07 PM'."""
+    # time.gmtime yields UTC directly; shifting a local-time conversion by
+    # time.timezone is an hour off whenever the local zone is observing DST.
+    return time.strftime('%b %d, %Y at %I:%M %p', time.gmtime(x / 1000))
+
+def auth(x):
+    """Return the ``name`` of every author dict in ``x`` joined with ', ' ('' when ``x`` is empty or None)."""
+    return ', '.join(a['name'] for a in x or ())
+
+def load_article_from_json(raw, root):
+    """Replace the children of ``root``'s <body> with an <article> built from the page JSON in ``raw``."""
+    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'div', replace_entities(data['firstTopic']['name']), style='color: gray; font-size:small; font-weight:bold;')
+    E(article, 'h1', replace_entities(data['headline']))
+    for subh in data['subHeadline']['json']:
+        process_node(subh, article)
+    # The byline must not be bound to a local called "auth": that would shadow
+    # the auth() helper and make the call below raise UnboundLocalError.
+    byline = [ts_date(data['publishedDate'])]
+    reading_time = data.get('readingTime')
+    if reading_time:
+        byline.append(str(reading_time) + ' min read')
+    byline.append(auth(data['authors']))
+    E(article, 'p', ' | '.join(byline), style='color: #202020; font-size:small;')
+    main_image_url = sub_img = ''
+    for img in data['images']:
+        if img['type'] == 'leading':
+            main_image_url = img['url']
+            sub_img = img['title']
+    if main_image_url != '':
+        div = E(article, 'div')
+        E(div, 'img', src=main_image_url)
+        E(div, 'div', sub_img, style='text-align:center; font-size:small;')
+    for node in data['body']['json']:
+        process_node(node, article)
+
+
 class SCMP(BasicNewsRecipe):
     title = "South China Morning Post"
-    __author__ = "llam"
-    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
+    __author__ = "unkn0wn"
+    description = (
+        'The South China Morning Post is a leading news media company that has reported on China and Asia '
+        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
+        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
+        'working together to connect with news consumers around the world. We are committed to informing '
+        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
+        'and our mission is to “Lead the global conversation about China”.'
+    )
     publisher = "South China Morning Post Publishers Ltd."
oldest_article = 1 - max_articles_per_feed = 25 no_stylesheets = True remove_javascript = True + remove_attributes = ['width', 'height'] encoding = "utf-8" use_embedded_content = False - language = "en" + language = "en_HK" remove_empty_feeds = True + resolve_internal_links = True publication_type = "newspaper" - auto_cleanup = False - compress_news_images = True ignore_duplicate_articles = {"title", "url"} + extra_css = 'blockquote, em { color: #202020; }' + masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg' + + def get_cover_url(self): + soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/') + return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src'] recipe_specific_options = { 'days': { @@ -56,22 +133,6 @@ class SCMP(BasicNewsRecipe): dict(attrs={"addthis_title": True}), dict(name=["script", "style"]), ] - remove_attributes = ["style", "font"] - - extra_css = """ - .headline { font-size: 1.8rem; margin-bottom: 0.4rem; } - .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; } - .sub-headline ul { padding-left: 1rem; } - .sub-headline ul li { fmargin-bottom: 0.8rem; } - .article-meta, .article-header__publish { padding-bottom: 0.5rem; } - .article-meta .author { text-transform: uppercase; font-weight: bold; } - .article-meta .published-dt { margin-left: 0.5rem; } - .article-img { margin-bottom: 0.8rem; max-width: 100%; } - .article-img img, .carousel__slide img { - display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto; - box-sizing: border-box; } - .article-img .caption, .article-caption { font-size: 0.8rem; } - """ # https://www.scmp.com/rss feeds = [ @@ -86,156 +147,36 @@ class SCMP(BasicNewsRecipe): ("Sport", "https://www.scmp.com/rss/95/feed"), ("Post Mag", "https://www.scmp.com/rss/71/feed"), ("Style", "https://www.scmp.com/rss/72/feed"), + ("News", 'https://www.scmp.com/rss/91/feed') ] - masthead_url = 
'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg' - - def get_cover_url(self): - from datetime import date - cover = 'https://img.kiosko.net/' + str( - date.today().year - ) + '/' + date.today().strftime('%m') + '/' + date.today( - ).strftime('%d') + '/cn/scmp.750.jpg' - br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) - try: - br.open(cover) - except: - index = 'https://es.kiosko.net/cn/np/scmp.html' - soup = self.index_to_soup(index) - for image in soup.findAll('img', src=True): - if image['src'].endswith('750.jpg'): - return 'https:' + image['src'] - self.log("\nCover unavailable") - cover = None - return cover - - def _extract_child_nodes(self, children, ele, soup, level=1): - if not children: - return - - child_html = "" - for child in children: - if child.get("type", "") == "text": - child_html += child["data"] - else: - if child["type"] == "iframe": - # change iframe to with the src linked - new_ele = soup.new_tag("span") - new_ele["class"] = f'embed-{child["type"]}' - iframe_src = child.get("attribs", {}).get("src") - a_tag = soup.new_tag("a") - a_tag["href"] = iframe_src - a_tag.string = f"[Embed: {iframe_src}]" - new_ele.append(a_tag) - else: - new_ele = soup.new_tag(child["type"]) - for k, v in child.get("attribs", {}).items(): - if k.startswith("data-"): - continue - new_ele[k] = v - if child.get("children"): - self._extract_child_nodes( - child["children"], new_ele, soup, level + 1 - ) - child_html += str(new_ele) - if child["type"] == "img": - # generate a caption tag for - caption_text = child.get("attribs", {}).get("alt") or child.get( - "attribs", {} - ).get("title") - if caption_text: - new_ele = soup.new_tag("span") - new_ele.append(caption_text) - new_ele["class"] = "caption" - child_html += str(new_ele) - ele["class"] = "article-img" - ele.append(BeautifulSoup(child_html)) + def print_version(self, url): + return url.split('?')[0] def preprocess_raw_html(self, raw_html, url): - article = None - soup = 
BeautifulSoup(raw_html) - - for script in soup.find_all("script"): - if not script.contents: - continue - if not script.contents[0].startswith("window.__APOLLO_STATE__"): - continue - article_js = re.sub( - r"window.__APOLLO_STATE__\s*=\s*", "", script.contents[0].strip() - ) - if article_js.endswith(";"): - article_js = article_js[:-1] + body = '
' + b_root = parse(body) + root = parse(raw_html) + script = root.xpath('//script[@id="__NEXT_DATA__"]') + if script: try: - article = json.loads(article_js) - break - except json.JSONDecodeError: - self.log.exception("Unable to parse __APOLLO_STATE__") + load_article_from_json(script[0].text, b_root) + except Exception: + return raw_html + head = b_root.xpath('//h2') + b_root.xpath('//h3') + for h2 in head: + h2.tag = 'h4' + raw = etree.tostring(b_root, encoding='unicode') + return raw + return raw_html - if not (article and article.get("contentService")): - # Sometimes the page does not have article content in the