From 931ee9867ab1e73b478f707e6722adc327414d07 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 8 Sep 2024 19:09:57 +0530
Subject: [PATCH 1/4] Update SCMP

---
 recipes/icons/scmp.png | Bin 1081 -> 180 bytes
 recipes/scmp.recipe    | 284 +++++++++++++++++------------------------
 2 files changed, 114 insertions(+), 170 deletions(-)
diff --git a/recipes/icons/scmp.png b/recipes/icons/scmp.png
index a8ce4677381c9645a330a9557ff837dabe034b38..f8663e55ab6de9c95e14c3868f7cf10a6dc6d495 100644
GIT binary patch
delta 163
zcmdnVv4wGhWIZzj1B1(wu46z-CBP@d)xE&yA1e5Niitri`2YX^(`K#t|7hC3$5VyY
z-R%RaXDbQv3ubuVP+#8IBR<IwC}Qd9;uvDlo17qF+0ZJeozSSA@bmyz#6zwb7CUY(
zXlQPh<+8AxF;S$TS4+Z#i^E0ipv()k83}Fn3=DlvT*XYA?S27GVeoYIb6Mw<&;$Tj
CTtxK%

delta 1071
zcmV+~1kn4m0l5f}8Gi-<0047(dh`GQ1OQ1yK~#9!%v1x=bVm@}*tTukwrv}Q*g3|w
zZQHhU4`=hU?fW}j9sRbp>K^81r+a#OpBZEMe_!7C-HS~y{I2o;88!Y;!T9Zx@mpoS
zSElU8J>07rtfiVk_hMm;Z}E@)O)#1=x8`7O&BEN2fVnX~U4MS6b!!HyO$=d;u>#J1
zCJ^Znu;1a&o<ToH%?XQ`qseX;%`F*FHYLv3p9NugERH9}aRfq_nA>tQxbyKSaa$h7
zXI*M&k`Mou-k!NXVG>&orAcNw0bvrGGbz%kGoOGkiLLWG>j;!*2LgXqW*q@yk3qm|
z!CV{q|L0>YV1GmRYyN~{*Xal4dzgPU<JX6jrx<gT>-+y*IoGMCFvd@S#TE)WIP|3)
zPJW|+?H^BJ?!Q5_dDo}YIODA%4&(ouJe*Xab?=Ykpcivk?Vcz^g0U?gNn*d}vRLRQ
z7xUiWU^njD_qhyqc`}V1`CBmHsOFdif-x+{<d@u5oPYFc9$VZQ#p8drU;_fodDWkI
z^!H{s^?lgkp#-*nIDwbKtyq}(yFQTuxZX{6z+CYMmb}?TbMG)Rtpk{kip)VJ-j26p
zq&f;ak=VO@{;}ADWtnFJ(a{KaBG~n*w2Ga{E?JCBZ7l4w)Uo1kGk)oBl5vfDW0;@n
zY<zzVb$?w$=ozeH33j|Q3H1+EVK?-F22-7<LoMhVu4*h7e^^niJ)X``vB+oy{821>
zYZy*rzY>zo{W$l%63k`4<Gx>-6x;gIB#UVF+-&K2E2?qZkB#7Bewor|@MEb5#g0MA
zF9KEzdP!=-`{H;f)`6ePJy@D#O&~Ug2UKtwl79$%Q;$mLkcuAod<HjtSFh)to=8Jx
z^2?OU-;*;v==m&CBwX$o!gf^PwP-t9`i8N{jjjkhMFj>&M&X!O@;HpS9Gw~<=9mhs
zM>*HHn{&C~XDm-5loIEBP^N*hB5=#M^)MH61Xulmxq~+5^n|c4rQG^~IPU+s5mi7w
zdVew8hOWU8EPj(i1*HJ`SZgs7GME2O1CH+$cq7>Hu_Rt_TCx3oF&x3Uf12sU8h1o6
z*R>%QX~n>BRqCAZW)Y{-CkxQ=7rtM{o^<(vl>P;`O&x!f?gSggI`>dw8u(2r!X-nq
z^9g?k=A^7!an47+lEWb{X60y|U5}-Wdw&q<y3cEI);mQke_I%HFuu*>Nvz4)&coxo
zG-I4QcA}a0q0}qi71cYB9`@z?sywR3JMj<Em>(g}Xl!{J=wit)AFz9Uc>Ws9f%V|>
z0=3|&!^X#Sz<?LUqqX6HH%JD&NjBh((gAOl4`9dSEVyGbpj|UOozvSi=$zic+9)~2
p?4CDY%%@rRyb)-2)geh_D*-T6UO1U8TqFPh002ovPDHLkV1mcZ8SMZ7

diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe
index a70d8a4762..14652183c2 100644
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@@ -1,34 +1,108 @@
 #!/usr/bin/env python
-# vim:fileencoding=utf-8
 """
 scmp.com
 """
 
 import json
 import re
-from datetime import datetime, timedelta, timezone
+from datetime import datetime
+import time
 
+from html5_parser import parse
+from lxml import etree
+
+from calibre import replace_entities
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
+def E(parent, name, text='', **attrs):
+    ans = parent.makeelement(name, **attrs)
+    ans.text = text
+    parent.append(ans)
+    return ans
+
+
+def process_node(node, html_parent):
+    ntype = node.get('type')
+    
+    if not ntype in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget', 'text'}:
+        c = html_parent.makeelement(ntype)
+        if ntype != 'p':
+            c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
+        html_parent.append(c)
+        for nc in node.get('children', ()):
+            process_node(nc, c)
+    elif ntype == 'text':
+        text = node.get('data')
+        if text:
+            text = replace_entities(text)
+            if len(html_parent):
+                t = html_parent[-1]
+                t.tail = (t.tail or '') + text
+            else:
+                html_parent.text = (html_parent.text or '') + text
+
+
+def ts_date(x):
+    dt = datetime.fromtimestamp(x/1000 + time.timezone)
+    return dt.strftime('%b %d, %Y at %I:%M %p')
+
+
+def auth(x):
+    return ', '.join([a['name'] for a in x])
+
+
+def load_article_from_json(raw, root):
+    # open('/t/raw.json', 'w').write(raw)
+    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'div', replace_entities(data['firstTopic']['name']) , style='color: gray; font-size:small; font-weight:bold;')
+    E(article, 'h1', replace_entities(data['headline']))
+    # E(article, 'p', replace_entities(data['subHeadline']['text']), style='font-style: italic; color:#202020;')
+    for subh in data['subHeadline']['json']:
+        process_node(subh, article)
+    E(article, 'p', ts_date(data['publishedDate']) + ' | ' + auth(data['authors']), style='color: #202020; font-size:small;')
+    main_image_url = sub_img = ''
+    for l in data['images']:
+        if l['type'] == 'leading':
+            main_image_url = l['url']
+            sub_img = l['title']
+    if main_image_url != '':
+        div = E(article, 'div')
+        E(div, 'img', src=main_image_url)
+        E(div, 'div', sub_img, style='text-align:center; font-size:small;')
+    for node in data['body']['json']:
+        process_node(node, article)
+
+
 class SCMP(BasicNewsRecipe):
     title = "South China Morning Post"
-    __author__ = "llam"
-    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
+    __author__ = "unkn0wn"
+    description = (
+        'The South China Morning Post is a leading news media company that has reported on China and Asia '
+        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
+        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
+        'working together to connect with news consumers around the world. We are committed to informing '
+        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
+        'and our mission is to “Lead the global conversation about China”.'
+    )
     publisher = "South China Morning Post Publishers Ltd."
     oldest_article = 1
-    max_articles_per_feed = 25
     no_stylesheets = True
     remove_javascript = True
+    remove_attributes = ['width', 'height']
     encoding = "utf-8"
     use_embedded_content = False
-    language = "en"
+    language = "en_HK"
     remove_empty_feeds = True
+    resolve_internal_links = True
     publication_type = "newspaper"
-    auto_cleanup = False
-    compress_news_images = True
     ignore_duplicate_articles = {"title", "url"}
+    extra_css = 'blockquote, em { color: #202020; }'
 
     recipe_specific_options = {
         'days': {
@@ -44,6 +118,10 @@ class SCMP(BasicNewsRecipe):
         if d and isinstance(d, str):
             self.oldest_article = float(d)
 
+    def get_cover_url(self):
+        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
+        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
+
     # used when unable to extract article from <script>, particularly in the Sports section
     remove_tags = [
         dict(
@@ -56,22 +134,6 @@ class SCMP(BasicNewsRecipe):
         dict(attrs={"addthis_title": True}),
         dict(name=["script", "style"]),
     ]
-    remove_attributes = ["style", "font"]
-
-    extra_css = """
-    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
-    .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; }
-    .sub-headline ul { padding-left: 1rem; }
-    .sub-headline ul li { fmargin-bottom: 0.8rem; }
-    .article-meta, .article-header__publish { padding-bottom: 0.5rem; }
-    .article-meta .author { text-transform: uppercase; font-weight: bold; }
-    .article-meta .published-dt { margin-left: 0.5rem; }
-    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
-    .article-img img, .carousel__slide img {
-        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
-        box-sizing: border-box; }
-    .article-img .caption, .article-caption { font-size: 0.8rem; }
-    """
 
     # https://www.scmp.com/rss
     feeds = [
@@ -86,156 +148,38 @@ class SCMP(BasicNewsRecipe):
         ("Sport", "https://www.scmp.com/rss/95/feed"),
         ("Post Mag", "https://www.scmp.com/rss/71/feed"),
         ("Style", "https://www.scmp.com/rss/72/feed"),
+        ("News", 'https://www.scmp.com/rss/91/feed')
     ]
 
+    def print_version(self, url):
+        return url.split('?')[0]
+
     masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
 
-    def get_cover_url(self):
-        from datetime import date
-        cover = 'https://img.kiosko.net/' + str(
-            date.today().year
-        ) + '/' + date.today().strftime('%m') + '/' + date.today(
-        ).strftime('%d') + '/cn/scmp.750.jpg'
-        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
-        try:
-            br.open(cover)
-        except:
-            index = 'https://es.kiosko.net/cn/np/scmp.html'
-            soup = self.index_to_soup(index)
-            for image in soup.findAll('img', src=True):
-                if image['src'].endswith('750.jpg'):
-                    return 'https:' + image['src']
-            self.log("\nCover unavailable")
-            cover = None
-        return cover
-
-    def _extract_child_nodes(self, children, ele, soup, level=1):
-        if not children:
-            return
-
-        child_html = ""
-        for child in children:
-            if child.get("type", "") == "text":
-                child_html += child["data"]
-            else:
-                if child["type"] == "iframe":
-                    # change iframe to <span> with the src linked
-                    new_ele = soup.new_tag("span")
-                    new_ele["class"] = f'embed-{child["type"]}'
-                    iframe_src = child.get("attribs", {}).get("src")
-                    a_tag = soup.new_tag("a")
-                    a_tag["href"] = iframe_src
-                    a_tag.string = f"[Embed: {iframe_src}]"
-                    new_ele.append(a_tag)
-                else:
-                    new_ele = soup.new_tag(child["type"])
-                    for k, v in child.get("attribs", {}).items():
-                        if k.startswith("data-"):
-                            continue
-                        new_ele[k] = v
-                    if child.get("children"):
-                        self._extract_child_nodes(
-                            child["children"], new_ele, soup, level + 1
-                        )
-                child_html += str(new_ele)
-                if child["type"] == "img":
-                    # generate a caption <span> tag for <img>
-                    caption_text = child.get("attribs", {}).get("alt") or child.get(
-                        "attribs", {}
-                    ).get("title")
-                    if caption_text:
-                        new_ele = soup.new_tag("span")
-                        new_ele.append(caption_text)
-                        new_ele["class"] = "caption"
-                        child_html += str(new_ele)
-                    ele["class"] = "article-img"
-        ele.append(BeautifulSoup(child_html))
-
     def preprocess_raw_html(self, raw_html, url):
-        article = None
-        soup = BeautifulSoup(raw_html)
-
-        for script in soup.find_all("script"):
-            if not script.contents:
-                continue
-            if not script.contents[0].startswith("window.__APOLLO_STATE__"):
-                continue
-            article_js = re.sub(
-                r"window.__APOLLO_STATE__\s*=\s*", "", script.contents[0].strip()
-            )
-            if article_js.endswith(";"):
-                article_js = article_js[:-1]
+        body = '<html><body><article></article></body></html>'
+        b_root = parse(body)
+        root = parse(raw_html)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        if script:
             try:
-                article = json.loads(article_js)
-                break
-            except json.JSONDecodeError:
-                self.log.exception("Unable to parse __APOLLO_STATE__")
+                load_article_from_json(script[0].text, b_root)
+            except Exception:
+                return raw_html
+            head = root.xpath('//h2') + root.xpath('//h3')    
+            for h2 in head:
+                h2.tag = 'h4'
+            raw = etree.tostring(b_root, encoding='unicode')
+            return raw
+        return raw_html
 
-        if not (article and article.get("contentService")):
-            # Sometimes the page does not have article content in the <script>
-            # particularly in the Sports section, so we fallback to
-            # raw_html and rely on remove_tags to clean it up
-            self.log(f"Unable to find article from script in {url}")
-            return raw_html
-
-        content_service = article.get("contentService")
-        content_node_id = None
-        for k, v in content_service["ROOT_QUERY"].items():
-            if not k.startswith("content"):
-                continue
-            content_node_id = v["id"]
-            break
-        content = content_service.get(content_node_id)
-
-        if content.get("sponsorType"):
-            # skip sponsored articles
-            self.abort_article(f"Sponsored article: {url}")
-
-        body = None
-        for k, v in content.items():
-            if (not k.startswith("body(")) or v.get("type", "") != "json":
-                continue
-            body = v
-
-        authors = [content_service[a["id"]]["name"] for a in content["authors"]]
-        date_published = datetime.fromtimestamp(
-            content["publishedDate"] / 1000, timezone.utc)
-        date_published_loc = date_published.astimezone(
-            timezone(offset=timedelta(hours=8))  # HK time
-        )
-        try:
-            df = date_published_loc.strftime('%-I:%M%p, %-d %b, %Y')
-        except Exception:
-            df = ''
-
-        html_output = f"""<html><head><title>{content["headline"]}</title></head>
-        <body>
-            <article>
-            <h1 class="headline">{content["headline"]}</h1>
-            <div class="sub-headline"></div>
-            <div class="article-meta">
-                <span class="author">{", ".join(authors)}</span>
-                <span class="published-dt">{df}</span>
-            </div>
-            </article>
-        </body></html>
-        """
-
-        new_soup = BeautifulSoup(html_output, "html.parser")
-        # sub headline
-        for c in content.get("subHeadline", {}).get("json", []):
-            ele = new_soup.new_tag(c["type"])
-            self._extract_child_nodes(c.get("children", []), ele, new_soup)
-            new_soup.find(class_="sub-headline").append(ele)
-
-        # article content
-        for node in body["json"]:
-            if node["type"] not in ["p", "div"]:
-                continue
-            new_ele = new_soup.new_tag(node["type"])
-            new_ele.string = ""
-            if node.get("children"):
-                self._extract_child_nodes(node["children"], new_ele, new_soup)
-            new_soup.article.append(new_ele)
-
-        return str(new_soup)
+    def preprocess_html(self, soup):
+        from urllib.parse import urlparse
+        for img in soup.findAll('img', attrs={'src':True}):
+            y = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
+            img['src'] = y + urlparse(img['src']).path
+        for img in soup.findAll('img', attrs={'title':True}):
+            div = soup.new_tag('div', attrs={'style':'text-align:center; font-size:small;'})
+            div.string = img['title']
+            img.find_parent('div').append(div)
+        return soup

From cd126ea6588cb19dcfe257bf47c14512d50e510d Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 8 Sep 2024 19:23:47 +0530
Subject: [PATCH 2/4] SCMP: move masthead_url and get_cover_url up, trim blank lines

---
 recipes/scmp.recipe | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe
index 14652183c2..a7ee802969 100644
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@@ -22,7 +22,6 @@ def E(parent, name, text='', **attrs):
     parent.append(ans)
     return ans
 
-
 def process_node(node, html_parent):
     ntype = node.get('type')
     
@@ -48,11 +47,9 @@ def ts_date(x):
     dt = datetime.fromtimestamp(x/1000 + time.timezone)
     return dt.strftime('%b %d, %Y at %I:%M %p')
 
-
 def auth(x):
     return ', '.join([a['name'] for a in x])
 
-
 def load_article_from_json(raw, root):
     # open('/t/raw.json', 'w').write(raw)
     data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
@@ -103,6 +100,11 @@ class SCMP(BasicNewsRecipe):
     publication_type = "newspaper"
     ignore_duplicate_articles = {"title", "url"}
     extra_css = 'blockquote, em { color: #202020; }'
+    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
+
+    def get_cover_url(self):
+        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
+        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
 
     recipe_specific_options = {
         'days': {
@@ -118,10 +120,6 @@ class SCMP(BasicNewsRecipe):
         if d and isinstance(d, str):
             self.oldest_article = float(d)
 
-    def get_cover_url(self):
-        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
-        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
-
     # used when unable to extract article from <script>, particularly in the Sports section
     remove_tags = [
         dict(
@@ -154,8 +152,6 @@ class SCMP(BasicNewsRecipe):
     def print_version(self, url):
         return url.split('?')[0]
 
-    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
-
     def preprocess_raw_html(self, raw_html, url):
         body = '<html><body><article></article></body></html>'
         b_root = parse(body)

From cf5c4a7d15777c80b08a69987282657e419ade2a Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 8 Sep 2024 20:24:17 +0530
Subject: [PATCH 3/4] SCMP: add reading time to the article byline

---
 recipes/scmp.recipe | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe
index a7ee802969..ba3abf18d0 100644
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@@ -62,7 +62,8 @@ def load_article_from_json(raw, root):
     # E(article, 'p', replace_entities(data['subHeadline']['text']), style='font-style: italic; color:#202020;')
     for subh in data['subHeadline']['json']:
         process_node(subh, article)
-    E(article, 'p', ts_date(data['publishedDate']) + ' | ' + auth(data['authors']), style='color: #202020; font-size:small;')
+    byline = ts_date(data['publishedDate']) + ' | ' + str(data.get('readingTime') or '') + ' min read | ' + auth(data['authors'])
+    E(article, 'p', byline, style='color: #202020; font-size:small;')
     main_image_url = sub_img = ''
     for l in data['images']:
         if l['type'] == 'leading':

From 57a9682a71a3614323849808b234d00fca786da7 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 8 Sep 2024 21:26:05 +0530
Subject: [PATCH 4/4] Update TLS

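TLS: book details were missing; read them from the single-article JSON response.
SCMP: retag h2/h3 in the rebuilt article tree (b_root), which is what gets returned.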
---
 recipes/scmp.recipe    |  2 +-
 recipes/tls_mag.recipe | 27 ++++++++++++++-------------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/recipes/scmp.recipe b/recipes/scmp.recipe
index ba3abf18d0..fcaba61257 100644
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@@ -163,7 +163,7 @@ class SCMP(BasicNewsRecipe):
                 load_article_from_json(script[0].text, b_root)
             except Exception:
                 return raw_html
-            head = root.xpath('//h2') + root.xpath('//h3')    
+            head = b_root.xpath('//h2') + b_root.xpath('//h3')
             for h2 in head:
                 h2.tag = 'h4'
             raw = etree.tostring(b_root, encoding='unicode')
diff --git a/recipes/tls_mag.recipe b/recipes/tls_mag.recipe
index 1b301d15c0..f3c019685d 100644
--- a/recipes/tls_mag.recipe
+++ b/recipes/tls_mag.recipe
@@ -118,18 +118,6 @@ class tls(BasicNewsRecipe):
         else:
             prim = title = desc = label = auth = lede = ''
 
-        bks = ''
-        if 'bookdetails' in data and data['bookdetails']:
-            bks += '<br>'
-            for a in data['bookdetails']:
-                for x, y in a.items():
-                    if isinstance(y, str):
-                        if x == 'imageurl':
-                            bks += '<img src="{}">'.format(y)
-                        elif y:
-                            bks += '<div class="det">' + y + '</div>\n'
-                bks += '<br>'
-
         if 'article_data_leadimage' in data:
             i = data['article_data_leadimage']
             if 'full_image' in i and i['full_image']:
@@ -138,7 +126,20 @@ class tls(BasicNewsRecipe):
                         + i['imagecredit'] + '</i>'
                 )
         cont = self.index_to_soup('https://www.the-tls.co.uk/wp-json/tls/v2/single-article/' + data['ID'], raw=True)
-        body = json.loads(cont)['content']
+        c_data = json.loads(cont)
+        body = c_data['content']
+
+        bks = ''
+        if 'bookdetails' in c_data and c_data['bookdetails']:
+            bks += '<br>'
+            for a in c_data['bookdetails']:
+                for x, y in a.items():
+                    if isinstance(y, str):
+                        if x == 'imageurl':
+                            bks += '<img src="{}">'.format(y)
+                        elif y:
+                            bks += '<div class="det">' + y + '</div>\n'
+                bks += '<br>'
 
         html = '<html><body><div>' \
                     + label + title + desc + auth + lede + bks + body + \