Merge branch 'scmp-recipe' of https://github.com/ping/calibre

Kovid Goyal 2022-04-12 07:19:10 +05:30
commit 6b773361c2

"""
scmp.com
"""
import json
import re
from datetime import datetime, timedelta, timezone

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes


class SCMP(BasicNewsRecipe):
    title = "South China Morning Post"
    __author__ = "llam"
    description = (
        "SCMP.com, Hong Kong's premier online English daily, provides"
        " exclusive, up-to-date news, audio and video news, podcasts, RSS"
        " feeds, blogs, breaking news, top stories, and award-winning news"
        " and analysis on Hong Kong and China."
    )
    publisher = "South China Morning Post Publishers Ltd."
    oldest_article = 1
    max_articles_per_feed = 25
    no_stylesheets = True
    remove_javascript = True
    encoding = "utf-8"
    use_embedded_content = False
    language = "en"
    remove_empty_feeds = True
    publication_type = "newspaper"
    auto_cleanup = False
    compress_news_images = True
    ignore_duplicate_articles = {"title", "url"}

    # Used when the article cannot be extracted from the <script> payload,
    # particularly in the Sports section, and we fall back to the raw HTML
    remove_tags = [
        dict(
            classes(
                "sticky-wrap relative social-media social-media--extended__shares"
                " article-body-comment scmp_button_comment_wrapper social-media--extended__in-site"
                " footer scmp-advert-tile sidebar-col related-article share-widget"
            )
        ),
        dict(attrs={"addthis_title": True}),
        dict(name=["script", "style"]),
    ]
    remove_attributes = ["style", "font"]

    extra_css = """
    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; }
    .sub-headline ul { padding-left: 1rem; }
    .sub-headline ul li { margin-bottom: 0.8rem; }
    .article-meta, .article-header__publish { padding-bottom: 0.5rem; }
    .article-meta .author { text-transform: uppercase; font-weight: bold; }
    .article-meta .published-dt { margin-left: 0.5rem; }
    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
    .article-img img, .carousel__slide img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box; }
    .article-img .caption, .article-caption { font-size: 0.8rem; }
    """

    # https://www.scmp.com/rss
    feeds = [
        ("Hong Kong", "https://www.scmp.com/rss/2/feed"),
        ("China", "https://www.scmp.com/rss/4/feed"),
        ("Asia", "https://www.scmp.com/rss/3/feed"),
        ("World", "https://www.scmp.com/rss/5/feed"),
        ("Business", "https://www.scmp.com/rss/92/feed"),
        ("Tech", "https://www.scmp.com/rss/36/feed"),
        ("Life", "https://www.scmp.com/rss/94/feed"),
        ("Culture", "https://www.scmp.com/rss/322296/feed"),
        ("Sport", "https://www.scmp.com/rss/95/feed"),
        ("Post Mag", "https://www.scmp.com/rss/71/feed"),
        ("Style", "https://www.scmp.com/rss/72/feed"),
    ]

    def _extract_child_nodes(self, children, ele, soup, level=1):
        if not children:
            return

        child_html = ""
        for child in children:
            if child.get("type", "") == "text":
                child_html += child["data"]
            else:
                if child["type"] == "iframe":
                    # change iframe to <span> with the src linked
                    new_ele = soup.new_tag("span")
                    new_ele["class"] = f'embed-{child["type"]}'
                    iframe_src = child.get("attribs", {}).get("src")
                    a_tag = soup.new_tag("a")
                    a_tag["href"] = iframe_src
                    a_tag.string = f"[Embed: {iframe_src}]"
                    new_ele.append(a_tag)
                else:
                    new_ele = soup.new_tag(child["type"])
                    for k, v in child.get("attribs", {}).items():
                        if k.startswith("data-"):
                            continue
                        new_ele[k] = v
                    if child.get("children"):
                        self._extract_child_nodes(
                            child["children"], new_ele, soup, level + 1
                        )
                child_html += str(new_ele)
                if child["type"] == "img":
                    # generate a caption <span> tag for <img>
                    caption_text = child.get("attribs", {}).get("alt") or child.get(
                        "attribs", {}
                    ).get("title")
                    if caption_text:  # guard against a literal "None" caption
                        caption_tag = soup.new_tag("span")
                        caption_tag.string = caption_text
                        caption_tag["class"] = "caption"
                        child_html += str(caption_tag)
                    ele["class"] = "article-img"
        ele.append(BeautifulSoup(child_html))
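
    # For orientation: a node in the Apollo "json" body that the method above
    # walks looks roughly like the following. The shape is inferred from the
    # lookups above; the values are illustrative, not from a real payload.
    #
    #   {"type": "p", "children": [
    #       {"type": "text", "data": "Hong Kong ..."},
    #       {"type": "img", "attribs": {"src": "https://...", "alt": "caption"}},
    #   ]}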
    def preprocess_raw_html(self, raw_html, url):
        article = None
        soup = BeautifulSoup(raw_html)
        for script in soup.find_all("script"):
            if not script.text.startswith("window.__APOLLO_STATE__"):
                continue
            article_js = re.sub(
                r"window.__APOLLO_STATE__\s*=\s*", "", script.text.strip()
            )
            if article_js.endswith(";"):
                article_js = article_js[:-1]
            article = json.loads(article_js)
            break
        if not (article and article.get("contentService")):
            # Sometimes the page does not have article content in the <script>,
            # particularly in the Sports section, so we fall back to the
            # raw_html and rely on remove_tags to clean it up
            self.log(f"Unable to find article from script in {url}")
            return raw_html

        content_service = article.get("contentService")
        content_node_id = None
        for k, v in content_service["ROOT_QUERY"].items():
            if not k.startswith("content"):
                continue
            content_node_id = v["id"]
            break
        content = content_service.get(content_node_id)

        if content.get("sponsorType"):
            # skip sponsored articles
            self.abort_article(f"Sponsored article: {url}")

        body = None
        for k, v in content.items():
            if (not k.startswith("body(")) or v.get("type", "") != "json":
                continue
            body = v

        authors = [content_service[a["id"]]["name"] for a in content["authors"]]
        date_published = datetime.utcfromtimestamp(
            content["publishedDate"] / 1000
        ).replace(tzinfo=timezone.utc)
        date_published_loc = date_published.astimezone(
            timezone(offset=timedelta(hours=8))  # HK time
        )

        html_output = f"""<html><head><title>{content["headline"]}</title></head>
        <body>
            <article>
            <h1 class="headline">{content["headline"]}</h1>
            <div class="sub-headline"></div>
            <div class="article-meta">
                <span class="author">{", ".join(authors)}</span>
                <span class="published-dt">
                    {date_published_loc:%-I:%M%p, %-d %b, %Y}
                </span>
            </div>
            </article>
        </body></html>
        """

        new_soup = BeautifulSoup(html_output, "html.parser")
        # sub headline
        for c in content.get("subHeadline", {}).get("json", []):
            ele = new_soup.new_tag(c["type"])
            self._extract_child_nodes(c.get("children", []), ele, new_soup)
            new_soup.find(class_="sub-headline").append(ele)
        # article content
        for node in body["json"]:
            if node["type"] not in ["p", "div"]:
                continue
            new_ele = new_soup.new_tag(node["type"])
            new_ele.string = ""
            if node.get("children"):
                self._extract_child_nodes(node["children"], new_ele, new_soup)
            new_soup.article.append(new_ele)
        return str(new_soup)
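
For reference, the window.__APOLLO_STATE__ parsing in preprocess_raw_html() can be exercised outside calibre. A minimal sketch, assuming a locally saved SCMP article page at article.html (hypothetical path) and a standalone BeautifulSoup 4 install:

# Standalone sketch of the <script> extraction used by preprocess_raw_html().
# "article.html" is a hypothetical saved SCMP article page.
import json
import re

from bs4 import BeautifulSoup

with open("article.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

state = None
for script in soup.find_all("script"):
    if script.text.startswith("window.__APOLLO_STATE__"):
        payload = re.sub(r"window.__APOLLO_STATE__\s*=\s*", "", script.text.strip())
        state = json.loads(payload.rstrip(";"))
        break

if state:
    # list the content node keys that the recipe resolves via ROOT_QUERY
    print([k for k in state.get("contentService", {}) if k.startswith("content")])

The recipe itself can be run end to end with calibre's command line tools, e.g. ebook-convert scmp.recipe output.epub --test, which fetches a couple of articles per feed for a quick check.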