Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00

Commit 9ed56a34af: Merge branch 'master' of https://github.com/unkn0w7n/calibre

Binary icon file changed (not shown): 1.1 KiB before, 180 B after.
@@ -1,34 +1,111 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
"""
scmp.com
"""

import json
import re
from datetime import datetime
import time

from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes


def E(parent, name, text='', **attrs):
    # Create a child element under parent, set its text, append and return it.
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans
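# Usage sketch for E() (standalone; behavior follows lxml's makeelement,
# where extra keyword arguments become attributes):
#
#     from lxml import etree
#     body = etree.Element('body')
#     E(body, 'p', 'hello', style='color: gray;')
#     etree.tostring(body)  # b'<body><p style="color: gray;">hello</p></body>'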


def process_node(node, html_parent):
    # Recursively convert one node of the JSON article tree into lxml elements,
    # skipping ad slots and widgets; plain text is attached via .text/.tail.
    ntype = node.get('type')
    if ntype not in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget', 'text'}:
        c = html_parent.makeelement(ntype)
        if ntype != 'p':
            c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text
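# A minimal usage sketch (not part of the recipe): feeding process_node() a
# hand-made node, with field names taken from the code above and invented
# content, shows the lxml tree it builds (note attribs are dropped for <p>):
#
#     from lxml import etree
#     body = etree.Element('body')
#     node = {'type': 'p', 'children': [{'type': 'text', 'data': 'Sample text.'}]}
#     process_node(node, body)
#     etree.tostring(body)  # b'<body><p>Sample text.</p></body>'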


def ts_date(x):
    # x is an epoch timestamp in milliseconds; the time.timezone correction
    # makes fromtimestamp() render the UTC wall-clock time.
    dt = datetime.fromtimestamp(x/1000 + time.timezone)
    return dt.strftime('%b %d, %Y at %I:%M %p')


def auth(x):
    # Join author names from a list of {'name': ...} dicts.
    return ', '.join([a['name'] for a in x])
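# Worked example (assumed input; the output is the UTC rendering produced by
# the time.timezone correction above):
#
#     ts_date(1700000000000)  # -> 'Nov 14, 2023 at 10:13 PM'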


def load_article_from_json(raw, root):
    # open('/t/raw.json', 'w').write(raw)
    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'div', replace_entities(data['firstTopic']['name']), style='color: gray; font-size:small; font-weight:bold;')
    E(article, 'h1', replace_entities(data['headline']))
    # E(article, 'p', replace_entities(data['subHeadline']['text']), style='font-style: italic; color:#202020;')
    for subh in data['subHeadline']['json']:
        process_node(subh, article)
    # Parenthesize the readingTime fallback (the bare `or` bound over the whole
    # concatenation) and guard against a numeric value with str(); also avoid
    # shadowing the module-level auth() helper.
    meta = ts_date(data['publishedDate']) + ' | ' + str(data['readingTime'] or '') + ' min read | ' + auth(data['authors'])
    E(article, 'p', meta, style='color: #202020; font-size:small;')
    main_image_url = sub_img = ''
    for l in data['images']:
        if l['type'] == 'leading':
            main_image_url = l['url']
            sub_img = l['title']
    if main_image_url != '':
        div = E(article, 'div')
        E(div, 'img', src=main_image_url)
        E(div, 'div', sub_img, style='text-align:center; font-size:small;')
    for node in data['body']['json']:
        process_node(node, article)
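# Assumed shape of the JSON payload consumed above, showing only the keys this
# recipe actually reads; the real __NEXT_DATA__ blob carries many more:
#
#     {'props': {'pageProps': {'payload': {'data': {'article': {
#         'firstTopic': {'name': ...},
#         'headline': ...,
#         'subHeadline': {'json': [...]},      # node trees for process_node
#         'publishedDate': ...,                # epoch milliseconds
#         'readingTime': ...,
#         'authors': [{'name': ...}, ...],
#         'images': [{'type': 'leading', 'url': ..., 'title': ...}, ...],
#         'body': {'json': [...]},             # node trees for process_node
#     }}}}}}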


class SCMP(BasicNewsRecipe):
    title = "South China Morning Post"
    __author__ = "unkn0wn"
    description = (
        'The South China Morning Post is a leading news media company that has reported on China and Asia '
        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
        'working together to connect with news consumers around the world. We are committed to informing '
        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
        'and our mission is to “Lead the global conversation about China”.'
    )
    publisher = "South China Morning Post Publishers Ltd."
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    encoding = "utf-8"
    use_embedded_content = False
    language = "en_HK"
    remove_empty_feeds = True
    resolve_internal_links = True
    publication_type = "newspaper"
    ignore_duplicate_articles = {"title", "url"}
    extra_css = 'blockquote, em { color: #202020; }'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

    def get_cover_url(self):
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']

    recipe_specific_options = {
        'days': {
@@ -56,22 +133,6 @@ class SCMP(BasicNewsRecipe):
        dict(attrs={"addthis_title": True}),
        dict(name=["script", "style"]),
    ]

    # https://www.scmp.com/rss
    feeds = [
@@ -86,156 +147,36 @@ class SCMP(BasicNewsRecipe):
        ("Sport", "https://www.scmp.com/rss/95/feed"),
        ("Post Mag", "https://www.scmp.com/rss/71/feed"),
        ("Style", "https://www.scmp.com/rss/72/feed"),
        ("News", 'https://www.scmp.com/rss/91/feed')
    ]

    def print_version(self, url):
        # Drop query parameters from the article URL.
        return url.split('?')[0]

    def preprocess_raw_html(self, raw_html, url):
        # Build a fresh skeleton document, then fill its <article> from the
        # JSON embedded in the page's __NEXT_DATA__ script tag.
        body = '<html><body><article></article></body></html>'
        b_root = parse(body)
        root = parse(raw_html)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            try:
                load_article_from_json(script[0].text, b_root)
            except Exception:
                return raw_html
            # Demote sub-headings so they render smaller than the headline.
            head = b_root.xpath('//h2') + b_root.xpath('//h3')
            for h2 in head:
                h2.tag = 'h4'
            raw = etree.tostring(b_root, encoding='unicode')
            return raw
        return raw_html
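    # Standalone sketch of the extraction step above (the sample page is
    # invented; html5_parser returns an lxml tree, so xpath works directly):
    #
    #     from html5_parser import parse
    #     root = parse('<html><body><script id="__NEXT_DATA__">{"props": {}}</script></body></html>')
    #     root.xpath('//script[@id="__NEXT_DATA__"]')[0].text  # -> '{"props": {}}'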

    def preprocess_html(self, soup):
        from urllib.parse import urlparse
        # Route images through SCMP's cdn-cgi resizer for smaller downloads.
        for img in soup.findAll('img', attrs={'src': True}):
            y = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
            img['src'] = y + urlparse(img['src']).path
        # Surface each image title as a centered caption below the image.
        for img in soup.findAll('img', attrs={'title': True}):
            div = soup.new_tag('div', attrs={'style': 'text-align:center; font-size:small;'})
            div.string = img['title']
            img.find_parent('div').append(div)
        return soup
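    # Worked example of the CDN rewrite above (the source URL is invented):
    #
    #     from urllib.parse import urlparse
    #     y = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
    #     y + urlparse('https://cdn.i-scmp.com/sites/default/files/example.jpg').path
    #     # -> '.../cdn-cgi/image/fit=contain,width=768,format=auto/sites/default/files/example.jpg'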
@@ -118,18 +118,6 @@ class tls(BasicNewsRecipe):
        else:
            prim = title = desc = label = auth = lede = ''

        if 'article_data_leadimage' in data:
            i = data['article_data_leadimage']
            if 'full_image' in i and i['full_image']:
@@ -138,7 +126,20 @@ class tls(BasicNewsRecipe):
                    + i['imagecredit'] + '</i>'
                )
        cont = self.index_to_soup('https://www.the-tls.co.uk/wp-json/tls/v2/single-article/' + data['ID'], raw=True)
        c_data = json.loads(cont)
        body = c_data['content']

        bks = ''
        if 'bookdetails' in c_data and c_data['bookdetails']:
            bks += '<br>'
            for a in c_data['bookdetails']:
                for x, y in a.items():
                    if isinstance(y, str):
                        if x == 'imageurl':
                            bks += '<img src="{}">'.format(y)
                        elif y:
                            bks += '<div class="det">' + y + '</div>\n'
            bks += '<br>'

        html = '<html><body><div>' \
            + label + title + desc + auth + lede + bks + body + \