From 931ee9867ab1e73b478f707e6722adc327414d07 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 8 Sep 2024 19:09:57 +0530 Subject: [PATCH 1/4] Update SCMP --- recipes/icons/scmp.png | Bin 1081 -> 180 bytes recipes/scmp.recipe | 284 +++++++++++++++++------------------------ 2 files changed, 114 insertions(+), 170 deletions(-) diff --git a/recipes/icons/scmp.png b/recipes/icons/scmp.png index a8ce4677381c9645a330a9557ff837dabe034b38..f8663e55ab6de9c95e14c3868f7cf10a6dc6d495 100644 GIT binary patch delta 163 zcmdnVv4wGhWIZzj1B1(wu46z-CBP@d)xE&yA1e5Niitri`2YX^(`K#t|7hC3$5VyY z-R%RaXDbQv3ubuVP+#8IBRK^81r+a#OpBZEMe_!7C-HS~y{I2o;88!Y;!T9Zx@mpoS zSElU8J>07rtfiVk_hMm;Z}E@)O)#1=x8`7O&BEN2fVnX~U4MS6b!!HyO$=d;u>#J1 zCJ^Znu;1a&otQxbyKSaa$h7 zXI*M&k`Mou-k!NXVG>&orAcNw0bvrGGbz%kGoOGkiLLWG>j;!*2LgXqW*q@yk3qm| z!CV{q|L0>YV1GmRYyN~{*Xal4dzgPU-+y*IoGMCFvd@S#TE)WIP|3) zPJW|+?H^BJ?!Q5_dDo}YIODA%4&(ouJe*Xab?=Ykpcivk?Vcz^g0U?gNn*d}vRLRQ z7xUiWU^njD_qhyqc`}V1`CBmHsOFdif-x+{ zYZy*rzY>zo{W$l%63k`4-6x;gIB#UVF+-&K2E2?qZkB#7Bewor|@MEb5#g0MA zF9KEzdP!=-`{H;f)`6ePJy@D#O&~Ug2UKtwl79$%Q;$mLkcuAod&M&X!O@;HpS9Gw~<=9mhs zM>*HHn{&C~XDm-5loIEBP^N*hB5=#M^)MH61Xulmxq~+5^n|c4rQG^~IPU+s5mi7w zdVew8hOWU8EPj(i1*HJ`SZgs7GME2O1CH+$cq7>Hu_Rt_TCx3oF&x3Uf12sU8h1o6 z*R>%QX~n>BRqCAZW)Y{-CkxQ=7rtM{o^<(vl>P;`O&x!f?gSggI`>dw8u(2r!X-nq z^9g?k=A^7!an47+lEWb{X60y|U5}-Wdw&qNvz4)&coxo zG-I4QcA}a0q0}qi71cYB9`@z?sywR3JMj(g}Xl!{J=wit)AFz9Uc>Ws9f%V|> z0=3|&!^X#Sz, particularly in the Sports section remove_tags = [ dict( @@ -56,22 +134,6 @@ class SCMP(BasicNewsRecipe): dict(attrs={"addthis_title": True}), dict(name=["script", "style"]), ] - remove_attributes = ["style", "font"] - - extra_css = """ - .headline { font-size: 1.8rem; margin-bottom: 0.4rem; } - .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; } - .sub-headline ul { padding-left: 1rem; } - .sub-headline ul li { fmargin-bottom: 0.8rem; } - .article-meta, .article-header__publish { padding-bottom: 0.5rem; } - .article-meta .author { text-transform: uppercase; font-weight: bold; } - .article-meta .published-dt { margin-left: 0.5rem; } - .article-img { margin-bottom: 0.8rem; max-width: 100%; } - .article-img img, .carousel__slide img { - display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto; - box-sizing: border-box; } - .article-img .caption, .article-caption { font-size: 0.8rem; } - """ # https://www.scmp.com/rss feeds = [ @@ -86,156 +148,38 @@ class SCMP(BasicNewsRecipe): ("Sport", "https://www.scmp.com/rss/95/feed"), ("Post Mag", "https://www.scmp.com/rss/71/feed"), ("Style", "https://www.scmp.com/rss/72/feed"), + ("News", 'https://www.scmp.com/rss/91/feed') ] + def print_version(self, url): + return url.split('?')[0] + masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg' - def get_cover_url(self): - from datetime import date - cover = 'https://img.kiosko.net/' + str( - date.today().year - ) + '/' + date.today().strftime('%m') + '/' + date.today( - ).strftime('%d') + '/cn/scmp.750.jpg' - br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) - try: - br.open(cover) - except: - index = 'https://es.kiosko.net/cn/np/scmp.html' - soup = self.index_to_soup(index) - for image in soup.findAll('img', src=True): - if image['src'].endswith('750.jpg'): - return 'https:' + image['src'] - self.log("\nCover unavailable") - cover = None - return cover - - def _extract_child_nodes(self, children, ele, soup, level=1): - if not children: - return - - child_html = "" - for child in children: - if child.get("type", "") == "text": - child_html += child["data"] - else: - if child["type"] == "iframe": - # change iframe to with the src linked - new_ele = soup.new_tag("span") - new_ele["class"] = f'embed-{child["type"]}' - iframe_src = child.get("attribs", {}).get("src") - a_tag = soup.new_tag("a") - a_tag["href"] = iframe_src - a_tag.string = f"[Embed: {iframe_src}]" - new_ele.append(a_tag) - else: - new_ele = soup.new_tag(child["type"]) - for k, v in child.get("attribs", {}).items(): - if k.startswith("data-"): - continue - new_ele[k] = v - if child.get("children"): - self._extract_child_nodes( - child["children"], new_ele, soup, level + 1 - ) - child_html += str(new_ele) - if child["type"] == "img": - # generate a caption tag for - caption_text = child.get("attribs", {}).get("alt") or child.get( - "attribs", {} - ).get("title") - if caption_text: - new_ele = soup.new_tag("span") - new_ele.append(caption_text) - new_ele["class"] = "caption" - child_html += str(new_ele) - ele["class"] = "article-img" - ele.append(BeautifulSoup(child_html)) - def preprocess_raw_html(self, raw_html, url): - article = None - soup = BeautifulSoup(raw_html) - - for script in soup.find_all("script"): - if not script.contents: - continue - if not script.contents[0].startswith("window.__APOLLO_STATE__"): - continue - article_js = re.sub( - r"window.__APOLLO_STATE__\s*=\s*", "", script.contents[0].strip() - ) - if article_js.endswith(";"): - article_js = article_js[:-1] + body = '
' + b_root = parse(body) + root = parse(raw_html) + script = root.xpath('//script[@id="__NEXT_DATA__"]') + if script: try: - article = json.loads(article_js) - break - except json.JSONDecodeError: - self.log.exception("Unable to parse __APOLLO_STATE__") + load_article_from_json(script[0].text, b_root) + except Exception: + return raw_html + head = root.xpath('//h2') + root.xpath('//h3') + for h2 in head: + h2.tag = 'h4' + raw = etree.tostring(b_root, encoding='unicode') + return raw + return raw_html - if not (article and article.get("contentService")): - # Sometimes the page does not have article content in the