| """
 | |
| scmp.com
 | |
| """
 | |
| 
 | |
| import json
 | |
| import re
 | |
| from datetime import datetime, timedelta, timezone
 | |
| 
 | |
| from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | |
| from calibre.web.feeds.news import BasicNewsRecipe, classes
 | |
| 
 | |
| 
 | |
class SCMP(BasicNewsRecipe):
    title = "South China Morning Post"
    __author__ = "llam"
    description = (
        "SCMP.com, Hong Kong's premier online English daily, provides exclusive"
        " up-to-date news, audio and video news, podcasts, RSS feeds, blogs,"
        " breaking news, top stories, and award-winning news and analysis on"
        " Hong Kong and China."
    )
    publisher = "South China Morning Post Publishers Ltd."
    oldest_article = 1
    max_articles_per_feed = 25
    no_stylesheets = True
    remove_javascript = True
    encoding = "utf-8"
    use_embedded_content = False
    language = "en"
    remove_empty_feeds = True
    publication_type = "newspaper"
    auto_cleanup = False
    compress_news_images = True
    ignore_duplicate_articles = {"title", "url"}

    # used when the article cannot be extracted from <script>, particularly in the Sports section
    remove_tags = [
        dict(
            classes(
                "sticky-wrap relative social-media social-media--extended__shares"
                " article-body-comment scmp_button_comment_wrapper social-media--extended__in-site"
                " footer scmp-advert-tile sidebar-col related-article share-widget"
            )
        ),
        dict(attrs={"addthis_title": True}),
        dict(name=["script", "style"]),
    ]
    remove_attributes = ["style", "font"]

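    # styles for the article HTML rebuilt in preprocess_raw_html() below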
    extra_css = """
    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; }
    .sub-headline ul { padding-left: 1rem; }
    .sub-headline ul li { margin-bottom: 0.8rem; }
    .article-meta, .article-header__publish { padding-bottom: 0.5rem; }
    .article-meta .author { text-transform: uppercase; font-weight: bold; }
    .article-meta .published-dt { margin-left: 0.5rem; }
    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
    .article-img img, .carousel__slide img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box; }
    .article-img .caption, .article-caption { font-size: 0.8rem; }
    """

    # https://www.scmp.com/rss
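    # Each entry is a (section title, RSS URL) tuple; BasicNewsRecipe downloads
    # these feeds, honouring oldest_article and max_articles_per_feed above.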
    feeds = [
        ("Hong Kong", "https://www.scmp.com/rss/2/feed"),
        ("China", "https://www.scmp.com/rss/4/feed"),
        ("Asia", "https://www.scmp.com/rss/3/feed"),
        ("World", "https://www.scmp.com/rss/5/feed"),
        ("Business", "https://www.scmp.com/rss/92/feed"),
        ("Tech", "https://www.scmp.com/rss/36/feed"),
        ("Life", "https://www.scmp.com/rss/94/feed"),
        ("Culture", "https://www.scmp.com/rss/322296/feed"),
        ("Sport", "https://www.scmp.com/rss/95/feed"),
        ("Post Mag", "https://www.scmp.com/rss/71/feed"),
        ("Style", "https://www.scmp.com/rss/72/feed"),
    ]

    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

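    # Try the dated kiosko.net front-page scan first; if that URL fails to
    # open, fall back to scraping the kiosko.net index page for any 750px
    # SCMP cover image.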
    def get_cover_url(self):
        from datetime import date
        cover = date.today().strftime('https://img.kiosko.net/%Y/%m/%d/cn/scmp.750.jpg')
        br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
        try:
            br.open(cover)
        except Exception:
            index = 'https://es.kiosko.net/cn/np/scmp.html'
            soup = self.index_to_soup(index)
            for image in soup.findAll('img', src=True):
                if image['src'].endswith('750.jpg'):
                    return 'https:' + image['src']
            self.log('\nCover unavailable')
            cover = None
        return cover

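    # Recursively renders SCMP's nested JSON node tree into HTML appended to
    # `ele`: text nodes are copied verbatim, iframes become a link to their
    # src, and <img> nodes gain a caption <span> built from their alt/title.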
    def _extract_child_nodes(self, children, ele, soup, level=1):
        if not children:
            return

        child_html = ""
        for child in children:
            if child.get("type", "") == "text":
                child_html += child["data"]
            else:
                if child["type"] == "iframe":
                    # change iframe to <span> with the src linked
                    new_ele = soup.new_tag("span")
                    new_ele["class"] = f'embed-{child["type"]}'
                    iframe_src = child.get("attribs", {}).get("src")
                    a_tag = soup.new_tag("a")
                    a_tag["href"] = iframe_src
                    a_tag.string = f"[Embed: {iframe_src}]"
                    new_ele.append(a_tag)
                else:
                    new_ele = soup.new_tag(child["type"])
                    for k, v in child.get("attribs", {}).items():
                        if k.startswith("data-"):
                            continue
                        new_ele[k] = v
                    if child.get("children"):
                        self._extract_child_nodes(
                            child["children"], new_ele, soup, level + 1
                        )
                child_html += str(new_ele)
                if child["type"] == "img":
                    # generate a caption <span> tag for <img>
                    caption_text = child.get("attribs", {}).get("alt") or child.get(
                        "attribs", {}
                    ).get("title")
                    if caption_text:
                        new_ele = soup.new_tag("span")
                        new_ele.append(caption_text)
                        new_ele["class"] = "caption"
                        child_html += str(new_ele)
                    ele["class"] = "article-img"
        ele.append(BeautifulSoup(child_html))

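    # SCMP article pages embed their content as JSON in a
    # window.__APOLLO_STATE__ <script>, so the article HTML is rebuilt from
    # that JSON rather than scraped from the rendered DOM.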
    def preprocess_raw_html(self, raw_html, url):
        article = None
        soup = BeautifulSoup(raw_html)

        for script in soup.find_all("script"):
            if not script.contents:
                continue
            if not script.contents[0].startswith("window.__APOLLO_STATE__"):
                continue
            article_js = re.sub(
                r"window\.__APOLLO_STATE__\s*=\s*", "", script.contents[0].strip()
            )
            if article_js.endswith(";"):
                article_js = article_js[:-1]
            try:
                article = json.loads(article_js)
                break
            except json.JSONDecodeError:
                self.log.exception("Unable to parse __APOLLO_STATE__")

        if not (article and article.get("contentService")):
            # Sometimes the page does not have article content in the <script>,
            # particularly in the Sports section, so we fall back to the
            # raw_html and rely on remove_tags to clean it up
            self.log(f"Unable to find article from script in {url}")
            return raw_html

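        # Illustrative shape of the Apollo state (inferred from the lookups
        # below, not from any published schema):
        #   {"contentService": {
        #       "ROOT_QUERY": {"content(...)": {"id": "<node id>"}},
        #       "<node id>": {"headline": ..., "authors": [{"id": ...}, ...],
        #                     "publishedDate": <epoch ms>,
        #                     "body(...)": {"type": "json", "json": [...]}}}}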
        content_service = article.get("contentService")
        content_node_id = None
        for k, v in content_service["ROOT_QUERY"].items():
            if not k.startswith("content"):
                continue
            content_node_id = v["id"]
            break
        content = content_service.get(content_node_id)
        if not content:
            self.log(f"Unable to find content node in {url}")
            return raw_html

        if content.get("sponsorType"):
            # skip sponsored articles
            self.abort_article(f"Sponsored article: {url}")

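        # The body sits under a parameterised key such as 'body({...})' whose
        # value is typed "json"; the exact parameters vary, hence the prefix
        # match.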
        body = None
        for k, v in content.items():
            if (not k.startswith("body(")) or v.get("type", "") != "json":
                continue
            body = v
        if body is None:
            self.log(f"Unable to find article body in {url}")
            return raw_html

        authors = [content_service[a["id"]]["name"] for a in content["authors"]]
        date_published = datetime.fromtimestamp(
            content["publishedDate"] / 1000, timezone.utc
        )
        date_published_loc = date_published.astimezone(
            timezone(offset=timedelta(hours=8))  # HK time
        )
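        # '%-I' and '%-d' (unpadded) are platform-specific strftime extensions
        # that fail on e.g. Windows, hence the empty-string fallback.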
        try:
            df = date_published_loc.strftime('%-I:%M%p, %-d %b, %Y')
        except Exception:
            df = ''

        html_output = f"""<html><head><title>{content["headline"]}</title></head>
        <body>
            <article>
            <h1 class="headline">{content["headline"]}</h1>
            <div class="sub-headline"></div>
            <div class="article-meta">
                <span class="author">{", ".join(authors)}</span>
                <span class="published-dt">{df}</span>
            </div>
            </article>
        </body></html>
        """

        new_soup = BeautifulSoup(html_output, "html.parser")
        # sub-headline
        for c in content.get("subHeadline", {}).get("json", []):
            ele = new_soup.new_tag(c["type"])
            self._extract_child_nodes(c.get("children", []), ele, new_soup)
            new_soup.find(class_="sub-headline").append(ele)

        # article content
        for node in body["json"]:
            if node["type"] not in ["p", "div"]:
                continue
            new_ele = new_soup.new_tag(node["type"])
            new_ele.string = ""
            if node.get("children"):
                self._extract_child_nodes(node["children"], new_ele, new_soup)
            new_soup.article.append(new_ele)

        return str(new_soup)