Merge branch 'scmp-recipe' of https://github.com/ping/calibre
commit 6b773361c2

@@ -1,105 +1,197 @@
-'''
+"""
 scmp.com
-'''
-
-from mechanize import Request
-
+"""
 import json
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-
-
-def classes(classes):
-    q = frozenset(classes.split(' '))
-    return dict(attrs={
-        'class': lambda x: x and frozenset(x.split()).intersection(q)})
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
+import re
+from datetime import datetime, timedelta, timezone
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
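The two module-level helpers are dropped rather than moved: `calibre.web.feeds.news` now exports a `classes()` with the same behaviour, and BeautifulSoup 4's native `soup.new_tag()` makes the `new_tag()` shim unnecessary. As a reminder of what the matcher does, here is a minimal sketch built from the deleted definition above; the class names are made up for illustration:

```python
# Same logic as the deleted helper: match any element that shares at
# least one class with the given space-separated list.
def classes(names):
    q = frozenset(names.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

matcher = classes('article-body footer')
predicate = matcher['attrs']['class']
print(bool(predicate('footer dark')))  # True: shares "footer"
print(bool(predicate('header')))       # False: no overlap
```

The returned dict plugs straight into BeautifulSoup-style lookups such as `soup.find(**matcher)`, which is how both the old and new recipe use it.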
 class SCMP(BasicNewsRecipe):
-    title = 'South China Morning Post'
-    __author__ = 'llam'
+    title = "South China Morning Post"
+    __author__ = "llam"
     description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
-    publisher = 'South China Morning Post Publishers Ltd.'
-    oldest_article = 2
-    delay = 1
-    max_articles_per_feed = 200
+    publisher = "South China Morning Post Publishers Ltd."
+    oldest_article = 1
+    max_articles_per_feed = 25
     no_stylesheets = True
-    encoding = 'utf-8'
+    remove_javascript = True
+    encoding = "utf-8"
     use_embedded_content = False
-    language = 'en_CN'
+    language = "en"
     remove_empty_feeds = True
-    needs_subscription = 'optional'
-    publication_type = 'newspaper'
+    publication_type = "newspaper"
+    auto_cleanup = False
+    compress_news_images = True
+    ignore_duplicate_articles = {"title", "url"}
 
-    keep_only_tags = [
-        dict(name='h1'),
-        classes('info__subHeadline article-author main__right'),
-    ]
+    # used when unable to extract article from <script>, particularly in the Sports section
     remove_tags = [
-        dict(name='button')
+        dict(
+            classes(
+                "sticky-wrap relative social-media social-media--extended__shares"
+                " article-body-comment scmp_button_comment_wrapper social-media--extended__in-site"
+                " footer scmp-advert-tile sidebar-col related-article share-widget"
+            )
+        ),
+        dict(attrs={"addthis_title": True}),
+        dict(name=["script", "style"]),
     ]
+    remove_attributes = ["style", "font"]
 
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
-        if self.username is not None and self.password is not None:
-            # br.set_debug_http(True)
-            # br.set_debug_responses(True)
-            # br.set_debug_redirects(True)
-            rq = Request('https://account.scmp.com/login', headers={
-                'Accept': 'application/json, text/plain, */*',
-                'Content-Type': 'application/json;charset=UTF-8',
-                'Referer': 'https://account.scmp.com/login',
-            }, data=json.dumps({'username': self.username, 'password': self.password}))
-            self.log('Sending login request...')
-            try:
-                res = br.open(rq)
-            except Exception as err:
-                if hasattr(err, 'read'):
-                    raise Exception('Login request failed with error: {} and body: {}'.format(err, err.read().decode('utf-8', 'replace')))
-                raise
-            if res.code != 200:
-                raise ValueError('Failed to login, check your username and password')
-            nonce = json.loads(res.read())['nonce']
-            rq = Request('https://www.scmp.com/centralize/signin?nonce=' + nonce, headers={
-                'referer': 'https://account.scmp.com/login',
-                'sec-fetch-mode': 'navigate',
-                'sec-fetch-site': 'same-site',
-                'sec-fetch-user': '?1'})
-            res = br.open(rq)
-            if res.code != 200:
-                raise ValueError('Failed to login, check your username and password')
-        return br
+    extra_css = """
+    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
+    .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; }
+    .sub-headline ul { padding-left: 1rem; }
+    .sub-headline ul li { margin-bottom: 0.8rem; }
+    .article-meta, .article-header__publish { padding-bottom: 0.5rem; }
+    .article-meta .author { text-transform: uppercase; font-weight: bold; }
+    .article-meta .published-dt { margin-left: 0.5rem; }
+    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
+    .article-img img, .carousel__slide img {
+        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
+        box-sizing: border-box; }
+    .article-img .caption, .article-caption { font-size: 0.8rem; }
+    """
 
+    # https://www.scmp.com/rss
     feeds = [
-        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
-        ('China', 'https://www.scmp.com/rss/4/feed'),
-        ('Asia', 'https://www.scmp.com/rss/3/feed'),
-        ('World', 'https://www.scmp.com/rss/5/feed'),
-        ('Business', 'https://www.scmp.com/rss/92/feed'),
-        ('Tech', 'https://www.scmp.com/rss/36/feed'),
-        ('Life', 'https://www.scmp.com/rss/94/feed'),
-        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
-        ('Sport', 'https://www.scmp.com/rss/95/feed'),
-        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
-        ('Style', 'https://www.scmp.com/rss/72/feed'),
+        ("Hong Kong", "https://www.scmp.com/rss/2/feed"),
+        ("China", "https://www.scmp.com/rss/4/feed"),
+        ("Asia", "https://www.scmp.com/rss/3/feed"),
+        ("World", "https://www.scmp.com/rss/5/feed"),
+        ("Business", "https://www.scmp.com/rss/92/feed"),
+        ("Tech", "https://www.scmp.com/rss/36/feed"),
+        ("Life", "https://www.scmp.com/rss/94/feed"),
+        ("Culture", "https://www.scmp.com/rss/322296/feed"),
+        ("Sport", "https://www.scmp.com/rss/95/feed"),
+        ("Post Mag", "https://www.scmp.com/rss/71/feed"),
+        ("Style", "https://www.scmp.com/rss/72/feed"),
     ]
 
-    def preprocess_html(self, soup):
-        for img in soup.findAll("img", attrs={'data-original': True}):
-            img['src'] = img['data-original']
-        meta = soup.find('meta', attrs={'name': 'twitter:image:src'}, content=True)
-        if meta is not None:
-            wrapper = soup.find(**classes('image-wrapper__placeholder'))
-            if wrapper is not None:
-                p = wrapper.parent
-                img = new_tag(soup, 'img')
-                img['src'] = meta['content']
-                p.append(img)
-                wrapper.extract()
-        return soup
+    def _extract_child_nodes(self, children, ele, soup, level=1):
+        if not children:
+            return
+
+        child_html = ""
+        for child in children:
+            if child.get("type", "") == "text":
+                child_html += child["data"]
+            else:
+                if child["type"] == "iframe":
+                    # change iframe to <span> with the src linked
+                    new_ele = soup.new_tag("span")
+                    new_ele["class"] = f'embed-{child["type"]}'
+                    iframe_src = child.get("attribs", {}).get("src")
+                    a_tag = soup.new_tag("a")
+                    a_tag["href"] = iframe_src
+                    a_tag.string = f"[Embed: {iframe_src}]"
+                    new_ele.append(a_tag)
+                else:
+                    new_ele = soup.new_tag(child["type"])
+                    for k, v in child.get("attribs", {}).items():
+                        if k.startswith("data-"):
+                            continue
+                        new_ele[k] = v
+                    if child.get("children"):
+                        self._extract_child_nodes(
+                            child["children"], new_ele, soup, level + 1
+                        )
+                child_html += str(new_ele)
+                if child["type"] == "img":
+                    # generate a caption <span> tag for <img>
+                    caption_text = child.get("attribs", {}).get("alt") or child.get(
+                        "attribs", {}
+                    ).get("title")
+                    caption_tag = soup.new_tag("span")
+                    caption_tag.string = caption_text
+                    caption_tag["class"] = "caption"
+                    child_html += str(caption_tag)
+                    ele["class"] = "article-img"
+        ele.append(BeautifulSoup(child_html))
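The replacement for `preprocess_html` builds article HTML out of SCMP's JSON node tree instead of fixing up lazy-loaded images in the rendered page. The payload shape is not documented anywhere in the diff, so the example below is a hypothetical node of the form the walker expects: text nodes carry `data`, element nodes carry `attribs` and optional `children`.

```python
# Hypothetical body node, shaped the way _extract_child_nodes expects it;
# all values here are invented for illustration.
node = {
    "type": "p",
    "children": [
        {"type": "text", "data": "Pictured: "},
        {
            "type": "img",
            # "alt" doubles as the generated caption text
            "attribs": {"src": "https://cdn.example.com/photo.jpg",
                        "alt": "Victoria Harbour"},
        },
    ],
}
# Passed through the walker, this renders roughly as:
#   <p class="article-img">Pictured: <img .../>
#     <span class="caption">Victoria Harbour</span></p>
```

iframes get similar treatment but are rewritten to a plain `<a>` link labelled `[Embed: …]`, since embedded players are useless in an e-book.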
+
+    def preprocess_raw_html(self, raw_html, url):
+        article = None
+        soup = BeautifulSoup(raw_html)
+
+        for script in soup.find_all("script"):
+            if not script.text.startswith("window.__APOLLO_STATE__"):
+                continue
+            article_js = re.sub(
+                r"window.__APOLLO_STATE__\s*=\s*", "", script.text.strip()
+            )
+            if article_js.endswith(";"):
+                article_js = article_js[:-1]
+            article = json.loads(article_js)
+            break
+
+        if not (article and article.get("contentService")):
+            # Sometimes the page does not have article content in the <script>,
+            # particularly in the Sports section, so we fall back to
+            # raw_html and rely on remove_tags to clean it up
+            self.log(f"Unable to find article from script in {url}")
+            return raw_html
+
+        content_service = article.get("contentService")
+        content_node_id = None
+        for k, v in content_service["ROOT_QUERY"].items():
+            if not k.startswith("content"):
+                continue
+            content_node_id = v["id"]
+            break
+        content = content_service.get(content_node_id)
+
+        if content.get("sponsorType"):
+            # skip sponsored articles
+            self.abort_article(f"Sponsored article: {url}")
+
+        body = None
+        for k, v in content.items():
+            if (not k.startswith("body(")) or v.get("type", "") != "json":
+                continue
+            body = v
+
+        authors = [content_service[a["id"]]["name"] for a in content["authors"]]
+        date_published = datetime.utcfromtimestamp(
+            content["publishedDate"] / 1000
+        ).replace(tzinfo=timezone.utc)
+        date_published_loc = date_published.astimezone(
+            timezone(offset=timedelta(hours=8))  # HK time
+        )
+
+        html_output = f"""<html><head><title>{content["headline"]}</title></head>
+        <body>
+            <article>
+            <h1 class="headline">{content["headline"]}</h1>
+            <div class="sub-headline"></div>
+            <div class="article-meta">
+                <span class="author">{", ".join(authors)}</span>
+                <span class="published-dt">
+                    {date_published_loc:%-I:%M%p, %-d %b, %Y}
+                </span>
+            </div>
+            </article>
+        </body></html>
+        """
+
+        new_soup = BeautifulSoup(html_output, "html.parser")
+        # sub headline
+        for c in content.get("subHeadline", {}).get("json", []):
+            ele = new_soup.new_tag(c["type"])
+            self._extract_child_nodes(c.get("children", []), ele, new_soup)
+            new_soup.find(class_="sub-headline").append(ele)
+
+        # article content
+        for node in body["json"]:
+            if node["type"] not in ["p", "div"]:
+                continue
+            new_ele = new_soup.new_tag(node["type"])
+            new_ele.string = ""
+            if node.get("children"):
+                self._extract_child_nodes(node["children"], new_ele, new_soup)
+            new_soup.article.append(new_ele)
+
+        return str(new_soup)
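`preprocess_raw_html` rebuilds each page from the `window.__APOLLO_STATE__` blob instead of scraping the rendered DOM. The cache layout is inferred from the traversal above, not from any SCMP documentation; a hypothetical minimal payload that would satisfy it looks like this:

```python
# Hypothetical, heavily trimmed Apollo cache; real payloads carry many
# more fields. The keys mirror what the traversal above looks for.
apollo_state = {
    "contentService": {
        # ROOT_QUERY maps a content(...) query key to the article node id
        "ROOT_QUERY": {
            'content({"id":"123"})': {"id": "Article:123"},
        },
        "Article:123": {
            "headline": "Example headline",
            "publishedDate": 1650000000000,  # epoch milliseconds
            "authors": [{"id": "Author:1"}],
            "sponsorType": "",               # non-empty => article skipped
            'body({"format":"json"})': {"type": "json", "json": []},
            "subHeadline": {"json": []},
        },
        "Author:1": {"name": "A. Reporter"},
    }
}
```

Two small notes on the new code: the `%-I`/`%-d` strftime specifiers are glibc extensions and will not work on Windows, and a recipe like this can be smoke-tested with something along the lines of `ebook-convert scmp.recipe out.epub --test -vv` (filename assumed).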