Merge branch 'scmp-recipe' of https://github.com/ping/calibre

This commit is contained in:
Kovid Goyal 2022-04-12 07:19:10 +05:30
commit 6b773361c2

@@ -1,105 +1,197 @@
''' """
scmp.com scmp.com
''' """
from mechanize import Request
import json import json
from calibre.web.feeds.news import BasicNewsRecipe import re
from calibre.ebooks.BeautifulSoup import Tag from datetime import datetime, timedelta, timezone
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def classes(classes): from calibre.web.feeds.news import BasicNewsRecipe, classes
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})
def new_tag(soup, name, attrs=()):
impl = getattr(soup, 'new_tag', None)
if impl is not None:
return impl(name, attrs=dict(attrs))
return Tag(soup, name, attrs=attrs or None)

class SCMP(BasicNewsRecipe):
    title = "South China Morning Post"
    __author__ = "llam"
    description = "SCMP.com, Hong Kong's premier online English daily provides exclusive up-to-date news, audio video news, podcasts, RSS Feeds, Blogs, breaking news, top stories, award winning news and analysis on Hong Kong and China."  # noqa
    publisher = "South China Morning Post Publishers Ltd."
    oldest_article = 1
    max_articles_per_feed = 25
    no_stylesheets = True
    remove_javascript = True
    encoding = "utf-8"
    use_embedded_content = False
    language = "en"
    remove_empty_feeds = True
    publication_type = "newspaper"
    auto_cleanup = False
    compress_news_images = True
    ignore_duplicate_articles = {"title", "url"}

    # used when unable to extract article from <script>, particularly in the Sports section
    remove_tags = [
        dict(
            classes(
                "sticky-wrap relative social-media social-media--extended__shares"
                " article-body-comment scmp_button_comment_wrapper social-media--extended__in-site"
                " footer scmp-advert-tile sidebar-col related-article share-widget"
            )
        ),
        dict(attrs={"addthis_title": True}),
        dict(name=["script", "style"]),
    ]
    remove_attributes = ["style", "font"]
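
    # classes() builds an attrs matcher that hits any element whose class
    # attribute shares at least one name with the given space-separated list
    # (it wraps dict(attrs={"class": lambda x: ...})), so each entry above
    # covers several page-widget variants at once.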

    extra_css = """
    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .sub-headline { font-size: 1rem; margin-bottom: 1.5rem; }
    .sub-headline ul { padding-left: 1rem; }
    .sub-headline ul li { margin-bottom: 0.8rem; }
    .article-meta, .article-header__publish { padding-bottom: 0.5rem; }
    .article-meta .author { text-transform: uppercase; font-weight: bold; }
    .article-meta .published-dt { margin-left: 0.5rem; }
    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
    .article-img img, .carousel__slide img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box; }
    .article-img .caption, .article-caption { font-size: 0.8rem; }
    """

    # https://www.scmp.com/rss
    feeds = [
        ("Hong Kong", "https://www.scmp.com/rss/2/feed"),
        ("China", "https://www.scmp.com/rss/4/feed"),
        ("Asia", "https://www.scmp.com/rss/3/feed"),
        ("World", "https://www.scmp.com/rss/5/feed"),
        ("Business", "https://www.scmp.com/rss/92/feed"),
        ("Tech", "https://www.scmp.com/rss/36/feed"),
        ("Life", "https://www.scmp.com/rss/94/feed"),
        ("Culture", "https://www.scmp.com/rss/322296/feed"),
        ("Sport", "https://www.scmp.com/rss/95/feed"),
        ("Post Mag", "https://www.scmp.com/rss/71/feed"),
        ("Style", "https://www.scmp.com/rss/72/feed"),
    ]
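
    # Each entry is (section title, RSS feed URL); BasicNewsRecipe downloads
    # and parses these feeds itself, then hands each article's HTML to
    # preprocess_raw_html() below.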

    def _extract_child_nodes(self, children, ele, soup, level=1):
        if not children:
            return

        child_html = ""
        for child in children:
            if child.get("type", "") == "text":
                child_html += child["data"]
            else:
                if child["type"] == "iframe":
                    # change iframe to <span> with the src linked
                    new_ele = soup.new_tag("span")
                    new_ele["class"] = f'embed-{child["type"]}'
                    iframe_src = child.get("attribs", {}).get("src")
                    a_tag = soup.new_tag("a")
                    a_tag["href"] = iframe_src
                    a_tag.string = f"[Embed: {iframe_src}]"
                    new_ele.append(a_tag)
                else:
                    new_ele = soup.new_tag(child["type"])
                    for k, v in child.get("attribs", {}).items():
                        if k.startswith("data-"):
                            continue
                        new_ele[k] = v
                    if child.get("children"):
                        self._extract_child_nodes(
                            child["children"], new_ele, soup, level + 1
                        )
                child_html += str(new_ele)
                if child["type"] == "img":
                    # generate a caption <span> tag for <img>
                    caption_text = child.get("attribs", {}).get("alt") or child.get(
                        "attribs", {}
                    ).get("title")
                    caption_tag = soup.new_tag("span")
                    caption_tag.string = caption_text
                    caption_tag["class"] = "caption"
                    child_html += str(caption_tag)
                    ele["class"] = "article-img"
        ele.append(BeautifulSoup(child_html))
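
    # Illustrative shape of the nodes consumed above (inferred from the code,
    # not captured from a live SCMP payload):
    #   {"type": "p", "children": [
    #       {"type": "text", "data": "Officials said ..."},
    #       {"type": "img", "attribs": {"src": "https://...", "alt": "A caption"}},
    #   ]}
    # A "text" child is appended verbatim; an "img" child additionally gets a
    # <span class="caption"> built from its alt/title attribute.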

    def preprocess_raw_html(self, raw_html, url):
        article = None
        soup = BeautifulSoup(raw_html)
        for script in soup.find_all("script"):
            if not script.text.startswith("window.__APOLLO_STATE__"):
                continue
            article_js = re.sub(
                r"window.__APOLLO_STATE__\s*=\s*", "", script.text.strip()
            )
            if article_js.endswith(";"):
                article_js = article_js[:-1]
            article = json.loads(article_js)
            break
        if not (article and article.get("contentService")):
            # Sometimes the page does not have article content in the <script>,
            # particularly in the Sports section, so we fall back to the
            # raw_html and rely on remove_tags to clean it up
            self.log(f"Unable to find article from script in {url}")
            return raw_html

        content_service = article.get("contentService")
        content_node_id = None
        for k, v in content_service["ROOT_QUERY"].items():
            if not k.startswith("content"):
                continue
            content_node_id = v["id"]
            break
        content = content_service.get(content_node_id)

        if content.get("sponsorType"):
            # skip sponsored articles
            self.abort_article(f"Sponsored article: {url}")

        body = None
        for k, v in content.items():
            if (not k.startswith("body(")) or v.get("type", "") != "json":
                continue
            body = v

        authors = [content_service[a["id"]]["name"] for a in content["authors"]]
        date_published = datetime.utcfromtimestamp(
            content["publishedDate"] / 1000
        ).replace(tzinfo=timezone.utc)
        date_published_loc = date_published.astimezone(
            timezone(offset=timedelta(hours=8))  # HK time
        )

        html_output = f"""<html><head><title>{content["headline"]}</title></head>
        <body>
            <article>
            <h1 class="headline">{content["headline"]}</h1>
            <div class="sub-headline"></div>
            <div class="article-meta">
                <span class="author">{", ".join(authors)}</span>
                <span class="published-dt">
                    {date_published_loc:%-I:%M%p, %-d %b, %Y}
                </span>
            </div>
            </article>
        </body></html>
        """

        new_soup = BeautifulSoup(html_output, "html.parser")
        # sub headline
        for c in content.get("subHeadline", {}).get("json", []):
            ele = new_soup.new_tag(c["type"])
            self._extract_child_nodes(c.get("children", []), ele, new_soup)
            new_soup.find(class_="sub-headline").append(ele)

        # article content
        for node in body["json"]:
            if node["type"] not in ["p", "div"]:
                continue
            new_ele = new_soup.new_tag(node["type"])
            new_ele.string = ""
            if node.get("children"):
                self._extract_child_nodes(node["children"], new_ele, new_soup)
            new_soup.article.append(new_ele)

        return str(new_soup)
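
The heart of the new recipe is pulling the article out of the page's embedded Apollo GraphQL state instead of scraping rendered HTML. A minimal standalone sketch of that step follows; the markup and keys are a synthetic stand-in, not a real SCMP response:

import json
import re

# Synthetic stand-in for a fetched page; real SCMP pages embed a much
# larger window.__APOLLO_STATE__ object.
raw_html = """<html><body>
<script>window.__APOLLO_STATE__ = {"contentService":
  {"ROOT_QUERY": {"content({\\"id\\":1})": {"id": "Article:1"}},
   "Article:1": {"headline": "Example headline"}}};</script>
</body></html>"""

m = re.search(
    r"window\.__APOLLO_STATE__\s*=\s*(.*?);?\s*</script>", raw_html, re.DOTALL
)
state = json.loads(m.group(1))
content_service = state["contentService"]
# Resolve the content node the same way the recipe does: follow the first
# "content..." key in ROOT_QUERY to its node id.
node_id = next(
    v["id"] for k, v in content_service["ROOT_QUERY"].items()
    if k.startswith("content")
)
print(content_service[node_id]["headline"])  # Example headline

From the resolved node, the recipe then walks the JSON tree under the body(...) key and rebuilds clean HTML via _extract_child_nodes, which is why the output needs no auto_cleanup.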