calibre/recipes/scmp.recipe
2025-01-24 11:14:25 +01:00

199 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
'''
scmp.com
'''
import json
import time
from datetime import datetime
from html5_parser import parse
from lxml import etree
from calibre import replace_entities
from calibre.web.feeds.news import BasicNewsRecipe, classes
def E(parent, name, text='', **attrs):
    '''
    Create a child element called ``name`` and attach it to ``parent``.

    The element is built with ``parent.makeelement`` so it belongs to the same
    tree as ``parent``. Keyword arguments become the element's attributes and
    ``text`` becomes its text content. The new element is appended as the last
    child of ``parent`` and returned so callers can nest further elements.
    '''
    element = parent.makeelement(name, **attrs)
    element.text = text
    parent.append(element)
    return element
def process_node(node, html_parent):
    '''
    Recursively convert one JSON node into markup under ``html_parent``.

    ``node`` is a mapping with a ``type`` key and, depending on the type,
    ``data``, ``attribs`` and ``children`` keys.

    * ``text`` nodes: their ``data`` is entity-decoded and added to the tree,
      either as the tail of the last child of ``html_parent`` or, when there
      are no children yet, as the text of ``html_parent`` itself. Empty or
      missing data is ignored.
    * ``track-viewed-percentage``, ``inline-ad-slot`` and ``inline-widget``
      nodes are dropped together with everything below them.
    * Any other type becomes an element whose tag is the type name. Attributes
      are copied (falsy values become ``''``), except for ``p`` elements which
      get no attributes. Children are processed recursively.
    '''
    kind = node.get('type')

    if kind == 'text':
        content = node.get('data')
        if not content:
            return
        content = replace_entities(content)
        if len(html_parent) == 0:
            # No child elements yet: the text belongs directly to the parent.
            html_parent.text = (html_parent.text or '') + content
        else:
            # Text following an element is stored as that element's tail.
            last_child = html_parent[-1]
            last_child.tail = (last_child.tail or '') + content
        return

    if kind in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget'}:
        return

    element = html_parent.makeelement(kind)
    if kind != 'p':
        cleaned = {}
        for key, value in node.get('attribs', {}).items():
            cleaned[key] = value or ''
        element.attrib.update(cleaned)
    html_parent.append(element)
    for child in node.get('children', ()):
        process_node(child, element)
def ts_date(x):
    '''
    Format a millisecond Unix timestamp as a human readable UTC date.

    :param x: Timestamp in milliseconds since the Unix epoch.
    :return: A string such as ``'Jan 24, 2025 at 10:14 AM'``.

    The conversion is done explicitly in UTC. Shifting a local-time conversion
    by ``time.timezone`` only cancels the local offset while daylight saving
    time is not in effect, so it produces results that are an hour off on
    machines currently observing DST.
    '''
    return time.strftime('%b %d, %Y at %I:%M %p', time.gmtime(x / 1000))
def load_article_from_json(raw, root):
    '''
    Rebuild the article markup inside ``root`` from the page's JSON payload.

    :param raw: JSON text; the article is read from
                ``props.pageProps.payload.data.article``.
    :param root: Parsed HTML tree. Every existing child of its first
                 ``<body>`` is removed and replaced by a single ``<article>``.

    The generated ``<article>`` contains, in order: the first topic name, the
    headline, the sub-headline nodes, a byline (date, reading time, authors),
    the leading image with its caption (if any) and finally the body nodes.
    Missing keys raise the usual ``KeyError``/``TypeError``; the function
    modifies ``root`` in place and returns nothing.
    '''
    article_data = json.loads(raw)['props']['pageProps']['payload']['data']['article']

    # Empty the <body>; iterate over a snapshot because children are removed.
    body = root.xpath('//body')[0]
    for stale in list(body):
        body.remove(stale)

    article = E(body, 'article')
    topic_name = replace_entities(article_data['firstTopic']['name'])
    E(article, 'div', topic_name, style='color: gray; font-size:small; font-weight:bold;')
    E(article, 'h1', replace_entities(article_data['headline']))
    for sub_node in article_data['subHeadline']['json']:
        process_node(sub_node, article)

    # Byline: "<date> | <n> min read | <author>, <author>"
    published = ts_date(article_data['publishedDate'])
    reading_time = str(article_data.get('readingTime', ''))
    author_names = ', '.join([author['name'] for author in article_data['authors']])
    byline = published + ' | ' + reading_time + ' min read | ' + author_names
    E(article, 'p', byline, style='color: #202020; font-size:small;')

    # When several images are marked as leading, the last one is used.
    lead_url = lead_caption = ''
    for image in article_data['images']:
        if image['type'] != 'leading':
            continue
        lead_url = image['url']
        lead_caption = image['title']
    if lead_url != '':
        figure = E(article, 'div')
        E(figure, 'img', src=lead_url)
        E(figure, 'div', lead_caption, style='text-align:center; font-size:small;')

    for body_node in article_data['body']['json']:
        process_node(body_node, article)
class SCMP(BasicNewsRecipe):
    '''
    Recipe for the South China Morning Post (scmp.com).

    Articles are discovered through the site's RSS feeds. For each article the
    recipe first tries to rebuild the content from the JSON embedded in the
    ``__NEXT_DATA__`` script tag; when that is missing or cannot be parsed the
    downloaded HTML is used instead and cleaned up via ``remove_tags``.
    '''

    title = 'South China Morning Post'
    __author__ = 'unkn0wn'
    description = (
        'The South China Morning Post is a leading news media company that has reported on China and Asia '
        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
        'working together to connect with news consumers around the world. We are committed to informing '
        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
        'and our mission is to “Lead the global conversation about China”.'
    )
    publisher = 'South China Morning Post Publishers Ltd.'
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_HK'
    remove_empty_feeds = True
    resolve_internal_links = True
    publication_type = 'newspaper'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = 'blockquote, em { color: #202020; }'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

    def get_cover_url(self):
        '''
        Return the URL of today's front page image.

        The image is the ``<img id="giornale-img">`` element on the paper's
        frontpages.com page; its ``src`` is relative to that site.
        '''
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        cover_img = soup.find('img', attrs={'id': 'giornale-img'})
        return 'https://www.frontpages.com' + cover_img['src']

    # Option definitions. 'days' defaults to the class-level oldest_article,
    # so this must stay below that attribute.
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'enter yes',
            'default': 'no'
        },
        'rev': {
            'short': 'Reverse the order of articles in each feed?',
            'long': 'enter yes',
            'default': 'no'
        }
    }

    def __init__(self, *args, **kwargs):
        '''
        Initialise the recipe and apply the recipe specific options.

        Only string option values are acted upon. NOTE(review): presumably the
        base class replaces ``recipe_specific_options`` with the user-entered
        strings, and the ``isinstance`` checks skip the dict definitions
        declared on the class -- confirm against BasicNewsRecipe.
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        options = self.recipe_specific_options

        days = options.get('days')
        if days and isinstance(days, str):
            self.oldest_article = float(days)

        reverse = options.get('rev')
        if reverse and isinstance(reverse, str) and reverse.lower() == 'yes':
            self.reverse_article_order = True

        compress = options.get('comp')
        if compress and isinstance(compress, str) and compress.lower() == 'yes':
            self.compress_news_images = True

    # used when unable to extract article from <script>, particularly in the Sports section
    remove_tags = [
        dict(
            classes(
                'sticky-wrap relative social-media social-media--extended__shares'
                ' article-body-comment scmp_button_comment_wrapper social-media--extended__in-site'
                ' footer scmp-advert-tile sidebar-col related-article share-widget'
            )
        ),
        dict(attrs={'addthis_title': True}),
        dict(name=['script', 'style']),
    ]

    # https://www.scmp.com/rss
    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
        ('News', 'https://www.scmp.com/rss/91/feed')
    ]

    def print_version(self, url):
        '''Return ``url`` without its query string.'''
        return url.split('?')[0]

    def preprocess_raw_html(self, raw_html, url):
        '''
        Replace the downloaded page with markup rebuilt from its embedded JSON.

        Returns the serialised rebuilt document when the page contains a
        ``<script id="__NEXT_DATA__">`` element whose JSON can be converted.
        In that document ``h2``/``h3`` headings are demoted to ``h4``. When
        the script is absent, or conversion fails for any reason, the failure
        is logged (conversion failure only) and ``raw_html`` is returned
        unchanged so the regular HTML clean-up still applies.
        '''
        skeleton = parse('<html><body><article></article></body></html>')
        page = parse(raw_html)
        next_data = page.xpath('//script[@id="__NEXT_DATA__"]')
        if not next_data:
            return raw_html
        try:
            load_article_from_json(next_data[0].text, skeleton)
        except Exception:
            # Deliberately broad: any problem with the payload falls back to
            # the original HTML rather than losing the article.
            self.log('** Failed parse: ', url)
            return raw_html
        for heading in skeleton.xpath('//h2') + skeleton.xpath('//h3'):
            heading.tag = 'h4'
        return etree.tostring(skeleton, encoding='unicode')

    def preprocess_html(self, soup):
        '''
        Point images at the resizing CDN and turn image titles into captions.

        Every ``<img>`` with a ``src`` is rewritten to use the fixed
        ``img.i-scmp.com`` prefix followed by the path of the original URL.
        Every ``<img>`` with a ``title`` gets a small centred caption appended
        to its nearest enclosing ``<div>``; if the image has no ``<div>``
        ancestor the caption is inserted directly after the image instead of
        failing on the missing parent.
        '''
        from urllib.parse import urlparse

        cdn_prefix = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = cdn_prefix + urlparse(img['src']).path

        for img in soup.findAll('img', attrs={'title': True}):
            caption = soup.new_tag('div', attrs={'style': 'text-align:center; font-size:small;'})
            caption.string = img.get('title', '')
            container = img.find_parent('div')
            if container is None:
                # find_parent returns None when no ancestor matches.
                img.insert_after(caption)
            else:
                container.append(caption)
        return soup