mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
199 lines
7.6 KiB
Python
199 lines
7.6 KiB
Python
#!/usr/bin/env python
|
||
'''
|
||
scmp.com
|
||
'''
|
||
|
||
import json
|
||
import time
|
||
from datetime import datetime
|
||
|
||
from html5_parser import parse
|
||
from lxml import etree
|
||
|
||
from calibre import replace_entities
|
||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||
|
||
|
||
def E(parent, name, text='', **attrs):
    '''
    Create a child element called ``name`` and attach it to ``parent``.

    The element is built with ``parent.makeelement`` (so it shares the
    parent's element factory), its ``text`` is set to ``text``, every keyword
    argument becomes an attribute, and the element is appended as the last
    child of ``parent``.

    Returns the newly created element so callers can nest further children.
    '''
    child = parent.makeelement(name, **attrs)
    child.text = text
    parent.append(child)
    return child
|
||
|
||
|
||
def process_node(node, html_parent):
    '''
    Convert one node of the article JSON tree into markup under ``html_parent``.

    ``node`` is a dict read through the keys ``type``, ``attribs``,
    ``children`` and ``data``:

    * Nodes without a ``type`` are skipped, as are the page-furniture types
      ``track-viewed-percentage``, ``inline-ad-slot`` and ``inline-widget``
      (together with their children).
    * ``text`` nodes have their ``data`` entity-decoded and appended to the
      tail of the last child of ``html_parent``, or to ``html_parent.text``
      when it has no children yet.
    * Every other type becomes an element named after the type. Its
      ``attribs`` are copied (except for ``p`` elements) and its ``children``
      are processed recursively.

    Returns None; the result is the mutation of ``html_parent``.
    '''
    ntype = node.get('type')
    if not ntype:
        # A node with no type cannot be turned into an element; skipping it
        # avoids aborting the whole article conversion.
        return

    if ntype == 'text':
        text = node.get('data')
        if not text:
            return
        text = replace_entities(text)
        if len(html_parent):
            # Text following an existing child belongs in that child's tail.
            last = html_parent[-1]
            last.tail = (last.tail or '') + text
        else:
            html_parent.text = (html_parent.text or '') + text
        return

    if ntype in {'track-viewed-percentage', 'inline-ad-slot', 'inline-widget'}:
        return

    c = html_parent.makeelement(ntype)
    if ntype != 'p':
        # ``or {}`` guards against an explicit JSON null for "attribs";
        # null attribute values are stored as empty strings.
        c.attrib.update({k: v or '' for k, v in (node.get('attribs') or {}).items()})
    html_parent.append(c)
    # ``or ()`` guards against an explicit JSON null for "children".
    for nc in node.get('children') or ():
        process_node(nc, c)
|
||
|
||
|
||
def ts_date(x):
    '''
    Format a Unix timestamp given in milliseconds as a UTC date string.

    The result looks like ``Jan 01, 1970 at 12:00 AM``.

    The conversion is done with an explicit UTC timezone. Adding
    ``time.timezone`` to the timestamp and converting in local time only
    yields UTC while the local zone is not observing daylight saving time,
    because ``time.timezone`` is the non-DST offset.
    '''
    from datetime import timezone

    dt = datetime.fromtimestamp(x / 1000, tz=timezone.utc)
    return dt.strftime('%b %d, %Y at %I:%M %p')
|
||
|
||
|
||
def load_article_from_json(raw, root):
    '''
    Rebuild the article markup inside ``root`` from the page's JSON payload.

    ``raw`` is the JSON text; the article dict is read from
    ``props.pageProps.payload.data.article``. All existing children of the
    ``<body>`` of ``root`` are removed and replaced by a single ``<article>``
    holding, in order: the first topic name, the headline, the sub-headline
    nodes, a byline (publication date, reading time, authors), the leading
    image with its caption, and the article body nodes.

    ``headline``, ``publishedDate`` and ``body.json`` are required and raise
    ``KeyError`` when absent. The topic, sub-headline, reading time, authors
    and images are treated as optional and simply omitted when missing, so a
    single absent decoration does not discard the whole article.
    '''
    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']

    body = root.xpath('//body')[0]
    for child in tuple(body):  # iterate over a copy: the body is mutated
        body.remove(child)
    article = E(body, 'article')

    topic = (data.get('firstTopic') or {}).get('name')
    if topic:
        E(article, 'div', replace_entities(topic), style='color: gray; font-size:small; font-weight:bold;')
    E(article, 'h1', replace_entities(data['headline']))
    for subh in (data.get('subHeadline') or {}).get('json') or ():
        process_node(subh, article)

    # Byline: "<date> | <n> min read | <author>, <author>", leaving out any
    # part that is not available.
    byline = [ts_date(data['publishedDate'])]
    reading_time = data.get('readingTime')
    if reading_time:
        byline.append(str(reading_time) + ' min read')
    authors = ', '.join([a['name'] for a in data.get('authors') or ()])
    if authors:
        byline.append(authors)
    E(article, 'p', ' | '.join(byline), style='color: #202020; font-size:small;')

    # If several images are marked as leading, the last one is used.
    main_image_url = sub_img = ''
    for image in data.get('images') or ():
        if image.get('type') == 'leading':
            main_image_url = image.get('url') or ''
            sub_img = image.get('title') or ''
    if main_image_url:
        div = E(article, 'div')
        E(div, 'img', src=main_image_url)
        E(div, 'div', sub_img, style='text-align:center; font-size:small;')

    for node in data['body']['json']:
        process_node(node, article)
|
||
|
||
|
||
class SCMP(BasicNewsRecipe):
    '''
    Recipe for the South China Morning Post (scmp.com).

    Articles are listed from the site's RSS feeds. Each article page is
    rebuilt from the JSON found in its ``__NEXT_DATA__`` script element when
    that is possible; otherwise the raw page is kept and cleaned up with
    ``remove_tags``.
    '''

    title = 'South China Morning Post'
    __author__ = 'unkn0wn'
    description = (
        'The South China Morning Post is a leading news media company that has reported on China and Asia '
        'for more than a century with global impact. Founded in 1903, SCMP is headquartered in Hong Kong, '
        'where it is the city’s newspaper of record. Our teams span across Asia and the United States, '
        'working together to connect with news consumers around the world. We are committed to informing '
        'and inspiring through journalism of the highest standards. Our vision is to “Elevate Thought”, '
        'and our mission is to “Lead the global conversation about China”.'
    )
    publisher = 'South China Morning Post Publishers Ltd.'
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'en_HK'
    remove_empty_feeds = True
    resolve_internal_links = True
    publication_type = 'newspaper'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = 'blockquote, em { color: #202020; }'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'

    def get_cover_url(self):
        '''
        Return the URL of today's front page image from frontpages.com.

        Returns None (after logging) when the expected ``<img>`` element or
        its ``src`` attribute is not present on the page.
        '''
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        img = soup.find('img', attrs={'id': 'giornale-img'})
        if img is None or not img.get('src'):
            self.log('** Cover image not found on frontpages.com')
            return None
        return 'https://www.frontpages.com' + img['src']

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'enter yes',
            'default': 'no'
        },
        'rev': {
            'short': 'Reverse the order of articles in each feed?',
            'long': 'enter yes',
            'default': 'no'
        }
    }

    def __init__(self, *args, **kwargs):
        '''
        Initialise the recipe and apply the recipe specific options.

        Only string option values are acted on; the option definitions
        declared on the class are dicts and are therefore ignored here.
        NOTE(review): presumably the base class replaces
        ``recipe_specific_options`` with the user's string values - confirm.
        '''
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        opts = self.recipe_specific_options
        days = opts.get('days')
        if days and isinstance(days, str):
            self.oldest_article = float(days)
        rev = opts.get('rev')
        if rev and isinstance(rev, str) and rev.lower() == 'yes':
            self.reverse_article_order = True
        comp = opts.get('comp')
        if comp and isinstance(comp, str) and comp.lower() == 'yes':
            self.compress_news_images = True

    # used when unable to extract article from <script>, particularly in the Sports section
    remove_tags = [
        classes(
            'sticky-wrap relative social-media social-media--extended__shares'
            ' article-body-comment scmp_button_comment_wrapper social-media--extended__in-site'
            ' footer scmp-advert-tile sidebar-col related-article share-widget'
        ),
        dict(attrs={'addthis_title': True}),
        dict(name=['script', 'style']),
    ]

    # https://www.scmp.com/rss
    feeds = [
        ('Hong Kong', 'https://www.scmp.com/rss/2/feed'),
        ('China', 'https://www.scmp.com/rss/4/feed'),
        ('Asia', 'https://www.scmp.com/rss/3/feed'),
        ('World', 'https://www.scmp.com/rss/5/feed'),
        ('Business', 'https://www.scmp.com/rss/92/feed'),
        ('Tech', 'https://www.scmp.com/rss/36/feed'),
        ('Life', 'https://www.scmp.com/rss/94/feed'),
        ('Culture', 'https://www.scmp.com/rss/322296/feed'),
        ('Sport', 'https://www.scmp.com/rss/95/feed'),
        ('Post Mag', 'https://www.scmp.com/rss/71/feed'),
        ('Style', 'https://www.scmp.com/rss/72/feed'),
        ('News', 'https://www.scmp.com/rss/91/feed')
    ]

    def print_version(self, url):
        '''Return ``url`` with its query string (everything from the first ``?``) removed.'''
        return url.split('?')[0]

    def preprocess_raw_html(self, raw_html, url):
        '''
        Rebuild the article from the ``__NEXT_DATA__`` JSON when available.

        If the page has a ``<script id="__NEXT_DATA__">`` element, its JSON
        is rendered into a minimal skeleton document, ``h2``/``h3`` headings
        are demoted to ``h4`` and the serialised skeleton is returned. If the
        script element is missing, or rendering fails for any reason, the
        failure is logged and ``raw_html`` is returned unchanged.
        '''
        root = parse(raw_html)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if not script:
            return raw_html

        b_root = parse('<html><body><article></article></body></html>')
        try:
            load_article_from_json(script[0].text, b_root)
        except Exception:
            # Deliberate best effort: any failure falls back to the raw page.
            self.log('** Failed parse: ', url)
            return raw_html

        for heading in b_root.xpath('//h2') + b_root.xpath('//h3'):
            heading.tag = 'h4'
        return etree.tostring(b_root, encoding='unicode')

    def preprocess_html(self, soup):
        '''
        Point images at the resizing endpoint and add captions from titles.

        Every ``<img>`` with a ``src`` is rewritten to the img.i-scmp.com
        image endpoint (width 768), keeping only the path of the original
        URL. Every ``<img>`` with a ``title`` gets that title appended as a
        small centred ``<div>`` at the end of its nearest enclosing ``<div>``;
        images with no enclosing ``<div>`` are left without a caption.
        '''
        from urllib.parse import urlparse

        cdn_prefix = 'https://img.i-scmp.com/cdn-cgi/image/fit=contain,width=768,format=auto'
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = cdn_prefix + urlparse(img['src']).path

        for img in soup.findAll('img', attrs={'title': True}):
            parent_div = img.find_parent('div')
            if parent_div is None:
                continue
            caption = soup.new_tag('div', attrs={'style': 'text-align:center; font-size:small;'})
            caption.string = img.get('title', '')
            parent_div.append(caption)
        return soup
|