This commit is contained in:
unkn0w7n 2024-09-08 19:23:47 +05:30
parent 931ee9867a
commit cd126ea658

View File

@ -22,7 +22,6 @@ def E(parent, name, text='', **attrs):
parent.append(ans) parent.append(ans)
return ans return ans
def process_node(node, html_parent): def process_node(node, html_parent):
ntype = node.get('type') ntype = node.get('type')
@ -48,11 +47,9 @@ def ts_date(x):
dt = datetime.fromtimestamp(x/1000 + time.timezone) dt = datetime.fromtimestamp(x/1000 + time.timezone)
return dt.strftime('%b %d, %Y at %I:%M %p') return dt.strftime('%b %d, %Y at %I:%M %p')
def auth(x): def auth(x):
return ', '.join([a['name'] for a in x]) return ', '.join([a['name'] for a in x])
def load_article_from_json(raw, root): def load_article_from_json(raw, root):
# open('/t/raw.json', 'w').write(raw) # open('/t/raw.json', 'w').write(raw)
data = json.loads(raw)['props']['pageProps']['payload']['data']['article'] data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
@ -103,6 +100,11 @@ class SCMP(BasicNewsRecipe):
publication_type = "newspaper" publication_type = "newspaper"
ignore_duplicate_articles = {"title", "url"} ignore_duplicate_articles = {"title", "url"}
extra_css = 'blockquote, em { color: #202020; }' extra_css = 'blockquote, em { color: #202020; }'
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
def get_cover_url(self):
soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
recipe_specific_options = { recipe_specific_options = {
'days': { 'days': {
@ -118,10 +120,6 @@ class SCMP(BasicNewsRecipe):
if d and isinstance(d, str): if d and isinstance(d, str):
self.oldest_article = float(d) self.oldest_article = float(d)
def get_cover_url(self):
soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
return 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
# used when unable to extract article from <script>, particularly in the Sports section # used when unable to extract article from <script>, particularly in the Sports section
remove_tags = [ remove_tags = [
dict( dict(
@ -154,8 +152,6 @@ class SCMP(BasicNewsRecipe):
def print_version(self, url): def print_version(self, url):
return url.split('?')[0] return url.split('?')[0]
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
def preprocess_raw_html(self, raw_html, url): def preprocess_raw_html(self, raw_html, url):
body = '<html><body><article></article></body></html>' body = '<html><body><article></article></body></html>'
b_root = parse(body) b_root = parse(body)