...

2025-08-11 09:13:57 -04:00 · 2024-09-08 19:23:47 +05:30 · 2024-09-08 19:23:47 +05:30 · cd126ea658
commit cd126ea658
parent 931ee9867a
1 changed files with 5 additions and 9 deletions
--- a/recipes/scmp.recipe
+++ b/recipes/scmp.recipe
@ -22,7 +22,6 @@ def E(parent, name, text='', **attrs):
    parent.append(ans)
    return ans
 def process_node(node, html_parent):
    ntype = node.get('type')
@ -48,11 +47,9 @@ def ts_date(x):
    dt = datetime.fromtimestamp(x/1000 + time.timezone)
    return dt.strftime('%b %d, %Y at %I:%M %p')
 def auth(x):
    """Return the 'name' value of every entry in x, joined with ', '."""
    names = []
    for author in x:
        names.append(author['name'])
    return ', '.join(names)
 def load_article_from_json(raw, root):
    # open('/t/raw.json', 'w').write(raw)
    data = json.loads(raw)['props']['pageProps']['payload']['data']['article']
@ -103,6 +100,11 @@ class SCMP(BasicNewsRecipe):
    # NOTE(review): these look like BasicNewsRecipe configuration attributes;
    # their exact semantics are defined by the base class - confirm there.
    publication_type = "newspaper"
    # Set of article fields, presumably the ones compared to detect duplicates.
    ignore_duplicate_articles = {"title", "url"}
    # CSS rule giving blockquote and em text the color #202020.
    extra_css = 'blockquote, em { color: #202020; }'
    # SCMP logo (SVG) hosted on Wikimedia Commons.
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
    def get_cover_url(self):
        """Return the cover image URL scraped from frontpages.com, or None.

        Fetches the South China Morning Post page on frontpages.com and
        looks up the <img> element whose id is 'giornale-img'. Its 'src'
        value is appended to the site root to form the returned URL.

        Returns None when the element or its 'src' attribute is missing,
        rather than raising TypeError/KeyError on a changed page layout.
        """
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        img = soup.find('img', attrs={'id': 'giornale-img'})
        # find() yields None when nothing matches; subscripting it would raise.
        src = img.get('src') if img is not None else None
        if not src:
            return None
        return 'https://www.frontpages.com' + src
    recipe_specific_options = {
        'days': {
@ -118,10 +120,6 @@ class SCMP(BasicNewsRecipe):
        if d and isinstance(d, str):
            self.oldest_article = float(d)
    def get_cover_url(self):
        """Return the cover image URL scraped from frontpages.com, or None.

        Loads the South China Morning Post page on frontpages.com, finds
        the <img> element with id 'giornale-img' and prefixes its 'src'
        value with the site root.

        Returns None if that element or its 'src' attribute is absent,
        instead of failing with TypeError/KeyError.
        """
        soup = self.index_to_soup('https://www.frontpages.com/south-china-morning-post/')
        cover_img = soup.find('img', attrs={'id': 'giornale-img'})
        # find() returns None when no element matches, so guard before reading 'src'.
        if cover_img is None:
            return None
        src = cover_img.get('src')
        if not src:
            return None
        return 'https://www.frontpages.com' + src
    # used when unable to extract article from <script>, particularly in the Sports section
    remove_tags = [
        dict(
@ -154,8 +152,6 @@ class SCMP(BasicNewsRecipe):
    def print_version(self, url):
        """Return url with everything from the first '?' onward removed."""
        base, _sep, _query = url.partition('?')
        return base
    # SCMP logo (SVG) hosted on Wikimedia Commons.
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/c/c3/SCMP_logo.svg'
    def preprocess_raw_html(self, raw_html, url):
        body = '<html><body><article></article></body></html>'
        b_root = parse(body)