From 3f86205c9fea857b46339a0c0a8385ad6a34552c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 17 Jul 2023 23:02:45 +0530 Subject: [PATCH 1/2] Update bloomberg.recipe --- recipes/bloomberg.recipe | 60 ++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index 9162f4dd04..add2105430 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -61,7 +61,7 @@ class Bloomberg(BasicNewsRecipe): 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'), ('News', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'), - ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2F&hl=en-US&gl=US&ceid=US:en') + ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en') ] def preprocess_raw_html(self, raw, *a): @@ -71,13 +71,12 @@ class Bloomberg(BasicNewsRecipe): m = root.xpath('//script[@data-component-props="FeatureBody"]') if not m: m2 = root.xpath('//script[@id="__NEXT_DATA__"]') - if not m2: - return raw + if m: data = json.loads(m[0].text) data = data['story'] - else: + elif m2: data = json.loads(m2[0].text) data = data['props']['pageProps']['story'] @@ -103,7 +102,7 @@ class Bloomberg(BasicNewsRecipe): if 'ledeImageUrl' in data: if data['ledeImageUrl'] is not None: - lede = '

'.format(data['ledeImageUrl']) + lede = '

'.format(data['ledeImageUrl']) if 'ledeDescription' in data: if data['ledeDescription'] is not None: @@ -117,25 +116,52 @@ class Bloomberg(BasicNewsRecipe): if m: body = data['body'] - else: + elif m2: body = '' body_data = data['body']['content'] + for objects in body_data: - if objects['type'] == 'media': - if objects['subType'] == 'photo': - body += '

'.format(objects['data']['photo']['src']) - body += '' + objects['data']['photo']['caption'] + '

' + + if objects['type'] == 'media' and objects['subType'] == 'photo': + body += '

'.format(objects['data']['photo']['src']) + body += '' + objects['data']['photo']['caption'] + '

' + if objects['type'] == 'media' and objects['subType'] == 'chart': + if objects['data'] and objects['data']['chart']: + body += '

'.format(objects['data']['chart']['fallback']) + if objects['type'] == 'paragraph' or 'heading': # lists are missed :( body += '

' + if 'content' not in objects: continue - for content in objects['content']: - if 'value' in content: - body += content['value'] - elif 'content' in content: - for val_cont in content['content']: - if 'value' in val_cont: - body += val_cont['value'] + + for item in objects['content']: + + if item['type'] == 'text' and item['value']: + body += item['value'] + + elif item['type'] == 'link' and item['data']: + if 'href' not in item['data']: + continue + if item['content'] and item['content'][0] and item['content'][0]['value']: + body += '' + item['content'][0]['value'] + '' + + elif item['type'] == 'entity': + if item['content'] and item['content'][0] and item['content'][0]['value']: + if item['subType'] == 'person' or 'security': + body += item['content'][0]['value'] + elif item['subType'] == 'story': + if item['data'] and item['data']['link'] and item['data']['link']['destination'] and item['data']['link']['destination']['web']: + body += '' + item.content[0].value + '' + + if objects['type'] == 'quote': + if 'content' not in objects: + continue + for item in objects['content']: + if item['type'] == 'paragraph' and item['content'] and item['content'][0]: + if 'value' not in item['content'][0]: + continue + body += '

' + item['content'][0]['value'] + '
' html = '' + cat + title + subhead + auth + lede + caption + '
' + body return html From ec634e92ffd3f6ae487dd6c499fc3b06ceab3590 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Mon, 17 Jul 2023 23:05:30 +0530 Subject: [PATCH 2/2] Update bloomberg-business-week.recipe --- recipes/bloomberg-business-week.recipe | 53 +++++++++++++++++++------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index e22fa0cba6..1e24f8acc4 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -108,7 +108,7 @@ class Bloomberg(BasicNewsRecipe): if 'ledeImageUrl' in data: if data['ledeImageUrl'] is not None: - lede = '

'.format(data['ledeImageUrl']) + lede = '

'.format(data['ledeImageUrl']) if 'ledeDescription' in data: if data['ledeDescription'] is not None: @@ -125,22 +125,49 @@ class Bloomberg(BasicNewsRecipe): else: body = '' body_data = data['body']['content'] + for objects in body_data: - if objects['type'] == 'media': - if objects['subType'] == 'photo': - body += '

'.format(objects['data']['photo']['src']) - body += '' + objects['data']['photo']['caption'] + '

' - if objects['type'] == 'paragraph' or 'heading': + + if objects['type'] == 'media' and objects['subType'] == 'photo': + body += '

'.format(objects['data']['photo']['src']) + body += '' + objects['data']['photo']['caption'] + '

' + if objects['type'] == 'media' and objects['subType'] == 'chart': + if objects['data'] and objects['data']['chart']: + body += '

'.format(objects['data']['chart']['fallback']) + + if objects['type'] == 'paragraph' or 'heading': # lists are missed :( body += '

' + if 'content' not in objects: continue - for content in objects['content']: - if 'value' in content: - body += content['value'] - elif 'content' in content: - for val_cont in content['content']: - if 'value' in val_cont: - body += val_cont['value'] + + for item in objects['content']: + + if item['type'] == 'text' and item['value']: + body += item['value'] + + elif item['type'] == 'link' and item['data']: + if 'href' not in item['data']: + continue + if item['content'] and item['content'][0] and item['content'][0]['value']: + body += '' + item['content'][0]['value'] + '' + + elif item['type'] == 'entity': + if item['content'] and item['content'][0] and item['content'][0]['value']: + if item['subType'] == 'person' or 'security': + body += item['content'][0]['value'] + elif item['subType'] == 'story': + if item['data'] and item['data']['link'] and item['data']['link']['destination'] and item['data']['link']['destination']['web']: + body += '' + item.content[0].value + '' + + if objects['type'] == 'quote': + if 'content' not in objects: + continue + for item in objects['content']: + if item['type'] == 'paragraph' and item['content'] and item['content'][0]: + if 'value' not in item['content'][0]: + continue + body += '

' + item['content'][0]['value'] + '
' html = '' + cat + title + subhead + auth + lede + caption + '
' + body return html