From 1c2a8d3ecda444805a03f2a07e8b11564d8f245c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 23 Jul 2023 11:57:25 +0530 Subject: [PATCH] Update bloomberg-business-week.recipe --- recipes/bloomberg-business-week.recipe | 175 ++++++------------------- 1 file changed, 41 insertions(+), 134 deletions(-) diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 41f3fa41ae..2e594caf41 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -5,6 +5,44 @@ import json import random import time +def get_contents(x): + otype = x.get('type', '') + if otype == 'text': + return x.get('value', '') + elif otype == 'paragraph': + return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

' + elif otype == 'heading': + return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

' + elif otype == 'list': + return '' + elif otype == 'listItem': + return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
  • ' + elif otype == 'quote': + return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
    ' + elif otype == 'media': + if x['subType'] == 'photo': + return '
    {}
    '.format( + x['data']['photo']['src'], x['data']['photo']['caption']) + elif x['subType'] == 'chart': + if x['data'] and x['data']['chart']: + return '
    '.format(x['data']['chart']['fallback']) + elif otype == 'link': + if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']: + if 'href' in x['data']: + return '' + x['content'][0]['value'] + '' + return '' + x['content'][0]['value'] + '' + elif otype == 'entity': + if x['content'] and x['content'][0] and x['content'][0]['value']: + if x['subType'] == 'story': + if x['data'] and x['data']['link'] and x['data']['link']['destination']: + if 'web' in x['data']['link']['destination']: + return '' + x['content'][0]['value'] + '' + return '' + x['content'][0]['value'] + '' + elif x['subType'] in ('person', 'security'): + return '' + x['content'][0]['value'] + '' + + return '' + class Bloomberg(BasicNewsRecipe): title = u'Bloomberg Businessweek' language = 'en' @@ -124,142 +162,11 @@ class Bloomberg(BasicNewsRecipe): else: body = '' body_data = data['body']['content'] - - for objects in body_data: - + for x in body_data: pause = random.choice((0.5, 1, 1.25)) time.sleep(pause) - - if objects['type'] == 'media' and objects['subType'] == 'photo': - body += '

    '.format(objects['data']['photo']['src']) - body += '' + objects['data']['photo']['caption'] + '

    ' - if objects['type'] == 'media' and objects['subType'] == 'chart': - if objects['data'] and objects['data']['chart']: - body += '

    '.format(objects['data']['chart']['fallback']) - - if objects['type'] == 'paragraph': - body += '

    ' - if 'value' in objects: - body += objects['value'] - - if 'content' not in objects: - continue - - for item in objects['content']: - - if item['type'] == 'text' and item['value']: - body += item['value'] - - if item['type'] == 'link' and item['data']: - if item['content'] and item['content'][0] and item['content'][0]['value']: - if 'href' in item['data']: - body += '' + item['content'][0]['value'] + '' - else: - body += '' + item['content'][0]['value'] + '' - - if item['type'] == 'entity': - if item['content'] and item['content'][0] and item['content'][0]['value']: - if item['subType'] == 'story': - if item['data'] and item['data']['link'] and item['data']['link']['destination']: - if 'web' in item['data']['link']['destination']: - body += '' + item['content'][0]['value'] + '' - else: - body += '' + item['content'][0]['value'] + '' - - elif item['subType'] == 'person' or 'security': - body += item['content'][0]['value'] - - if objects['type'] == 'heading': - body += '

    ' - if 'value' in objects: - body += objects['value'] - - if 'content' not in objects: - continue - - for item in objects['content']: - - if item['type'] == 'text' and item['value']: - body += item['value'] - - if item['type'] == 'link' and item['data']: - if item['content'] and item['content'][0] and item['content'][0]['value']: - if 'href' in item['data']: - body += '' + item['content'][0]['value'] + '' - else: - body += '' + item['content'][0]['value'] + '' - - if item['type'] == 'entity': - if item['content'] and item['content'][0] and item['content'][0]['value']: - if item['subType'] == 'story': - if item['data'] and item['data']['link'] and item['data']['link']['destination']: - if 'web' in item['data']['link']['destination']: - body += '' + item['content'][0]['value'] + '' - else: - body += '' + item['content'][0]['value'] + '' - - elif item['subType'] == 'person' or 'security': - body += item['content'][0]['value'] - - if objects['type'] == 'quote': - if 'value' in objects: - body +='

    ' + objects['value'] + '
    ' - if 'content' not in objects: - continue - for item in objects['content']: - if item['type'] == 'paragraph' and item['content'] and item['content'][0]: - if 'value' not in item['content'][0]: - continue - body += '
    ' + item['content'][0]['value'] + '
    ' - - if objects['type'] == 'list': - if 'content' not in objects: - continue - body += '' - - skip = ['ad', 'inline-newsletter', 'inline-recirc', 'tabularData', 'list', 'quote', 'heading', 'paragraph', 'media'] - if not any(x in objects['type'] for x in skip): - body += '

    ' - if 'value' in objects: - body += objects['value'] - if not 'content' in objects: - continue - for content in objects['content']: - if 'value' in content: - body += content['value'] - elif 'content' in content: - for cont1 in content['content']: - if 'value' in cont1: - body += cont1['value'] - elif 'content' in val_cont: - for cont2 in val_cont['content']: - if 'value' in cont2: - body += cont2['value'] - elif 'content' in cont2: - for cont3 in cont2['content']: - if 'value' in cont3: - body += cont3['value'] - elif 'content' in cont3: - for cont4 in cont3['content']: - if 'value' in cont4: - body += cont4['value'] - - html = '' + cat + title + subhead + auth + lede + caption + '

    ' + body - return html + body += get_contents(x) + return '' + cat + title + subhead + auth + lede + caption + '
    ' + body + '
    ' def preprocess_html(self, soup): for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}):