diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 9c62f4871b..3ccef3a289 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -1,64 +1,44 @@
 import json
-import random
 import time
+from datetime import datetime, timedelta
 
-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe
-from html5_parser import parse
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 
 def get_contents(x):
     if x == '':
         return ''
-    otype = x.get('type', '')
-    if otype == 'text':
-        if 'attributes' in x:
-            if 'strong' in x['attributes']:
-                return '<strong>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</strong>'
-            if 'emphasis' in x['attributes']:
-                return '<em>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</em>'
-            return '<span class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</span>'
-        return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
+    otype = x.get('role', '')
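+    # Story components arrive as a JSON tree: each node carries a 'role' and
+    # nested child 'parts'; recurse through the parts and wrap the recovered
+    # text in the HTML tags matching each role.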
+    if otype == 'p':
+        return '<p>' + ''.join(map(get_contents, x.get('parts', ''))) + '</p>'
+    elif otype == 'text':
+        if 'style' in x:
+            return '<' + x['style'] + '>' + ''.join(map(get_contents, x.get('parts', ''))) + '</' + x['style'] + '>'
+        return x.get('text', '') + ''.join(map(get_contents, x.get('parts', '')))
     elif otype == 'br':
         return '<br>'
-    elif otype == 'paragraph':
-        return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</p>'
-    elif otype == 'heading':
-        return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</h3>'
-    elif otype == 'list':
-        return '<ul>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</ul>'
-    elif otype == 'listItem':
-        return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</li>'
-    elif otype == 'quote':
-        return '<blockquote>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
-    elif otype == 'media':
-        if x['subType'] == 'photo':
-            return '<div class="img"><img src="{}"><div class="cap">{} <span>{}</span></div></div>'.format(
-                x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
-        elif x['subType'] == 'chart':
-            if x['data'] and x['data']['chart']:
-                return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
-    elif otype == 'link':
-        if 'data' in x:
-            if 'href' in x['data']:
-                return '<a href="' + x['data']['href'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
-            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-        return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-    elif otype == 'entity':
-        if x['subType'] == 'story':
-            if x['data'] and x['data']['link'] and x['data']['link']['destination']:
-                if 'web' in x['data']['link']['destination']:
-                    return '<a href="' + x['data']['link']['destination']['web'] + '">' + x.get('value', '') + ''.join(
-                        map(get_contents, x.get('content', ''))) + '</a>'
-                return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-        return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-    elif otype in {'div', 'callout'}:
-        return '<div>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</div>'
-    elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
-        if any(b in x for b in ['value', 'content']):
-            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
+    elif otype == 'anchor':
+        return '<a href="' + x['destination']['web'] + '">' + ''.join(map(get_contents, x.get('parts', ''))) + '</a>'
+    elif otype == 'h3':
+        return '<h3>' + ''.join(map(get_contents, x.get('parts', ''))) + '</h3>'
+    elif otype == 'ul':
+        return '<ul>' + ''.join(map(get_contents, x.get('parts', ''))) + '</ul>'
+    elif otype == 'li':
+        return '<li>' + ''.join(map(get_contents, x.get('parts', ''))) + '</li>'
+    elif otype == 'webview':
+        return '<br>' + x['html'] + ''.join(map(get_contents, x.get('parts', '')))
+    elif otype == 'blockquote':
+        return '<blockquote>' + ''.join(map(get_contents, x.get('parts', ''))) + '</blockquote>'
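+    # Images and videos are both rendered as their static fallback image;
+    # a video player cannot be embedded in the generated e-book.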
+    elif otype in {'image', 'video'}:
+        return '<br><img src="{}"><div class="img">{}</div>\n'.format(
+            x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
+        )
+    elif otype in {'correction', 'disclaimer'}:
+        return '<p class="corr">' + ''.join(map(get_contents, x.get('parts', ''))) + '</p>'
+    elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
+        return '<i>' + ''.join(map(get_contents, x.get('parts', ''))) + '</i>'
     return ''
@@ -67,126 +47,94 @@ class Bloomberg(BasicNewsRecipe):
     language = 'en'
     __author__ = 'unkn0wn'
     no_stylesheets = True
-    use_embedded_content = False
     remove_attributes = ['style', 'height', 'width']
+    encoding = 'utf-8'
     ignore_duplicate_articles = {'url', 'title'}
     masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
-    description = ('Bloomberg delivers business and markets news, data, analysis, and video'
-                   ' to the world, featuring stories from Businessweek and Bloomberg News.')
-
-    simultaneous_downloads = 1
+    description = (
+        'Bloomberg delivers business and markets news, data, analysis, and video'
+        ' to the world, featuring stories from Businessweek and Bloomberg News.'
+    )
+    oldest_article = 1  # days
+    resolve_internal_links = True
+    remove_empty_feeds = True
 
     extra_css = '''
-        .auth {font-size:small; font-weight:bold;}
-        .time, .chart {font-size:small;}
-        .subhead, .cap span {font-style:italic; color:#404040;}
-        em, .col {color:#202020;}
-        .cat {font-size:small; color:gray;}
-        .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
+        .auth { font-size:small; font-weight:bold; }
+        .subhead, .cap span { font-style:italic; color:#202020; }
+        em, blockquote { color:#202020; }
+        .cat { font-size:small; color:gray; }
+        .img, .news-figure-caption-text { font-size:small; text-align:center; }
+        .corr { font-size:small; font-style:italic; color:#404040; }
+        .chart { font-size:small; }
         .news-figure-credit {font-size:small; text-align:center; color:#202020;}
     '''
 
-    articles_are_obfuscated = True
-    resolve_internal_links = True
+    def parse_index(self):
+        inx = 'https://cdn-mobapi.bloomberg.com'
+        sec = self.index_to_soup(inx + '/wssmobile/v1/navigation/bloomberg_app/search-v2', raw=True)
+        sec_data = json.loads(sec)['searchNav'][0]['items']
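+        # Every entry in the app's search navigation becomes one feed section;
+        # its href points at the JSON listing of that section's stories.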
 
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        try:
-            br.open(url)
-        except Exception as e:
-            url = e.hdrs.get('location')
-        soup = self.index_to_soup(url)
-        link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.bloomberg.com')})
-        skip_sections =[ # add sections you want to skip
-            '/video/', '/videos/', '/media/', 'podcast'
-        ]
-        if any(x in link['href'] for x in skip_sections):
-            self.abort_article('Aborting Video article')
-        self.log('Found link: ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
+        feeds = []
 
-    def get_browser(self, *a, **kw):
-        kw['user_agent'] = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/119.0'
-        br = BasicNewsRecipe.get_browser(self, *a, **kw)
-        br.set_handle_redirect(False)
-        return br
+        for sects in sec_data:
+            section = sects['title']
+            sec_slug = sects['links']['self']['href']
+            self.log(section)
 
-    feeds = [
-        ('Features',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Newsletters',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('News',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Others',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en')
-    ]
+            articles = []
+
+            art_soup = self.index_to_soup(inx + sec_slug, raw=True)
+            for arts in json.loads(art_soup)['modules']:
+                if arts['stories']:
+                    for x in arts['stories']:
+                        if x.get('type', '') == 'article':
+                            dt = datetime.fromtimestamp(x['published'] + time.timezone)
+                            if (datetime.now() - dt) > timedelta(self.oldest_article):
+                                continue
+                            title = x['title']
+                            desc = x['autoGeneratedSummary']
+                            url = inx + '/wssmobile/v1/stories/' + x['internalID']
+                            self.log(' ', title, '\n\t', desc)
+                            articles.append({'title': title, 'description': desc, 'url': url})
+            feeds.append((section, articles))
+        return feeds
 
     def preprocess_raw_html(self, raw, *a):
-        root = parse(raw)
-        m = root.xpath('//script[@data-component-props="ArticleBody"]')
-        if not m:
-            m = root.xpath('//script[@data-component-props="FeatureBody"]')
-            if not m:
-                m2 = root.xpath('//script[@id="__NEXT_DATA__"]')
+        data = json.loads(raw)
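+        # Article URLs point at the mobile API, so the raw download is the
+        # story JSON itself, not an HTML page.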
 
-        if m:
-            data = json.loads(m[0].text)
-            data = data['story']
-
-        elif m2:
-            data = json.loads(m2[0].text)
-            data = data['props']['pageProps']['story']
-
-        art_url = data['url']
-        if not art_url.startswith('http'):
-            art_url = 'https://www.bloomberg.com' + art_url
-
-        title = '<h1 title="{}">'.format(art_url) + data['headline'] + '</h1>'
+        title = '<h1 title="{}">'.format(data['longURL']) + data['title'] + '</h1>'
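+        # populate_article_metadata() later reads the canonical story URL back
+        # out of this title attribute.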
 
         cat = subhead = lede = auth = caption = ''
 
         if 'primaryCategory' in data and data['primaryCategory'] is not None:
-            cat = '<div class="cat">' + data['primaryCategory'] + '</div>'
+            cat = '<p class="cat">' + data['primaryCategory'] + '</p>'
 
-        if len(data['abstract']) != 0 and len(data['abstract']) == 2:
-            subhead = '<div class="subhead"><p>' + data['abstract'][0] + '</p><p>' + data['abstract'][1] + '</p></div>'
-        else:
-            if 'summary' in data:
-                subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
+        if 'abstract' in data and data['abstract'] and data['abstract'] is not None:
+            subhead = '<div class="subhead"><ul><li>' + '</li><li>'.join(data['abstract']) + '</li></ul></div>'
+        elif 'summary' in data and data['summary']:
+            subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
 
         if 'byline' in data and data['byline'] is not None:
-            auth = '<div class="auth">' + data['byline']\
-             + ' | ' + data['publishedAt'][:-14] + '</div>'
+            dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone)
+            auth = '<p class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</p>'
 
-        if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None:
-            lede = '<p class="img"><img src="{}">'.format(data['ledeImageUrl'])
+        if 'ledeImage' in data and data['ledeImage'] is not None:
+            x = data['ledeImage']
+            lede = '<p class="img"><img src="{}"><div class="img">{}</div>\n'.format(
+                x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
+            )
 
-        if 'ledeDescription' in data and data['ledeDescription'] is not None:
-            caption = '<span class="cap">' + data['ledeDescription'] + '</span>'
-        else:
-            if 'lede' in data and data['lede'] is not None:
-                if 'alt' in data['lede'] and data['lede']['alt'] is not None:
-                    caption = '<span class="cap">' + data['lede']['alt'] + '</span>'
-
-        if m:
-            time.sleep(3)
-            body = data['body']
-        elif m2:
-            body = ''
-            body_data = data['body']['content']
-            for x in body_data:
-                body += get_contents(x)
-        pause = random.choice((5, 6, 7, 8, 9))
-        self.log('Delay: ', pause, ' seconds')
-        time.sleep(pause)
-        return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
+        body = ''
+        body_data = data['components']
+        for x in body_data:
+            body += get_contents(x)
+        html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
+        return BeautifulSoup(html).prettify()
 
     def preprocess_html(self, soup):
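+        # get_contents() emits story headings as h3; demote them so they rank
+        # below the article's h1 headline.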
+        for h3 in soup.findAll('h3'):
+            h3.name = 'h4'
         for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}):
             icon.decompose()
         for div in soup.findAll('div', attrs={'class':'chart'}):
@@ -204,14 +152,3 @@ class Bloomberg(BasicNewsRecipe):
 
     def populate_article_metadata(self, article, soup, first):
         article.url = soup.find('h1')['title']
-        article.summary = self.tag_to_string(soup.find('div', attrs={'class':'subhead'}))
-        article.text_summary = self.tag_to_string(soup.find('div', attrs={'class':'subhead'}))
-        article.title = article.title.replace(' - Bloomberg', '')
-
-    def get_browser(self):
-        # -- Handle Google's cookies consent page
-        br = BasicNewsRecipe.get_browser(self)
-        br.open('https://news.google.com')
-        br.select_form(action="https://consent.google.com/save")
-        br.submit()
-        return br