#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes


def get_contents(x):
    '''Recursively render one node of Bloomberg's JSON story body as HTML.

    ``x`` is a node dict carrying an optional ``role`` (element type), an
    optional ``text``/``style`` payload and a ``parts`` list of child nodes.
    Returns an HTML fragment string; unknown ad/newsletter/table roles yield ''.

    NOTE(review): the HTML tag literals in the copy reviewed were corrupted
    (stripped to empty strings); the tags below are restored from the
    surrounding syntax and this recipe's extra_css classes — verify the
    ``br``/``image``/fallback choices against the upstream recipe.
    '''
    if x == '':
        return ''
    otype = x.get('role', '')
    if otype == 'p':
        return '<p>' + ''.join(map(get_contents, x.get('parts', ''))) + '</p>'
    elif otype == 'text':
        if 'style' in x:
            # Wrap children in the style tag named by the node (e.g. strong, em).
            return '<' + x['style'] + '>' + ''.join(map(get_contents, x.get('parts', ''))) + '</' + x['style'] + '>'
        return x.get('text', '') + ''.join(map(get_contents, x.get('parts', '')))
    elif otype == 'br':
        return '<br>' + ''.join(map(get_contents, x.get('parts', '')))
    elif otype in {'image', 'video'}:
        return '<div class="img">' + ''.join(map(get_contents, x.get('parts', ''))) + '</div>'
    elif otype not in ('', 'ad', 'inline-newsletter', 'tabularData'):
        # Any other renderable role: emit its children in a neutral wrapper.
        return '<span>' + ''.join(map(get_contents, x.get('parts', ''))) + '</span>'
    return ''


class Bloomberg(BasicNewsRecipe):
    title = 'Bloomberg'
    language = 'en_US'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url', 'title'}
    masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
    description = (
        'Bloomberg delivers business and markets news, data, analysis, and video'
        ' to the world, featuring stories from Businessweek and Bloomberg News.'
    )
    oldest_article = 1.2  # days
    resolve_internal_links = True
    remove_empty_feeds = True
    cover_url = 'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ivUxvlPidC3M/v0/600x-1.jpg'

    remove_tags = [
        dict(name=['button', 'svg', 'meta', 'iframe']),
        dict(name='div', attrs={'id': ['bb-that', 'bb-nav']}),
        dict(attrs={'data-image-type': 'audio'}),
        classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer bplayer-container'),
    ]

    extra_css = '''
        .auth { font-size:small; font-weight:bold; }
        .subhead, .cap span { font-style:italic; color:#202020; }
        em, blockquote { color:#202020; }
        .cat { font-size:small; color:gray; }
        .img, .news-figure-caption-text { font-size:small; text-align:center; }
        .corr { font-size:small; font-style:italic; color:#404040; }
        .chart { font-size:small; }
        .news-figure-credit { font-size:small; text-align:center; color:#202020; }
    '''

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles for the past 12 hours',
            'default': str(oldest_article),
        }
    }

    def parse_index(self):
        '''Build the feed list from Bloomberg's mobile API.

        Walks the search navigation tree for sections, then fetches each
        section's module list and collects article/interactive stories newer
        than ``oldest_article`` days. Returns the usual calibre structure:
        a list of ``(section_title, [article_dict, ...])`` tuples.
        '''
        # Allow the 'days' recipe option to override the cutoff.
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

        inx = 'https://cdn-mobapi.bloomberg.com'
        sec = self.index_to_soup(inx + '/wssmobile/v1/navigation/bloomberg_app/search-v2', raw=True)
        sec_data = json.loads(sec)['searchNav']

        feeds = []
        for i in sec_data:
            for sects in i['items']:
                section = sects['title']
                sec_slug = sects['links']['self']['href']
                self.log(section)

                articles = []
                art_soup = self.index_to_soup(inx + sec_slug, raw=True)
                for arts in json.loads(art_soup)['modules']:
                    if arts['stories']:
                        for x in arts['stories']:
                            if x.get('type', '') in {'article', 'interactive'}:
                                # 'published' is an epoch timestamp; adding time.timezone
                                # converts it for the local fromtimestamp() below —
                                # NOTE(review): assumes the API reports UTC seconds.
                                dt = datetime.fromtimestamp(x['published'] + time.timezone)
                                if (datetime.now() - dt) > timedelta(self.oldest_article):
                                    continue
                                title = x['title']
                                desc = x['autoGeneratedSummary']
                                url = inx + '/wssmobile/v1/stories/' + x['internalID']
                                self.log(' ', title, '\n\t', desc)
                                articles.append({'title': title, 'description': desc, 'url': url})
                feeds.append((section, articles))
        return feeds
' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '
' body = '' if data.get('type', '') == 'interactive': body += '' + 'This is an interactive article, which is supposed to be read in a browser.' + '
' # body_data = data['components'] # for x in body_data: # body += get_contents(x) b_data = self.index_to_soup('https://cdn-mobapi.bloomberg.com/wssmobile/v1/bw/news/stories/' + url.split('/')[-1], raw=True) body += json.loads(b_data)['html'] if 'ledeImage' in data and data['ledeImage'] is not None: x = data['ledeImage'] if x['imageURLs']['default'].rsplit('/', 1)[0] not in body: lede = '