diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 9c62f4871b..3ccef3a289 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -1,64 +1,44 @@
 import json
-import random
 import time
+from datetime import datetime, timedelta
 
-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe
-from html5_parser import parse
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 
 def get_contents(x):
     if x == '':
         return ''
-    otype = x.get('type', '')
-    if otype == 'text':
-        if 'attributes' in x:
-            if 'strong' in x['attributes']:
-                return '<strong>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</strong>'
-            if 'emphasis' in x['attributes']:
-                return '<em>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</em>'
-            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
-        return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
+    otype = x.get('role', '')
+    if otype == 'p':
+        return '<p>' + ''.join(map(get_contents, x.get('parts', ''))) + '</p>'
+    elif otype == 'text':
+        if 'style' in x:
+            return '<' + x['style'] + '>' + ''.join(map(get_contents, x.get('parts', ''))) + '</' + x['style'] + '>'
+        return x.get('text', '') + ''.join(map(get_contents, x.get('parts', '')))
     elif otype == 'br':
         return '<br>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '<br>'
-    elif otype == 'heading':
-        return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</h3>'
-    elif otype == 'media':
-        if x['subType'] == 'photo':
-            return '<div class="img"><img src="{}"><div class="cap">{}</div></div>'.format(x['data']['photo']['src'], x['data']['photo']['caption'])
+    elif otype == 'quote':
+        return '<blockquote>' + ''.join(map(get_contents, x.get('parts', ''))) + '</blockquote>'
+    elif otype == ('image' or 'video'):
+        return '<div class="img">' + ''.join(map(get_contents, x.get('parts', ''))) + '</div>'
+    elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
+        return '<i>' + ''.join(map(get_contents, x.get('parts', ''))) + '</i>'
     return ''
@@ -67,126 +47,94 @@ class Bloomberg(BasicNewsRecipe):
     language = 'en'
     __author__ = 'unkn0wn'
     no_stylesheets = True
-    use_embedded_content = False
     remove_attributes = ['style', 'height', 'width']
+    encoding = 'utf-8'
     ignore_duplicate_articles = {'url', 'title'}
     masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
-    description = ('Bloomberg delivers business and markets news, data, analysis, and video'
-                   ' to the world, featuring stories from Businessweek and Bloomberg News.')
-
-    simultaneous_downloads = 1
+    description = (
+        'Bloomberg delivers business and markets news, data, analysis, and video'
+        ' to the world, featuring stories from Businessweek and Bloomberg News.'
+    )
+    oldest_article = 1  # days
+    resolve_internal_links = True
+    remove_empty_feeds = True
 
     extra_css = '''
-        .auth {font-size:small; font-weight:bold;}
-        .time, .chart {font-size:small;}
-        .subhead, .cap span {font-style:italic; color:#404040;}
-        em, .col {color:#202020;}
-        .cat {font-size:small; color:gray;}
-        .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
+        .auth { font-size:small; font-weight:bold; }
+        .subhead, .cap span { font-style:italic; color:#202020; }
+        em, blockquote { color:#202020; }
+        .cat { font-size:small; color:gray; }
+        .img, .news-figure-caption-text { font-size:small; text-align:center; }
+        .corr { font-size:small; font-style:italic; color:#404040; }
+        .chart { font-size:small; }
         .news-figure-credit {font-size:small; text-align:center; color:#202020;}
     '''
 
-    articles_are_obfuscated = True
-    resolve_internal_links = True
+    def parse_index(self):
+        inx = 'https://cdn-mobapi.bloomberg.com'
+        sec = self.index_to_soup(inx + '/wssmobile/v1/navigation/bloomberg_app/search-v2', raw=True)
+        sec_data = json.loads(sec)['searchNav'][0]['items']
 
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        try:
-            br.open(url)
-        except Exception as e:
-            url = e.hdrs.get('location')
-        soup = self.index_to_soup(url)
-        link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.bloomberg.com')})
-        skip_sections =[  # add sections you want to skip
-            '/video/', '/videos/', '/media/', 'podcast'
-        ]
-        if any(x in link['href'] for x in skip_sections):
-            self.abort_article('Aborting Video article')
-        self.log('Found link: ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
+        feeds = []
 
-    def get_browser(self, *a, **kw):
-        kw['user_agent'] = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/119.0'
-        br = BasicNewsRecipe.get_browser(self, *a, **kw)
-        br.set_handle_redirect(False)
-        return br
+        for sects in sec_data:
+            section = sects['title']
+            sec_slug = sects['links']['self']['href']
+            self.log(section)
 
-    feeds = [
-        ('Features',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Newsletters',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('News',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
-        ('Others',
-         'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en')
-    ]
+            articles = []
+
+            art_soup = self.index_to_soup(inx + sec_slug, raw=True)
+            for arts in json.loads(art_soup)['modules']:
+                if arts['stories']:
+                    for x in arts['stories']:
+                        if x.get('type', '') == 'article':
+                            dt = datetime.fromtimestamp(x['published'] + time.timezone)
+                            if (datetime.now() - dt) > timedelta(self.oldest_article):
+                                continue
+                            title = x['title']
+                            desc = x['autoGeneratedSummary']
+                            url = inx + '/wssmobile/v1/stories/' + x['internalID']
+                            self.log(' ', title, '\n\t', desc)
+                            articles.append({'title': title, 'description':desc, 'url': url})
+            feeds.append((section, articles))
+        return feeds
 
     def preprocess_raw_html(self, raw, *a):
-        root = parse(raw)
-        m = root.xpath('//script[@data-component-props="ArticleBody"]')
-        if not m:
-            m = root.xpath('//script[@data-component-props="FeatureBody"]')
-        if not m:
-            m2 = root.xpath('//script[@id="__NEXT_DATA__"]')
+        data = json.loads(raw)
 
-        if m:
-            data = json.loads(m[0].text)
-            data = data['story']
-
-        elif m2:
-            data = json.loads(m2[0].text)
-            data = data['props']['pageProps']['story']
-
-        art_url = data['url']
-        if not art_url.startswith('http'):
-            art_url = 'https://www.bloomberg.com' + art_url
-
-        title = '<h1>' + data['headline'] + '</h1>'
-        cat = '<p class="cat">' + data['primaryCategory'] + '</p>'
+        title = '<h1>' + data['title'] + '</h1>'
+        cat = '<div class="cat">' + data['primaryCategory'] + '</div>'
 
-        if len(data['abstract']) != 0 and len(data['abstract']) == 2:
-            subhead = '<div class="subhead"><p>' + data['abstract'][0] + '</p><p>' + data['abstract'][1] + '</p></div>'
-        else:
-            if 'summary' in data and data['summary'] is not None:
-                subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
+        if 'summary' in data and data['summary'] is not None:
+            subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'
 
+        dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone)
+        auth = '<div class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</div>'
 
-        if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None:
-            lede = '<p class="img"><img src="{}">'.format(data['ledeImageUrl'])
+        if 'ledeImage' in data and data['ledeImage'] is not None:
+            x = data['ledeImage']
+            lede = '