import json
import random
import time

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse


def get_contents(x):
    # Recursively render one node of Bloomberg's JSON body tree as HTML.
    if x == '':
        return ''
    otype = x.get('type', '')
    if otype == 'text':
        if 'attributes' in x:
            if 'strong' in x['attributes']:
                return '<strong>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</strong>'
            if 'emphasis' in x['attributes']:
                return '<em>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</em>'
            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
        return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
    elif otype == 'br':
        return '<br>'
    elif otype == 'paragraph':
        return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</p>'
    elif otype == 'heading':
        return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</h3>'
    elif otype == 'list':
        return '<ul>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</ul>'
    elif otype == 'listItem':
        return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</li>'
    elif otype == 'quote':
        return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
    elif otype == 'media':
        if x['subType'] == 'photo':
            return '<div class="img"><img src="{}"><div class="cap">{} <span>{}</span></div></div>'.format(
                x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
        elif x['subType'] == 'chart':
            if x['data'] and x['data']['chart']:
                return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
    elif otype == 'link':
        if 'data' in x:
            if 'href' in x['data']:
                return '<a href="' + x['data']['href'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
        return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
    elif otype == 'entity':
        if x['subType'] == 'story':
            if x['data'] and x['data']['link'] and x['data']['link']['destination']:
                if 'web' in x['data']['link']['destination']:
                    return '<a href="' + x['data']['link']['destination']['web'] + '">' + x.get('value', '') + ''.join(
                        map(get_contents, x.get('content', ''))) + '</a>'
                return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
        return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
    elif otype in {'div', 'callout'}:
        return '<div>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</div>'
    elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
        if any(b in x for b in ['value', 'content']):
            return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
    return ''
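
# A quick sanity check of get_contents() on a hand-made node (this sample
# dict is illustrative only, not taken from a real Bloomberg payload):
#
#   get_contents({'type': 'paragraph', 'content': [
#       {'type': 'text', 'value': 'Shares ', 'attributes': ['strong']},
#       {'type': 'text', 'value': 'rose.'},
#   ]})
#
# returns '<p><strong>Shares </strong>rose.</p>'.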

class Bloomberg(BasicNewsRecipe):
    title = u'Bloomberg'
    language = 'en'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url', 'title'}
    masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
    description = ('Bloomberg delivers business and markets news, data, analysis, and video'
                   ' to the world, featuring stories from Businessweek and Bloomberg News.')
    simultaneous_downloads = 1

    extra_css = '''
        .auth {font-size:small; font-weight:bold;}
        .time, .chart {font-size:small;}
        .subhead, .cap span {font-style:italic; color:#404040;}
        em, .col {color:#202020;}
        .cat {font-size:small; color:gray;}
        .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
        .news-figure-credit {font-size:small; text-align:center; color:#202020;}
    '''

    articles_are_obfuscated = True
    resolve_internal_links = True

    def get_obfuscated_article(self, url):
        # Feed items point at Google News; resolve the redirect page, pull out
        # the first bloomberg.com link and download that instead.
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        link = soup.find('a', attrs={'href': lambda x: x and x.startswith('https://www.bloomberg.com')})
        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/', 'podcast'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.abort_article('Aborting Video article')
        self.log('Found link: ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name

    def get_browser(self, *a, **kw):
        kw['user_agent'] = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/119.0'
        br = BasicNewsRecipe.get_browser(self, *a, **kw)
        # With redirects disabled, br.open() raises on the Google News 3xx
        # response; get_obfuscated_article() reads the Location header from
        # that exception instead of following it.
        br.set_handle_redirect(False)
        return br

    feeds = [
        ('Features',
         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Opinion',
         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Newsletters',
         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
        ('News',
         'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Others',
         'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en'),
    ]
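
    # preprocess_raw_html() below rebuilds each article from JSON embedded in
    # the page. A rough sketch of the story fields it relies on, inferred only
    # from the key accesses in the code (the rest of Bloomberg's schema is not
    # assumed):
    #
    #   story = {
    #       'url': '/news/articles/...',            # or an absolute URL
    #       'headline': '...',
    #       'primaryCategory': '...',
    #       'abstract': ['bullet one', 'bullet two'],
    #       'summary': '...',
    #       'byline': '...',
    #       'publishedAt': '2023-11-27T12:34:56.000Z',
    #       'ledeImageUrl': 'https://...',
    #       'ledeDescription': '...',
    #       'body': '<html fragment>',              # ArticleBody/FeatureBody pages
    #       # or, on __NEXT_DATA__ pages:
    #       'body': {'content': [...]},             # nodes for get_contents()
    #   }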

    def preprocess_raw_html(self, raw, *a):
        root = parse(raw)
        m = root.xpath('//script[@data-component-props="ArticleBody"]')
        if not m:
            m = root.xpath('//script[@data-component-props="FeatureBody"]')
            if not m:
                m2 = root.xpath('//script[@id="__NEXT_DATA__"]')

        if m:
            data = json.loads(m[0].text)
            data = data['story']
        elif m2:
            data = json.loads(m2[0].text)
            data = data['props']['pageProps']['story']

        art_url = data['url']
        if not art_url.startswith('http'):
            art_url = 'https://www.bloomberg.com' + art_url

        # The article URL is stashed in the h1's title attribute so that
        # populate_article_metadata() can recover it later.
        title = '<h1 title="{}">'.format(art_url) + data['headline'] + '</h1>'

        cat = subhead = lede = auth = caption = ''

        if 'primaryCategory' in data and data['primaryCategory'] is not None:
            cat = '<p class="cat">' + data['primaryCategory'] + '</p>'

        if len(data['abstract']) == 2:
            subhead = '<div class="subhead"><p>' + data['abstract'][0] + '</p><p>' + data['abstract'][1] + '</p></div>'
        elif 'summary' in data:
            subhead = '<div class="subhead"><p>' + data['summary'] + '</p></div>'

        if 'byline' in data and data['byline'] is not None:
            auth = ('<p><span class="auth">' + data['byline'] +
                    '</span> | <span class="time">' + data['publishedAt'][:-14] + '</span></p>')  # keep just the date

        if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None:
            lede = '<p class="img"><img src="{}"></p>'.format(data['ledeImageUrl'])

        if 'ledeDescription' in data and data['ledeDescription'] is not None:
            caption = '<span class="cap">' + data['ledeDescription'] + '</span>'
        elif 'lede' in data and data['lede'] is not None:
            if 'alt' in data['lede'] and data['lede']['alt'] is not None:
                caption = '<span class="cap">' + data['lede']['alt'] + '</span>'

        if m:
            time.sleep(3)
            body = data['body']
        elif m2:
            body = ''
            for x in data['body']['content']:
                body += get_contents(x)
            # Randomise the pause between article downloads a little.
            pause = random.choice((5, 6, 7, 8, 9))
            self.log('Delay: ', pause, ' seconds')
            time.sleep(pause)

        return ('<html><body>' + cat + title + subhead + auth + lede + caption +
                '<div>' + body + '</div></body></html>')

    def preprocess_html(self, soup):
        for icon in soup.findAll('img', attrs={'class': 'video-player__play-icon'}):
            icon.decompose()
        for div in soup.findAll('div', attrs={'class': 'chart'}):
            nos = div.find('noscript')
            if nos:
                nos.name = 'span'
        for img in soup.findAll('img', attrs={'data-native-src': True}):
            if 'videos' not in img['data-native-src']:
                img['src'] = img['data-native-src']
            else:
                img['src'] = ''
        for img in soup.findAll('img', attrs={'src': lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}):
            # Request a larger rendition of thumbnail images.
            img['src'] = img['src'].replace('-1x-1', '750x-1')
        return soup

    def populate_article_metadata(self, article, soup, first):
        article.url = soup.find('h1')['title']
        article.summary = self.tag_to_string(soup.find('div', attrs={'class': 'subhead'}))
        article.text_summary = article.summary
        article.title = article.title.replace(' - Bloomberg', '')
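
# To try changes to this recipe locally, calibre can run the file directly
# (standard recipe workflow; assumes the file is saved as Bloomberg.recipe):
#
#   ebook-convert Bloomberg.recipe .epub --test -vv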