diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 7d07202200..e7a47d28f6 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -1,196 +1,141 @@ import json -import random import time -from collections import defaultdict +from datetime import datetime, timedelta -from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes -from html5_parser import parse +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.news import BasicNewsRecipe +# https://www.bloomberg.com/magazine/businessweek/24_12 +# Set past_edition to edition id, which is '24_12'. +past_edition = None + def get_contents(x): if x == '': return '' - otype = x.get('type', '') - if otype == 'text': - if 'attributes' in x: - if 'strong' in x['attributes']: - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - if 'emphasis' in x['attributes']: - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + otype = x.get('role', '') + if otype == 'p': + return '
' + ''.join(map(get_contents, x.get('parts', ''))) + '
' + elif otype == 'text': + if 'style' in x: + return '<' + x['style'] + '>' + ''.join(map(get_contents, x.get('parts', ''))) + '' + x['style'] + '>' + return x.get('text', '') + ''.join(map(get_contents, x.get('parts', ''))) elif otype == 'br': return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
' - elif otype == 'heading': - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - elif otype == 'media': - if x['subType'] == 'photo': - return '
' + ''.join(map(get_contents, x.get('parts', ''))) + '' + elif otype in {'image', 'video'}: + return '
' + ''.join(map(get_contents, x.get('parts', ''))) + '
' + elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): + return '' + ''.join(map(get_contents, x.get('parts', ''))) + '' return '' + class Bloomberg(BasicNewsRecipe): - title = u'Bloomberg Businessweek' + title = 'Bloomberg Businessweek' language = 'en' __author__ = 'unkn0wn' no_stylesheets = True - use_embedded_content = False remove_attributes = ['style', 'height', 'width'] + encoding = 'utf-8' ignore_duplicate_articles = {'url'} - resolve_internal_links = True masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg' description = ( 'Bloomberg Businessweek helps global leaders stay ahead with insights and in-depth analysis on the people,' ' companies, events, and trends shaping today\'s complex, global economy.' ) - - simultaneous_downloads = 1 + remove_empty_feeds = True extra_css = ''' - .auth {font-size:small; font-weight:bold;} - .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;} - .subhead, .cap span {font-style:italic; color:#404040;} - em, .col {color:#202020;} - .cat {font-size:small; color:gray;} - .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;} + .auth { font-size:small; font-weight:bold; } + .subhead, .cap span { font-style:italic; color:#202020; } + em, blockquote { color:#202020; } + .cat { font-size:small; color:gray; } + .img, .news-figure-caption-text { font-size:small; text-align:center; } + .corr { font-size:small; font-style:italic; color:#404040; } + .chart { font-size:small; } .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' - remove_tags = [ - dict(name=['button', 'svg']), - dict(name='div', attrs={'id':['bb-that', 'bb-nav']}), - classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer') - ] - - def get_browser(self, *a, **kw): - kw['user_agent'] = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/119.0' - br = BasicNewsRecipe.get_browser(self, *a, **kw) - br.set_handle_redirect(False) - return br - def parse_index(self): - soup = self.index_to_soup('https://www.bloomberg.com/businessweek') - bw = soup.find('a', href=lambda x: x and x.startswith('/magazine/businessweek/')) - edition = 'https://www.bloomberg.com' + bw['href'] - self.log('Downloading ', edition) - self.cover_url = bw.find('img')['src'].replace('25x19', '600x800') - soup = self.index_to_soup(edition) - if timefmt := soup.find(**prefixed_classes('styles_TableOfContentsTitle__')): - self.timefmt = ' [' + (self.tag_to_string(timefmt).replace(' Issue', '')).strip() + ']' + inx = 'https://cdn-mobapi.bloomberg.com' + sec = self.index_to_soup(inx + '/wssmobile/v1/bw/news/list?limit=1', raw=True) + id = json.loads(sec)['magazines'][0]['id'] + if past_edition: + id = past_edition + edit = self.index_to_soup(inx + '/wssmobile/v1/bw/news/week/' + id, raw=True) + d = json.loads(edit) + self.timefmt = ' [' + d['date'] + ']' + self.cover_url = d['image']['thumbUrl'] - feeds_dict = defaultdict(list) + feeds = [] - sec = '' - toc = soup.find('section', attrs={'id':'toc-archive-businessweek'}) - for div in toc.findAll(**prefixed_classes('MagazinePageMagazineArchive_itemContainer__')): - h3 = div.find(**prefixed_classes('MagazinePageMagazineArchive_itemSection__')) - if h3 and h3.text: - sec = self.tag_to_string(h3) - self.log(sec) - a = div.find(**prefixed_classes('MagazinePageMagazineArchive_storyLink__')) - url = a['href'] - if url.startswith('http') is False: - url = 'https://www.bloomberg.com' + a['href'] - title = self.tag_to_string(a) - byl = div.find(**prefixed_classes('Byline_phoenix__')) - desc = self.tag_to_string(byl) - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - feeds_dict[sec].append({"title": title, "url": url, "description": desc}) - return [(sec, articles) for sec, articles in feeds_dict.items()] + for i in d['modules']: + section = i['title'] + self.log(section) + + articles = [] + + for x in i['articles']: + title = x['title'] + url = inx + '/wssmobile/v1/stories/' + x['id'] + self.log('\t', title) + articles.append({'title': title, 'url': url}) + feeds.append((section, articles)) + return feeds def preprocess_raw_html(self, raw, *a): - root = parse(raw) - m = root.xpath('//script[@data-component-props="ArticleBody"]') - if not m: - m = root.xpath('//script[@data-component-props="FeatureBody"]') - if not m: - m2 = root.xpath('//script[@id="__NEXT_DATA__"]') - if not m2: - return raw - if m: - data = json.loads(m[0].text) - data = data['story'] + data = json.loads(raw) - else: - data = json.loads(m2[0].text) - if 'story' in data['props']['pageProps']: - data = data['props']['pageProps']['story'] - else: - return raw - - title = '' + data['primaryCategory'] + '
' + cat = '' + data['abstract'][0] + '
' + data['abstract'][1] + '
' + data['summary'] + '
' + data['summary'] + '
' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '
' - if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None: - lede = ''.format(data['ledeImageUrl'])
+ if 'ledeImage' in data and data['ledeImage'] is not None:
+ x = data['ledeImage']
+ lede = '
' + 'This is an interactive article, which is supposed to be read in a browser.' + '
' + body_data = data['components'] + for x in body_data: + body += get_contents(x) + html = '' + cat + title + subhead + auth + lede + caption + '' + ''.join(map(get_contents, x.get('parts', ''))) + '' - elif otype == ('image' or 'video'): + elif otype in {'image', 'video'}: return '
' + ''.join(map(get_contents, x.get('parts', ''))) + '
' elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): @@ -44,7 +44,7 @@ def get_contents(x): class Bloomberg(BasicNewsRecipe): title = u'Bloomberg' - language = 'en' + language = 'en_US' __author__ = 'unkn0wn' no_stylesheets = True remove_attributes = ['style', 'height', 'width'] @@ -89,7 +89,7 @@ class Bloomberg(BasicNewsRecipe): for arts in json.loads(art_soup)['modules']: if arts['stories']: for x in arts['stories']: - if x.get('type', '') == 'article': + if x.get('type', '') in {'article', 'interactive'}: dt = datetime.fromtimestamp(x['published'] + time.timezone) if (datetime.now() - dt) > timedelta(self.oldest_article): continue @@ -109,7 +109,7 @@ class Bloomberg(BasicNewsRecipe): cat = subhead = lede = auth = caption = '' if 'primaryCategory' in data and data['primaryCategory'] is not None: - cat = '' + data['primaryCategory'] + '
' + cat = '' + 'This is an interactive article, which is supposed to be read in a browser.' + '
' body_data = data['components'] for x in body_data: body += get_contents(x)