diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 7d07202200..e7a47d28f6 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -1,196 +1,141 @@ import json -import random import time -from collections import defaultdict +from datetime import datetime, timedelta -from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes -from html5_parser import parse +from calibre.ebooks.BeautifulSoup import BeautifulSoup +from calibre.web.feeds.news import BasicNewsRecipe +# https://www.bloomberg.com/magazine/businessweek/24_12 +# Set past_edition to edition id, which is '24_12'. +past_edition = None + def get_contents(x): if x == '': return '' - otype = x.get('type', '') - if otype == 'text': - if 'attributes' in x: - if 'strong' in x['attributes']: - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - if 'emphasis' in x['attributes']: - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + otype = x.get('role', '') + if otype == 'p': + return '

' + ''.join(map(get_contents, x.get('parts', ''))) + '

' + elif otype == 'text': + if 'style' in x: + return '<' + x['style'] + '>' + ''.join(map(get_contents, x.get('parts', ''))) + '' + return x.get('text', '') + ''.join(map(get_contents, x.get('parts', ''))) elif otype == 'br': return '
' - elif otype == 'paragraph': - return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

' - elif otype == 'heading': - return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

' - elif otype == 'list': - return '' - elif otype == 'listItem': - return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
  • ' - elif otype == 'quote': - return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
    ' - elif otype == 'media': - if x['subType'] == 'photo': - return '
    {} {}
    '.format( - x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit']) - elif x['subType'] == 'chart': - if x['data'] and x['data']['chart']: - return '
    '.format(x['data']['chart']['fallback']) - elif otype == 'link': - if 'data' in x: - if 'href' in x['data']: - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - elif otype == 'entity': - if x['subType'] == 'story': - if x['data'] and x['data']['link'] and x['data']['link']['destination']: - if 'web' in x['data']['link']['destination']: - return '' + x.get('value', '') + ''.join( - map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' - elif otype in {'div', 'callout'}: - return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
    ' - elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): - if any(b in x for b in ['value', 'content']): - return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + elif otype == 'anchor': + return '' + ''.join(map(get_contents, x.get('parts', ''))) + '' + elif otype == 'h3': + return '

    ' + ''.join(map(get_contents, x.get('parts', ''))) + '

    ' + elif otype == 'ul': + return '' + elif otype == 'li': + return '
  • ' + ''.join(map(get_contents, x.get('parts', ''))) + '
  • ' + elif otype == 'webview': + return '
    ' + x['html'] + ''.join(map(get_contents, x.get('parts', ''))) + elif otype == 'blockquote': + return '
    ' + ''.join(map(get_contents, x.get('parts', ''))) + '
    ' + elif otype in {'image', 'video'}: + return '
    {}
    \n'.format( + x['imageURLs']['default'], x['caption'] + ' ' + x['credit'] + '' + ) + elif otype in {'correction', 'disclaimer'}: + return '

    ' + ''.join(map(get_contents, x.get('parts', ''))) + '

    ' + elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): + return '' + ''.join(map(get_contents, x.get('parts', ''))) + '' return '' + class Bloomberg(BasicNewsRecipe): - title = u'Bloomberg Businessweek' + title = 'Bloomberg Businessweek' language = 'en' __author__ = 'unkn0wn' no_stylesheets = True - use_embedded_content = False remove_attributes = ['style', 'height', 'width'] + encoding = 'utf-8' ignore_duplicate_articles = {'url'} - resolve_internal_links = True masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg' description = ( 'Bloomberg Businessweek helps global leaders stay ahead with insights and in-depth analysis on the people,' ' companies, events, and trends shaping today\'s complex, global economy.' ) - - simultaneous_downloads = 1 + remove_empty_feeds = True extra_css = ''' - .auth {font-size:small; font-weight:bold;} - .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;} - .subhead, .cap span {font-style:italic; color:#404040;} - em, .col {color:#202020;} - .cat {font-size:small; color:gray;} - .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;} + .auth { font-size:small; font-weight:bold; } + .subhead, .cap span { font-style:italic; color:#202020; } + em, blockquote { color:#202020; } + .cat { font-size:small; color:gray; } + .img, .news-figure-caption-text { font-size:small; text-align:center; } + .corr { font-size:small; font-style:italic; color:#404040; } + .chart { font-size:small; } .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' - remove_tags = [ - dict(name=['button', 'svg']), - dict(name='div', attrs={'id':['bb-that', 'bb-nav']}), - classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer') - ] - - def get_browser(self, *a, **kw): - kw['user_agent'] = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/119.0' - br = BasicNewsRecipe.get_browser(self, *a, **kw) - br.set_handle_redirect(False) - return br - def parse_index(self): - soup = self.index_to_soup('https://www.bloomberg.com/businessweek') - bw = soup.find('a', href=lambda x: x and x.startswith('/magazine/businessweek/')) - edition = 'https://www.bloomberg.com' + bw['href'] - self.log('Downloading ', edition) - self.cover_url = bw.find('img')['src'].replace('25x19', '600x800') - soup = self.index_to_soup(edition) - if timefmt := soup.find(**prefixed_classes('styles_TableOfContentsTitle__')): - self.timefmt = ' [' + (self.tag_to_string(timefmt).replace(' Issue', '')).strip() + ']' + inx = 'https://cdn-mobapi.bloomberg.com' + sec = self.index_to_soup(inx + '/wssmobile/v1/bw/news/list?limit=1', raw=True) + id = json.loads(sec)['magazines'][0]['id'] + if past_edition: + id = past_edition + edit = self.index_to_soup(inx + '/wssmobile/v1/bw/news/week/' + id, raw=True) + d = json.loads(edit) + self.timefmt = ' [' + d['date'] + ']' + self.cover_url = d['image']['thumbUrl'] - feeds_dict = defaultdict(list) + feeds = [] - sec = '' - toc = soup.find('section', attrs={'id':'toc-archive-businessweek'}) - for div in toc.findAll(**prefixed_classes('MagazinePageMagazineArchive_itemContainer__')): - h3 = div.find(**prefixed_classes('MagazinePageMagazineArchive_itemSection__')) - if h3 and h3.text: - sec = self.tag_to_string(h3) - self.log(sec) - a = div.find(**prefixed_classes('MagazinePageMagazineArchive_storyLink__')) - url = a['href'] - if url.startswith('http') is False: - url = 'https://www.bloomberg.com' + a['href'] - title = self.tag_to_string(a) - byl = div.find(**prefixed_classes('Byline_phoenix__')) - desc = self.tag_to_string(byl) - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - feeds_dict[sec].append({"title": title, "url": url, "description": desc}) - return [(sec, articles) for sec, articles in feeds_dict.items()] + for i in d['modules']: + section = i['title'] + self.log(section) + + articles = [] + + for x in i['articles']: + title = x['title'] + url = inx + '/wssmobile/v1/stories/' + x['id'] + self.log('\t', title) + articles.append({'title': title, 'url': url}) + feeds.append((section, articles)) + return feeds def preprocess_raw_html(self, raw, *a): - root = parse(raw) - m = root.xpath('//script[@data-component-props="ArticleBody"]') - if not m: - m = root.xpath('//script[@data-component-props="FeatureBody"]') - if not m: - m2 = root.xpath('//script[@id="__NEXT_DATA__"]') - if not m2: - return raw - if m: - data = json.loads(m[0].text) - data = data['story'] + data = json.loads(raw) - else: - data = json.loads(m2[0].text) - if 'story' in data['props']['pageProps']: - data = data['props']['pageProps']['story'] - else: - return raw - - title = '

    ' + data['headline'] + '

    ' + title = '

    '.format(data['longURL']) + data['title'] + '

    ' cat = subhead = lede = auth = caption = '' if 'primaryCategory' in data and data['primaryCategory'] is not None: - cat = '

    ' + data['primaryCategory'] + '

    ' + cat = '
    ' + data['primaryCategory'] + '
    ' - if len(data['abstract']) != 0 and len(data['abstract']) == 2: - subhead = '

    ' + data['abstract'][0] + '

    ' + data['abstract'][1] + '

    ' - else: - if 'summary' in data: - subhead = '

    ' + data['summary'] + '

    ' + if 'abstract' in data and data['abstract'] and data['abstract'] is not None: + subhead = '
    ' + elif 'summary' in data and data['summary']: + subhead = '

    ' + data['summary'] + '

    ' if 'byline' in data and data['byline'] is not None: - auth = '
    ' + data['byline']\ - + ' | ' + data['publishedAt'][:-14] + '
    ' + dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone) + auth = '

    ' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '

    ' - if 'ledeImageUrl' in data and data['ledeImageUrl'] is not None: - lede = '

    '.format(data['ledeImageUrl']) + if 'ledeImage' in data and data['ledeImage'] is not None: + x = data['ledeImage'] + lede = '

    {}
    \n'.format( + x['imageURLs']['default'], x['caption'] + ' ' + x['credit'] + '' + ) - if 'ledeDescription' in data and data['ledeDescription'] is not None: - caption = '' + data['ledeDescription'] + '' - else: - if 'lede' in data and data['lede'] is not None: - if 'alt' in data['lede'] and data['lede']['alt'] is not None: - caption = '' + data['lede']['alt'] + '' - - if m: - time.sleep(3) - body = data['body'] - else: - body = '' - body_data = data['body']['content'] - for x in body_data: - body += get_contents(x) - pause = random.choice((5, 6, 7, 8, 9)) - self.log('Delay: ', pause, ' seconds') - time.sleep(pause) - return '' + cat + title + subhead + auth + lede + caption + '
    ' + body + '
    ' + body = '' + if data.get('type', '') == 'interactive': + body += '

    ' + 'This is an interactive article, which is supposed to be read in a browser.' + '

    ' + body_data = data['components'] + for x in body_data: + body += get_contents(x) + html = '' + cat + title + subhead + auth + lede + caption + '
    ' + body + '
    ' + return BeautifulSoup(html).prettify() def preprocess_html(self, soup): + for h3 in soup.findAll('h3'): + h3.name = 'h4' for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}): icon.decompose() for div in soup.findAll('div', attrs={'class':'chart'}): @@ -205,3 +150,9 @@ class Bloomberg(BasicNewsRecipe): for img in soup.findAll('img', attrs={'src':lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}): img['src'] = img['src'].replace('-1x-1', '750x-1') return soup + + def populate_article_metadata(self, article, soup, first): + article.url = soup.find('h1')['title'] + article.summary = self.tag_to_string(soup.find('div', attrs={'class':'subhead'})) + article.text_summary = article.summary + diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index 6ae76752a2..ec717b46b5 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -19,7 +19,7 @@ def get_contents(x): elif otype == 'br': return '
    ' elif otype == 'anchor': - return '' + ''.join(map(get_contents, x.get('parts', ''))) + '' + return '' + ''.join(map(get_contents, x.get('parts', ''))) + '' elif otype == 'h3': return '

    ' + ''.join(map(get_contents, x.get('parts', ''))) + '

    ' elif otype == 'ul': @@ -30,11 +30,11 @@ def get_contents(x): return '
    ' + x['html'] + ''.join(map(get_contents, x.get('parts', ''))) elif otype == 'blockquote': return '
    ' + ''.join(map(get_contents, x.get('parts', ''))) + '
    ' - elif otype == ('image' or 'video'): + elif otype in {'image', 'video'}: return '
    {}
    \n'.format( x['imageURLs']['default'], x['caption'] + ' ' + x['credit'] + '' ) - elif otype == ('correction' or 'disclaimer'): + elif otype in {'correction', 'disclaimer'}: return '

    ' + ''.join(map(get_contents, x.get('parts', ''))) + '

    ' elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): @@ -44,7 +44,7 @@ def get_contents(x): class Bloomberg(BasicNewsRecipe): title = u'Bloomberg' - language = 'en' + language = 'en_US' __author__ = 'unkn0wn' no_stylesheets = True remove_attributes = ['style', 'height', 'width'] @@ -89,7 +89,7 @@ class Bloomberg(BasicNewsRecipe): for arts in json.loads(art_soup)['modules']: if arts['stories']: for x in arts['stories']: - if x.get('type', '') == 'article': + if x.get('type', '') in {'article', 'interactive'}: dt = datetime.fromtimestamp(x['published'] + time.timezone) if (datetime.now() - dt) > timedelta(self.oldest_article): continue @@ -109,7 +109,7 @@ class Bloomberg(BasicNewsRecipe): cat = subhead = lede = auth = caption = '' if 'primaryCategory' in data and data['primaryCategory'] is not None: - cat = '

    ' + data['primaryCategory'] + '

    ' + cat = '
    ' + data['primaryCategory'] + '
    ' if 'abstract' in data and data['abstract'] and data['abstract'] is not None: subhead = '
    ' @@ -127,6 +127,8 @@ class Bloomberg(BasicNewsRecipe): ) body = '' + if data.get('type', '') == 'interactive': + body += '

    ' + 'This is an interactive article, which is supposed to be read in a browser.' + '

    ' body_data = data['components'] for x in body_data: body += get_contents(x)