This commit is contained in:
Kovid Goyal 2024-07-17 12:21:52 +05:30
commit b04bc87f66
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 49 additions and 25 deletions

View File

@@ -3,7 +3,7 @@ import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes
# https://www.bloomberg.com/magazine/businessweek/24_12 # https://www.bloomberg.com/magazine/businessweek/24_12
@@ -61,6 +61,13 @@ class Bloomberg(BasicNewsRecipe):
) )
remove_empty_feeds = True remove_empty_feeds = True
remove_tags = [
dict(name=['button', 'svg', 'meta']),
dict(name='div', attrs={'id':['bb-that', 'bb-nav']}),
dict(attrs={'data-image-type':'audio'}),
classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer')
]
extra_css = ''' extra_css = '''
.auth { font-size:small; font-weight:bold; } .auth { font-size:small; font-weight:bold; }
.subhead, .cap span { font-style:italic; color:#202020; } .subhead, .cap span { font-style:italic; color:#202020; }
@@ -99,7 +106,7 @@ class Bloomberg(BasicNewsRecipe):
feeds.append((section, articles)) feeds.append((section, articles))
return feeds return feeds
def preprocess_raw_html(self, raw, *a): def preprocess_raw_html(self, raw, url):
data = json.loads(raw) data = json.loads(raw)
title = '<h1 title="{}">'.format(data['longURL']) + data['title'] + '</h1>' title = '<h1 title="{}">'.format(data['longURL']) + data['title'] + '</h1>'
@@ -118,23 +125,28 @@ class Bloomberg(BasicNewsRecipe):
dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone) dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone)
auth = '<p class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</p>' auth = '<p class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</p>'
if 'ledeImage' in data and data['ledeImage'] is not None:
x = data['ledeImage']
lede = '<br><img src="{}"><div class="img">{}</div>\n'.format(
x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
)
body = '' body = ''
if data.get('type', '') == 'interactive': if data.get('type', '') == 'interactive':
body += '<p><em>' + 'This is an interactive article, which is supposed to be read in a browser.' + '</p></em>' body += '<p><em>' + 'This is an interactive article, which is supposed to be read in a browser.' + '</p></em>'
body_data = data['components'] # body_data = data['components']
for x in body_data: # for x in body_data:
body += get_contents(x) # body += get_contents(x)
b_data = self.index_to_soup('https://cdn-mobapi.bloomberg.com/wssmobile/v1/bw/news/stories/' + url.split('/')[-1], raw=True)
body += json.loads(b_data)['html']
if 'ledeImage' in data and data['ledeImage'] is not None:
x = data['ledeImage']
if x['imageURLs']['default'].rsplit('/', 1)[0] not in body:
lede = '<br><img src="{}"><div class="img">{}</div>\n'.format(
x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
)
html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>' html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
return BeautifulSoup(html).prettify() return BeautifulSoup(html).prettify()
def preprocess_html(self, soup): def preprocess_html(self, soup):
for h3 in soup.findAll('h3'): for h3 in soup.findAll(['h2', 'h3']):
h3.name = 'h4' h3.name = 'h4'
for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}): for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}):
icon.decompose() icon.decompose()
@@ -155,4 +167,3 @@ class Bloomberg(BasicNewsRecipe):
article.url = soup.find('h1')['title'] article.url = soup.find('h1')['title']
article.summary = self.tag_to_string(soup.find('div', attrs={'class':'subhead'})) article.summary = self.tag_to_string(soup.find('div', attrs={'class':'subhead'}))
article.text_summary = article.summary article.text_summary = article.summary

View File

@@ -3,7 +3,7 @@ import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes
def get_contents(x): def get_contents(x):
@@ -58,6 +58,14 @@ class Bloomberg(BasicNewsRecipe):
oldest_article = 1 # days oldest_article = 1 # days
resolve_internal_links = True resolve_internal_links = True
remove_empty_feeds = True remove_empty_feeds = True
cover_url = 'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ivUxvlPidC3M/v0/600x-1.jpg'
remove_tags = [
dict(name=['button', 'svg', 'meta']),
dict(name='div', attrs={'id':['bb-that', 'bb-nav']}),
dict(attrs={'data-image-type':'audio'}),
classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer')
]
extra_css = ''' extra_css = '''
.auth { font-size:small; font-weight:bold; } .auth { font-size:small; font-weight:bold; }
@@ -101,7 +109,7 @@ class Bloomberg(BasicNewsRecipe):
feeds.append((section, articles)) feeds.append((section, articles))
return feeds return feeds
def preprocess_raw_html(self, raw, *a): def preprocess_raw_html(self, raw, url):
data = json.loads(raw) data = json.loads(raw)
title = '<h1 title="{}">'.format(data['longURL']) + data['title'] + '</h1>' title = '<h1 title="{}">'.format(data['longURL']) + data['title'] + '</h1>'
@@ -120,23 +128,28 @@ class Bloomberg(BasicNewsRecipe):
dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone) dt = datetime.fromtimestamp(data['updatedAt'] + time.timezone)
auth = '<p class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</p>' auth = '<p class="auth">' + 'By ' + data['byline'] + ' | Updated on ' + dt.strftime('%b %d, %Y at %I:%M %p') + '</p>'
if 'ledeImage' in data and data['ledeImage'] is not None:
x = data['ledeImage']
lede = '<br><img src="{}"><div class="img">{}</div>\n'.format(
x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
)
body = '' body = ''
if data.get('type', '') == 'interactive': if data.get('type', '') == 'interactive':
body += '<p><em>' + 'This is an interactive article, which is supposed to be read in a browser.' + '</p></em>' body += '<p><em>' + 'This is an interactive article, which is supposed to be read in a browser.' + '</p></em>'
body_data = data['components'] # body_data = data['components']
for x in body_data: # for x in body_data:
body += get_contents(x) # body += get_contents(x)
b_data = self.index_to_soup('https://cdn-mobapi.bloomberg.com/wssmobile/v1/bw/news/stories/' + url.split('/')[-1], raw=True)
body += json.loads(b_data)['html']
if 'ledeImage' in data and data['ledeImage'] is not None:
x = data['ledeImage']
if x['imageURLs']['default'].rsplit('/', 1)[0] not in body:
lede = '<br><img src="{}"><div class="img">{}</div>\n'.format(
x['imageURLs']['default'], x['caption'] + '<i> ' + x['credit'] + '</i>'
)
html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>' html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
return BeautifulSoup(html).prettify() return BeautifulSoup(html).prettify()
def preprocess_html(self, soup): def preprocess_html(self, soup):
for h3 in soup.findAll('h3'): for h3 in soup.findAll(['h2', 'h3']):
h3.name = 'h4' h3.name = 'h4'
for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}): for icon in soup.findAll('img', attrs={'class':'video-player__play-icon'}):
icon.decompose() icon.decompose()