From e33efc1beadd3d3ee582a3dd9db47398fbfb73f5 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 10 Aug 2023 09:47:13 +0530
Subject: [PATCH 1/3] Update bloomberg-business-week.recipe
---
recipes/bloomberg-business-week.recipe | 71 ++++++++++++++++----------
1 file changed, 44 insertions(+), 27 deletions(-)
diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe
index bb3f7c3c37..9c898a2f94 100644
--- a/recipes/bloomberg-business-week.recipe
+++ b/recipes/bloomberg-business-week.recipe
@@ -1,45 +1,56 @@
from calibre.web.feeds.news import BasicNewsRecipe, classes
-from calibre import browser
from html5_parser import parse
import json
import random
import time
def get_contents(x):
+ if x == '':
+ return ''
otype = x.get('type', '')
if otype == 'text':
- return x.get('value', '')
+ if 'attributes' in x:
+ if 'strong' in x['attributes']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ if 'emphasis' in x['attributes']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
+ elif otype == 'br':
+ return '
'
elif otype == 'paragraph':
- return '
' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'heading':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'list':
- return '' + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'listItem':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
elif otype == 'quote':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'media':
if x['subType'] == 'photo':
return ''.format(
x['data']['photo']['src'], x['data']['photo']['caption'])
elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']:
- return ''.format(x['data']['chart']['fallback'])
+ return ''.format(x['data']['chart']['fallback'])
elif otype == 'link':
- if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']:
+ if 'data' in x:
if 'href' in x['data']:
- return '' + x['content'][0]['value'] + ''
- return '' + x['content'][0]['value'] + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
elif otype == 'entity':
- if x['content'] and x['content'][0] and x['content'][0]['value']:
- if x['subType'] == 'story':
- if x['data'] and x['data']['link'] and x['data']['link']['destination']:
- if 'web' in x['data']['link']['destination']:
- return '' + x['content'][0]['value'] + ''
- return '' + x['content'][0]['value'] + ''
- elif x['subType'] in ('person', 'security'):
- return '' + x['content'][0]['value'] + ''
+ if x['subType'] == 'story':
+ if x['data'] and x['data']['link'] and x['data']['link']['destination']:
+ if 'web' in x['data']['link']['destination']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
+ if any(b in x for b in ['value', 'content']):
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
return ''
@@ -53,26 +64,32 @@ class Bloomberg(BasicNewsRecipe):
ignore_duplicate_articles = {'url'}
resolve_internal_links = True
masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg'
+ description = (
+ 'Bloomberg Businessweek helps global leaders stay ahead with insights and in-depth analysis on the people,'
+ ' companies, events, and trends shaping today\'s complex, global economy.'
+ )
- # delay = 7 # seconds
- simultaneous_downloads = 3
+ simultaneous_downloads = 1
extra_css = '''
.auth {font-size:small; font-weight:bold;}
- .time, .chart {font-size:small;}
+ .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
.subhead {font-style:italic; color:#404040;}
+ i, .col {color:#202020;}
.cat {font-size:small; color:gray;}
- .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
+ .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
'''
remove_tags = [
+ dict(name=['button', 'svg']),
dict(name='div', attrs={'id':['bb-that', 'bb-nav']}),
- classes('twitter-logo bb-global-footer')
+ classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer')
]
- def get_browser(self):
- br = browser()
+ def get_browser(self, *a, **kw):
+ kw['user_agent'] = 'common_words/based'
+ br = BasicNewsRecipe.get_browser(self, *a, **kw)
br.set_handle_redirect(False)
return br
@@ -163,7 +180,7 @@ class Bloomberg(BasicNewsRecipe):
body = ''
body_data = data['body']['content']
for x in body_data:
- pause = random.choice((0.5, 1, 1.25))
+ pause = random.choice((0.25, 0.5, 0.75, 1))
time.sleep(pause)
body += get_contents(x)
return '' + cat + title + subhead + auth + lede + caption + '' + body + '
'
From 0d825ec9bcd5d0effb994a88614e76809da50a7e Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 10 Aug 2023 09:54:26 +0530
Subject: [PATCH 2/3] Update bloomberg.recipe
---
recipes/bloomberg.recipe | 65 ++++++++++++++++++++++++----------------
1 file changed, 39 insertions(+), 26 deletions(-)
diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 224fcac853..014a30cd8c 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre import browser
from html5_parser import parse
from calibre.ptempfile import PersistentTemporaryFile
import json
@@ -7,19 +6,28 @@ import random
import time
def get_contents(x):
+ if x == '':
+ return ''
otype = x.get('type', '')
if otype == 'text':
- return x.get('value', '')
+ if 'attributes' in x:
+ if 'strong' in x['attributes']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ if 'emphasis' in x['attributes']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
+ elif otype == 'br':
+ return '
'
elif otype == 'paragraph':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'heading':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'list':
- return '' + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'listItem':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
elif otype == 'quote':
- return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
'
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
'
elif otype == 'media':
if x['subType'] == 'photo':
return ''.format(
@@ -28,19 +36,22 @@ def get_contents(x):
if x['data'] and x['data']['chart']:
return ''.format(x['data']['chart']['fallback'])
elif otype == 'link':
- if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']:
+ if 'data' in x:
if 'href' in x['data']:
- return '' + x['content'][0]['value'] + ''
- return '' + x['content'][0]['value'] + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
elif otype == 'entity':
- if x['content'] and x['content'][0] and x['content'][0]['value']:
- if x['subType'] == 'story':
- if x['data'] and x['data']['link'] and x['data']['link']['destination']:
- if 'web' in x['data']['link']['destination']:
- return '' + x['content'][0]['value'] + ''
- return '' + x['content'][0]['value'] + ''
- elif x['subType'] in ('person', 'security'):
- return '' + x['content'][0]['value'] + ''
+ if x['subType'] == 'story':
+ if x['data'] and x['data']['link'] and x['data']['link']['destination']:
+ if 'web' in x['data']['link']['destination']:
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
+ if any(b in x for b in ['value', 'content']):
+ return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
return ''
@@ -53,14 +64,16 @@ class Bloomberg(BasicNewsRecipe):
use_embedded_content = False
remove_attributes = ['style', 'height', 'width']
ignore_duplicate_articles = {'url', 'title'}
+ masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
+ description = 'Bloomberg delivers business and markets news, data, analysis, and video to the world, featuring stories from Businessweek and Bloomberg News.'
- # delay = 7 # seconds
- simultaneous_downloads = 3
+ simultaneous_downloads = 1
extra_css = '''
.auth {font-size:small; font-weight:bold;}
.time, .chart {font-size:small;}
- .subhead, blockquote {font-style:italic; color:#404040;}
+ .subhead {font-style:italic; color:#404040;}
+ i, .col {color:#202020;}
.cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
@@ -89,9 +102,9 @@ class Bloomberg(BasicNewsRecipe):
pt.close()
return pt.name
-
- def get_browser(self):
- br = browser()
+ def get_browser(self, *a, **kw):
+ kw['user_agent'] = 'common_words/based'
+ br = BasicNewsRecipe.get_browser(self, *a, **kw)
br.set_handle_redirect(False)
return br
@@ -103,7 +116,7 @@ class Bloomberg(BasicNewsRecipe):
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
('News',
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
- ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en')
+ ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en')
]
def preprocess_raw_html(self, raw, *a):
@@ -160,7 +173,7 @@ class Bloomberg(BasicNewsRecipe):
body = ''
body_data = data['body']['content']
for x in body_data:
- pause = random.choice((0.5, 1, 1.25))
+ pause = random.choice((0.25, 0.5, 0.75, 1))
time.sleep(pause)
body += get_contents(x)
return '' + cat + title + subhead + auth + lede + caption + '' + body + '
'
From 6d93f94e80fe22f7692cdced4e8498192d3b98ff Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Thu, 10 Aug 2023 09:59:34 +0530
Subject: [PATCH 3/3] Bloomberg recipes: add fallback return after strong/emphasis handling in get_contents
---
recipes/bloomberg-business-week.recipe | 1 +
recipes/bloomberg.recipe | 1 +
2 files changed, 2 insertions(+)
diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe
index 9c898a2f94..ddba5024d6 100644
--- a/recipes/bloomberg-business-week.recipe
+++ b/recipes/bloomberg-business-week.recipe
@@ -14,6 +14,7 @@ def get_contents(x):
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
if 'emphasis' in x['attributes']:
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
elif otype == 'br':
return '
'
diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe
index 014a30cd8c..f41000cab9 100644
--- a/recipes/bloomberg.recipe
+++ b/recipes/bloomberg.recipe
@@ -15,6 +15,7 @@ def get_contents(x):
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
if 'emphasis' in x['attributes']:
return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + ''
+ return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
elif otype == 'br':
return '
'