From e33efc1beadd3d3ee582a3dd9db47398fbfb73f5 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 10 Aug 2023 09:47:13 +0530 Subject: [PATCH 1/3] Update bloomberg-business-week.recipe --- recipes/bloomberg-business-week.recipe | 71 ++++++++++++++++---------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index bb3f7c3c37..9c898a2f94 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -1,45 +1,56 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes -from calibre import browser from html5_parser import parse import json import random import time def get_contents(x): + if x == '': + return '' otype = x.get('type', '') if otype == 'text': - return x.get('value', '') + if 'attributes' in x: + if 'strong' in x['attributes']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + if 'emphasis' in x['attributes']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + elif otype == 'br': + return '
' elif otype == 'paragraph': - return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

' + return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

' elif otype == 'heading': - return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

' + return '

' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

' elif otype == 'list': - return '' + return '' elif otype == 'listItem': - return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
  • ' + return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
  • ' elif otype == 'quote': - return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
    ' + return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
    ' elif otype == 'media': if x['subType'] == 'photo': return '
    {}
    '.format( x['data']['photo']['src'], x['data']['photo']['caption']) elif x['subType'] == 'chart': if x['data'] and x['data']['chart']: - return '
    '.format(x['data']['chart']['fallback']) + return '
    '.format(x['data']['chart']['fallback']) elif otype == 'link': - if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']: + if 'data' in x: if 'href' in x['data']: - return '' + x['content'][0]['value'] + '' - return '' + x['content'][0]['value'] + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' elif otype == 'entity': - if x['content'] and x['content'][0] and x['content'][0]['value']: - if x['subType'] == 'story': - if x['data'] and x['data']['link'] and x['data']['link']['destination']: - if 'web' in x['data']['link']['destination']: - return '' + x['content'][0]['value'] + '' - return '' + x['content'][0]['value'] + '' - elif x['subType'] in ('person', 'security'): - return '' + x['content'][0]['value'] + '' + if x['subType'] == 'story': + if x['data'] and x['data']['link'] and x['data']['link']['destination']: + if 'web' in x['data']['link']['destination']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): + if any(b in x for b in ['value', 'content']): + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' return '' @@ -53,26 +64,32 @@ class Bloomberg(BasicNewsRecipe): ignore_duplicate_articles = {'url'} resolve_internal_links = True masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg' + description = ( + 'Bloomberg Businessweek helps global leaders stay ahead with insights and in-depth analysis on the people,' + ' companies, events, and trends shaping today\'s complex, global economy.' + ) - # delay = 7 # seconds - simultaneous_downloads = 3 + simultaneous_downloads = 1 extra_css = ''' .auth {font-size:small; font-weight:bold;} - .time, .chart {font-size:small;} + .time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;} .subhead {font-style:italic; color:#404040;} + i, .col {color:#202020;} .cat {font-size:small; color:gray;} - .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;} + .news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;} .news-figure-credit {font-size:small; text-align:center; color:#202020;} ''' remove_tags = [ + dict(name=['button', 'svg']), dict(name='div', attrs={'id':['bb-that', 'bb-nav']}), - classes('twitter-logo bb-global-footer') + classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer') ] - def get_browser(self): - br = browser() + def get_browser(self, *a, **kw): + kw['user_agent'] = 'common_words/based' + br = BasicNewsRecipe.get_browser(self, *a, **kw) br.set_handle_redirect(False) return br @@ -163,7 +180,7 @@ class Bloomberg(BasicNewsRecipe): body = '' body_data = data['body']['content'] for x in body_data: - pause = random.choice((0.5, 1, 1.25)) + pause = random.choice((0.25, 0.5, 0.75, 1)) time.sleep(pause) body += get_contents(x) return '' + cat + title + subhead + auth + lede + caption + '
    ' + body + '
    ' From 0d825ec9bcd5d0effb994a88614e76809da50a7e Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 10 Aug 2023 09:54:26 +0530 Subject: [PATCH 2/3] Update bloomberg.recipe --- recipes/bloomberg.recipe | 65 ++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index 224fcac853..014a30cd8c 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre import browser from html5_parser import parse from calibre.ptempfile import PersistentTemporaryFile import json @@ -7,19 +6,28 @@ import random import time def get_contents(x): + if x == '': + return '' otype = x.get('type', '') if otype == 'text': - return x.get('value', '') + if 'attributes' in x: + if 'strong' in x['attributes']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + if 'emphasis' in x['attributes']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + elif otype == 'br': + return '
    ' elif otype == 'paragraph': - return '

    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

    ' + return '

    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

    ' elif otype == 'heading': - return '

    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '

    ' + return '

    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '

    ' elif otype == 'list': - return '' + return '' elif otype == 'listItem': - return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
  • ' + return '
  • ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
  • ' elif otype == 'quote': - return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '
    ' + return '
    ' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '
    ' elif otype == 'media': if x['subType'] == 'photo': return '
    {}
    '.format( @@ -28,19 +36,22 @@ def get_contents(x): if x['data'] and x['data']['chart']: return '
    '.format(x['data']['chart']['fallback']) elif otype == 'link': - if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']: + if 'data' in x: if 'href' in x['data']: - return '' + x['content'][0]['value'] + '' - return '' + x['content'][0]['value'] + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' elif otype == 'entity': - if x['content'] and x['content'][0] and x['content'][0]['value']: - if x['subType'] == 'story': - if x['data'] and x['data']['link'] and x['data']['link']['destination']: - if 'web' in x['data']['link']['destination']: - return '' + x['content'][0]['value'] + '' - return '' + x['content'][0]['value'] + '' - elif x['subType'] in ('person', 'security'): - return '' + x['content'][0]['value'] + '' + if x['subType'] == 'story': + if x['data'] and x['data']['link'] and x['data']['link']['destination']: + if 'web' in x['data']['link']['destination']: + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']): + if any(b in x for b in ['value', 'content']): + return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' return '' @@ -53,14 +64,16 @@ class Bloomberg(BasicNewsRecipe): use_embedded_content = False remove_attributes = ['style', 'height', 'width'] ignore_duplicate_articles = {'url', 'title'} + masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg' + description = 'Bloomberg delivers business and markets news, data, analysis, and video to the world, featuring stories from Businessweek and Bloomberg News.' - # delay = 7 # seconds - simultaneous_downloads = 3 + simultaneous_downloads = 1 extra_css = ''' .auth {font-size:small; font-weight:bold;} .time, .chart {font-size:small;} - .subhead, blockquote {font-style:italic; color:#404040;} + .subhead {font-style:italic; color:#404040;} + i, .col {color:#202020;} .cat {font-size:small; color:gray;} .news-figure-caption-text, .cap, .img {font-size:small; text-align:center;} .news-figure-credit {font-size:small; text-align:center; color:#202020;} @@ -89,9 +102,9 @@ class Bloomberg(BasicNewsRecipe): pt.close() return pt.name - - def get_browser(self): - br = browser() + def get_browser(self, *a, **kw): + kw['user_agent'] = 'common_words/based' + br = BasicNewsRecipe.get_browser(self, *a, **kw) br.set_handle_redirect(False) return br @@ -103,7 +116,7 @@ class Bloomberg(BasicNewsRecipe): 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'), ('News', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'), - ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en') + ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en') ] def preprocess_raw_html(self, raw, *a): @@ -160,7 +173,7 @@ class Bloomberg(BasicNewsRecipe): body = '' body_data = data['body']['content'] for x in body_data: - pause = random.choice((0.5, 1, 1.25)) + pause = random.choice((0.25, 0.5, 0.75, 1)) time.sleep(pause) body += get_contents(x) return '' + cat + title + subhead + auth + lede + caption + '
    ' + body + '
    ' From 6d93f94e80fe22f7692cdced4e8498192d3b98ff Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Thu, 10 Aug 2023 09:59:34 +0530 Subject: [PATCH 3/3] ... --- recipes/bloomberg-business-week.recipe | 1 + recipes/bloomberg.recipe | 1 + 2 files changed, 2 insertions(+) diff --git a/recipes/bloomberg-business-week.recipe b/recipes/bloomberg-business-week.recipe index 9c898a2f94..ddba5024d6 100644 --- a/recipes/bloomberg-business-week.recipe +++ b/recipes/bloomberg-business-week.recipe @@ -14,6 +14,7 @@ def get_contents(x): return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' if 'emphasis' in x['attributes']: return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) elif otype == 'br': return '
    ' diff --git a/recipes/bloomberg.recipe b/recipes/bloomberg.recipe index 014a30cd8c..f41000cab9 100644 --- a/recipes/bloomberg.recipe +++ b/recipes/bloomberg.recipe @@ -15,6 +15,7 @@ def get_contents(x): return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' if 'emphasis' in x['attributes']: return '' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '' + return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) return x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) elif otype == 'br': return '
    '