mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
3767147101
@ -1,45 +1,57 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
from calibre import browser
|
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
|
||||||
def get_contents(x):
|
def get_contents(x):
|
||||||
|
if x == '':
|
||||||
|
return ''
|
||||||
otype = x.get('type', '')
|
otype = x.get('type', '')
|
||||||
if otype == 'text':
|
if otype == 'text':
|
||||||
return x.get('value', '')
|
if 'attributes' in x:
|
||||||
|
if 'strong' in x['attributes']:
|
||||||
|
return '<b>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</b>'
|
||||||
|
if 'emphasis' in x['attributes']:
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
|
||||||
|
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
|
||||||
|
elif otype == 'br':
|
||||||
|
return '<br>'
|
||||||
elif otype == 'paragraph':
|
elif otype == 'paragraph':
|
||||||
return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</p>'
|
return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</p>'
|
||||||
elif otype == 'heading':
|
elif otype == 'heading':
|
||||||
return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</h3>'
|
return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</h3>'
|
||||||
elif otype == 'list':
|
elif otype == 'list':
|
||||||
return '<ul>' + ''.join(map(get_contents, x.get('content'))) + '</ul>'
|
return '<ul>' + ''.join(map(get_contents, x.get('content', ''))) + '</ul>'
|
||||||
elif otype == 'listItem':
|
elif otype == 'listItem':
|
||||||
return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</li>'
|
return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</li>'
|
||||||
elif otype == 'quote':
|
elif otype == 'quote':
|
||||||
return '<blockquote>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</blockquote>'
|
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
|
||||||
elif otype == 'media':
|
elif otype == 'media':
|
||||||
if x['subType'] == 'photo':
|
if x['subType'] == 'photo':
|
||||||
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
|
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
|
||||||
x['data']['photo']['src'], x['data']['photo']['caption'])
|
x['data']['photo']['src'], x['data']['photo']['caption'])
|
||||||
elif x['subType'] == 'chart':
|
elif x['subType'] == 'chart':
|
||||||
if x['data'] and x['data']['chart']:
|
if x['data'] and x['data']['chart']:
|
||||||
return '<div><img src="{}"></div>'.format(x['data']['chart']['fallback'])
|
return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
|
||||||
elif otype == 'link':
|
elif otype == 'link':
|
||||||
if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']:
|
if 'data' in x:
|
||||||
if 'href' in x['data']:
|
if 'href' in x['data']:
|
||||||
return '<a href="' + x['data']['href'] + '">' + x['content'][0]['value'] + '</a>'
|
return '<a href="' + x['data']['href'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
elif otype == 'entity':
|
elif otype == 'entity':
|
||||||
if x['content'] and x['content'][0] and x['content'][0]['value']:
|
if x['subType'] == 'story':
|
||||||
if x['subType'] == 'story':
|
if x['data'] and x['data']['link'] and x['data']['link']['destination']:
|
||||||
if x['data'] and x['data']['link'] and x['data']['link']['destination']:
|
if 'web' in x['data']['link']['destination']:
|
||||||
if 'web' in x['data']['link']['destination']:
|
return '<a href="' + x['data']['link']['destination']['web'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
|
||||||
return '<a href="' + x['data']['link']['destination']['web'] + '">' + x['content'][0]['value'] + '</a>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
elif x['subType'] in ('person', 'security'):
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
|
||||||
|
if any(b in x for b in ['value', 'content']):
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@ -53,26 +65,32 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
resolve_internal_links = True
|
resolve_internal_links = True
|
||||||
masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg'
|
masthead_url = 'https://assets.bwbx.io/s3/javelin/public/hub/images/BW-Logo-Black-cc9035fbb3.svg'
|
||||||
|
description = (
|
||||||
|
'Bloomberg Businessweek helps global leaders stay ahead with insights and in-depth analysis on the people,'
|
||||||
|
' companies, events, and trends shaping today\'s complex, global economy.'
|
||||||
|
)
|
||||||
|
|
||||||
# delay = 7 # seconds
|
simultaneous_downloads = 1
|
||||||
simultaneous_downloads = 3
|
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.auth {font-size:small; font-weight:bold;}
|
.auth {font-size:small; font-weight:bold;}
|
||||||
.time, .chart {font-size:small;}
|
.time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
|
||||||
.subhead {font-style:italic; color:#404040;}
|
.subhead {font-style:italic; color:#404040;}
|
||||||
|
i, .col {color:#202020;}
|
||||||
.cat {font-size:small; color:gray;}
|
.cat {font-size:small; color:gray;}
|
||||||
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
|
.news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
|
||||||
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
|
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
|
dict(name=['button', 'svg']),
|
||||||
dict(name='div', attrs={'id':['bb-that', 'bb-nav']}),
|
dict(name='div', attrs={'id':['bb-that', 'bb-nav']}),
|
||||||
classes('twitter-logo bb-global-footer')
|
classes('twitter-logo bb-global-footer __sticky__audio__bar__portal__ css--social-wrapper-outer')
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self, *a, **kw):
|
||||||
br = browser()
|
kw['user_agent'] = 'common_words/based'
|
||||||
|
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
return br
|
return br
|
||||||
|
|
||||||
@ -163,7 +181,7 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
body = ''
|
body = ''
|
||||||
body_data = data['body']['content']
|
body_data = data['body']['content']
|
||||||
for x in body_data:
|
for x in body_data:
|
||||||
pause = random.choice((0.5, 1, 1.25))
|
pause = random.choice((0.25, 0.5, 0.75, 1))
|
||||||
time.sleep(pause)
|
time.sleep(pause)
|
||||||
body += get_contents(x)
|
body += get_contents(x)
|
||||||
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
|
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre import browser
|
|
||||||
from html5_parser import parse
|
from html5_parser import parse
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
import json
|
import json
|
||||||
@ -7,19 +6,29 @@ import random
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
def get_contents(x):
|
def get_contents(x):
|
||||||
|
if x == '':
|
||||||
|
return ''
|
||||||
otype = x.get('type', '')
|
otype = x.get('type', '')
|
||||||
if otype == 'text':
|
if otype == 'text':
|
||||||
return x.get('value', '')
|
if 'attributes' in x:
|
||||||
|
if 'strong' in x['attributes']:
|
||||||
|
return '<b>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</b>'
|
||||||
|
if 'emphasis' in x['attributes']:
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
|
||||||
|
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
|
||||||
|
elif otype == 'br':
|
||||||
|
return '<br>'
|
||||||
elif otype == 'paragraph':
|
elif otype == 'paragraph':
|
||||||
return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</p>'
|
return '<p>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</p>'
|
||||||
elif otype == 'heading':
|
elif otype == 'heading':
|
||||||
return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</h3>'
|
return '<h3>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</h3>'
|
||||||
elif otype == 'list':
|
elif otype == 'list':
|
||||||
return '<ul>' + ''.join(map(get_contents, x.get('content'))) + '</ul>'
|
return '<ul>' + ''.join(map(get_contents, x.get('content', ''))) + '</ul>'
|
||||||
elif otype == 'listItem':
|
elif otype == 'listItem':
|
||||||
return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</li>'
|
return '<li>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</li>'
|
||||||
elif otype == 'quote':
|
elif otype == 'quote':
|
||||||
return '<blockquote>' + x.get('value', '') + ''.join(map(get_contents, x.get('content'))) + '</blockquote>'
|
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
|
||||||
elif otype == 'media':
|
elif otype == 'media':
|
||||||
if x['subType'] == 'photo':
|
if x['subType'] == 'photo':
|
||||||
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
|
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
|
||||||
@ -28,19 +37,22 @@ def get_contents(x):
|
|||||||
if x['data'] and x['data']['chart']:
|
if x['data'] and x['data']['chart']:
|
||||||
return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
|
return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
|
||||||
elif otype == 'link':
|
elif otype == 'link':
|
||||||
if x['data'] and x['content'] and x['content'][0] and x['content'][0]['value']:
|
if 'data' in x:
|
||||||
if 'href' in x['data']:
|
if 'href' in x['data']:
|
||||||
return '<a href="' + x['data']['href'] + '">' + x['content'][0]['value'] + '</a>'
|
return '<a href="' + x['data']['href'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
elif otype == 'entity':
|
elif otype == 'entity':
|
||||||
if x['content'] and x['content'][0] and x['content'][0]['value']:
|
if x['subType'] == 'story':
|
||||||
if x['subType'] == 'story':
|
if x['data'] and x['data']['link'] and x['data']['link']['destination']:
|
||||||
if x['data'] and x['data']['link'] and x['data']['link']['destination']:
|
if 'web' in x['data']['link']['destination']:
|
||||||
if 'web' in x['data']['link']['destination']:
|
return '<a href="' + x['data']['link']['destination']['web'] + '">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</a>'
|
||||||
return '<a href="' + x['data']['link']['destination']['web'] + '">' + x['content'][0]['value'] + '</a>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
elif x['subType'] in ('person', 'security'):
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
return '<i>' + x['content'][0]['value'] + '</i>'
|
elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
|
||||||
|
if any(b in x for b in ['value', 'content']):
|
||||||
|
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
|
||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@ -53,14 +65,16 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
use_embedded_content = False
|
use_embedded_content = False
|
||||||
remove_attributes = ['style', 'height', 'width']
|
remove_attributes = ['style', 'height', 'width']
|
||||||
ignore_duplicate_articles = {'url', 'title'}
|
ignore_duplicate_articles = {'url', 'title'}
|
||||||
|
masthead_url = 'https://assets.bbhub.io/company/sites/70/2022/09/logoBBGblck.svg'
|
||||||
|
description = 'Bloomberg delivers business and markets news, data, analysis, and video to the world, featuring stories from Businessweek and Bloomberg News.'
|
||||||
|
|
||||||
# delay = 7 # seconds
|
simultaneous_downloads = 1
|
||||||
simultaneous_downloads = 3
|
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.auth {font-size:small; font-weight:bold;}
|
.auth {font-size:small; font-weight:bold;}
|
||||||
.time, .chart {font-size:small;}
|
.time, .chart {font-size:small;}
|
||||||
.subhead, blockquote {font-style:italic; color:#404040;}
|
.subhead {font-style:italic; color:#404040;}
|
||||||
|
i, .col {color:#202020;}
|
||||||
.cat {font-size:small; color:gray;}
|
.cat {font-size:small; color:gray;}
|
||||||
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
|
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
|
||||||
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
|
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
|
||||||
@ -89,9 +103,9 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
pt.close()
|
pt.close()
|
||||||
return pt.name
|
return pt.name
|
||||||
|
|
||||||
|
def get_browser(self, *a, **kw):
|
||||||
def get_browser(self):
|
kw['user_agent'] = 'common_words/based'
|
||||||
br = browser()
|
br = BasicNewsRecipe.get_browser(self, *a, **kw)
|
||||||
br.set_handle_redirect(False)
|
br.set_handle_redirect(False)
|
||||||
return br
|
return br
|
||||||
|
|
||||||
@ -103,7 +117,7 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
|
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
|
||||||
('News',
|
('News',
|
||||||
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
|
'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
|
||||||
('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en')
|
('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.bloomberg.com&hl=en-US&gl=US&ceid=US:en')
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
@ -160,7 +174,7 @@ class Bloomberg(BasicNewsRecipe):
|
|||||||
body = ''
|
body = ''
|
||||||
body_data = data['body']['content']
|
body_data = data['body']['content']
|
||||||
for x in body_data:
|
for x in body_data:
|
||||||
pause = random.choice((0.5, 1, 1.25))
|
pause = random.choice((0.25, 0.5, 0.75, 1))
|
||||||
time.sleep(pause)
|
time.sleep(pause)
|
||||||
body += get_contents(x)
|
body += get_contents(x)
|
||||||
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
|
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user