mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
8eae5df87d
@ -31,10 +31,12 @@ class Reuters(BasicNewsRecipe):
|
|||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_attributes = ['style', 'height', 'width']
|
remove_attributes = ['style', 'height', 'width']
|
||||||
resolve_internal_links = True
|
resolve_internal_links = True
|
||||||
ignore_duplicate_articles = {'url', 'title'}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
remove_empty_feeds = True
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
.label, .auth { font-size:small; color:#202020; }
|
.label, .auth { font-size:small; color:#202020; }
|
||||||
|
.desc { font-style: italic; }
|
||||||
.figc { font-size:small; }
|
.figc { font-size:small; }
|
||||||
img {display:block; margin:0 auto;}
|
img {display:block; margin:0 auto;}
|
||||||
'''
|
'''
|
||||||
@ -48,8 +50,8 @@ class Reuters(BasicNewsRecipe):
|
|||||||
'res': {
|
'res': {
|
||||||
'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
|
'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
|
||||||
'long': 'This is useful for non e-ink devices',
|
'long': 'This is useful for non e-ink devices',
|
||||||
'default': '480'
|
'default': '480',
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
@ -61,58 +63,55 @@ class Reuters(BasicNewsRecipe):
|
|||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
index = 'https://www.reuters.com'
|
index = 'https://www.reuters.com'
|
||||||
today = datetime.now()
|
today = datetime.now()
|
||||||
feed_api = (
|
|
||||||
index
|
sections = []
|
||||||
+ '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
|
|
||||||
|
sec_api = json.loads(
|
||||||
|
self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
for s in sec_api[0]['data']['hierarchy']['children']:
|
||||||
|
if s.get('type', '') == 'section':
|
||||||
|
sections.append((s['name'], s['id']))
|
||||||
|
sections.extend(
|
||||||
|
(s['name'] + ' - ' + s2['name'], s2['id'])
|
||||||
|
for s2 in s.get('children', [])
|
||||||
|
if s2.get('type', '') == 'section'
|
||||||
)
|
)
|
||||||
path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
|
|
||||||
sections = [
|
|
||||||
'world',
|
|
||||||
'business',
|
|
||||||
'markets',
|
|
||||||
'sustainability',
|
|
||||||
'legal',
|
|
||||||
'breakingviews',
|
|
||||||
'technology',
|
|
||||||
# 'sports',
|
|
||||||
'science',
|
|
||||||
'lifestyle',
|
|
||||||
]
|
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
|
|
||||||
for sec in sections:
|
for sec, link in sections:
|
||||||
section = sec.capitalize()
|
self.log(sec)
|
||||||
self.log(section)
|
|
||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
|
|
||||||
data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
|
data = json.loads(
|
||||||
'wireitems'
|
self.index_to_soup(
|
||||||
]
|
index + '/mobile/v1' + link + '?outputType=json', raw=True
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
for x in data:
|
for st in (
|
||||||
if x.get('wireitem_type', '') == 'story':
|
story
|
||||||
for y in x['templates']:
|
for x in data
|
||||||
if y.get('type', '') == 'story':
|
if isinstance(x, dict)
|
||||||
title = y['story']['hed']
|
for story in x.get('data', {}).get('stories', [])
|
||||||
|
):
|
||||||
|
title = st['title']
|
||||||
|
|
||||||
date = datetime.fromisoformat(
|
date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
|
||||||
y['story']['updated_at'][:-1]
|
seconds=time.timezone
|
||||||
) + timedelta(seconds=time.timezone)
|
)
|
||||||
if (today - date) > timedelta(self.oldest_article):
|
if (today - date) > timedelta(self.oldest_article):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
desc = y['story']['lede']
|
desc = st['description']
|
||||||
path = y['template_action']
|
url = index + st['url']
|
||||||
if path.get('type', '') == 'article':
|
self.log(' ', title, '\n\t', desc, '\n\t', url)
|
||||||
url = path_api.format(path['api_path_native'])
|
articles.append({'title': title, 'description': desc, 'url': url})
|
||||||
self.log(' ', title, '\n\t', desc)
|
|
||||||
articles.append(
|
|
||||||
{'title': title, 'description': desc, 'url': url}
|
|
||||||
)
|
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((section, articles))
|
feeds.append((sec, articles))
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw, url):
|
def preprocess_raw_html(self, raw, url):
|
||||||
@ -120,75 +119,84 @@ class Reuters(BasicNewsRecipe):
|
|||||||
w = self.recipe_specific_options.get('res')
|
w = self.recipe_specific_options.get('res')
|
||||||
if w and isinstance(w, str):
|
if w and isinstance(w, str):
|
||||||
res = '&width=' + w
|
res = '&width=' + w
|
||||||
js = json.loads(raw)
|
|
||||||
data = js['wireitems']
|
|
||||||
body = ''
|
body = ''
|
||||||
for x in data:
|
|
||||||
if x.get('wireitem_type', '') == 'story':
|
for det in json.loads(raw):
|
||||||
for y in x['templates']:
|
if not det.get('type', '') == 'article_detail':
|
||||||
if 'label' in y['cid']:
|
continue
|
||||||
body += '<div class="label">' + y['title'] + '</div>'
|
data = det['data']['article']
|
||||||
break
|
body += '<h1>' + data['title'] + '</h1>'
|
||||||
for y in x['templates']:
|
if data.get('description'):
|
||||||
if 'title' in y['cid']:
|
body += '<p class="desc">' + data['description'] + '</p>'
|
||||||
|
if data.get('authors'):
|
||||||
body += (
|
body += (
|
||||||
'<h1 title="{}">'.format(js['share_url'])
|
'<p class="auth">'
|
||||||
+ y['content']
|
+ 'By '
|
||||||
+ '</h1>'
|
+ ', '.join(at.get('byline', '') for at in data.get('authors', []))
|
||||||
|
+ '</p>'
|
||||||
)
|
)
|
||||||
break
|
|
||||||
for y in x['templates']:
|
if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
|
||||||
if 'author' in y['cid']:
|
th = data['thumbnail']
|
||||||
body += '<p>'
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
auths = list(y.get('authors_names', []))
|
th['resizer_url'].split('&')[0] + res,
|
||||||
if auths:
|
th.get('caption', ''),
|
||||||
body += (
|
|
||||||
'<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
|
|
||||||
)
|
)
|
||||||
break
|
|
||||||
for y in x['templates']:
|
|
||||||
if 'datetime' in y['cid']:
|
|
||||||
body += (
|
body += (
|
||||||
'<div class="auth">'
|
'<p class="auth">'
|
||||||
+ str(y['read_minutes'])
|
+ str(data['read_minutes'])
|
||||||
+ ' minute read | '
|
+ ' minute read | '
|
||||||
+ p_dt(y['display_time'])
|
+ str(data['word_count'])
|
||||||
+ '</div>'
|
+ ' words | '
|
||||||
|
+ p_dt(
|
||||||
|
data['updated_time']
|
||||||
|
if data.get('updated_time')
|
||||||
|
else data['display_time']
|
||||||
)
|
)
|
||||||
body += '</p>'
|
+ '</p>'
|
||||||
break
|
)
|
||||||
for y in x['templates']:
|
|
||||||
if 'paragraph' in y['cid']:
|
if data.get('summary'):
|
||||||
|
(
|
||||||
|
'<blockquote>'
|
||||||
|
+ ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
|
||||||
|
+ '</blockquote>'
|
||||||
|
)
|
||||||
|
|
||||||
|
for y in data['content_elements']:
|
||||||
|
ty = y.get('type', '')
|
||||||
|
if ty == 'placeholder':
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif ty == 'paragraph':
|
||||||
body += '<p>' + y['content'] + '</p>'
|
body += '<p>' + y['content'] + '</p>'
|
||||||
if 'header' in y['cid']:
|
elif ty == 'header':
|
||||||
body += '<h4>' + y['content'] + '</h4>'
|
body += '<h4>' + y['content'] + '</h4>'
|
||||||
if 'image' in y['cid']:
|
elif ty == 'graphic':
|
||||||
if 'renditions' in y['image']:
|
|
||||||
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
||||||
y['image']['url'].split('&')[0] + res,
|
y['resizer_url'].split('&')[0] + res,
|
||||||
y['image']['caption'],
|
y.get('description', ''),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
self.log('**', ty)
|
||||||
y['image']['url'], y['image']['caption']
|
|
||||||
)
|
if data.get('sign_off'):
|
||||||
if 'gallery' in y['cid']:
|
body += '<p class="auth">' + data['sign_off'] + '</p>'
|
||||||
for imgs in y['images']:
|
|
||||||
if 'renditions' in imgs:
|
|
||||||
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
|
||||||
imgs['url'].split('&')[0] + res,
|
|
||||||
imgs['caption'],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
|
||||||
imgs['url'], imgs['caption']
|
|
||||||
)
|
|
||||||
if 'video' in y['cid']:
|
|
||||||
body += '<img src="{}"><div class="figc">{}</div>'.format(
|
|
||||||
y['video']['thumbnail']['url'],
|
|
||||||
y['video']['thumbnail']['caption'],
|
|
||||||
)
|
|
||||||
return '<html><body><div>' + body + '</div></body></html>'
|
return '<html><body><div>' + body + '</div></body></html>'
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def get_browser(self, *args, **kwargs):
|
||||||
article.url = soup.find('h1')['title']
|
kwargs['user_agent'] = (
|
||||||
|
'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
|
||||||
|
)
|
||||||
|
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
|
||||||
|
br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
|
||||||
|
return br
|
||||||
|
|
||||||
|
def print_version(self, url):
|
||||||
|
return (
|
||||||
|
url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
|
||||||
|
+ '?outputType=json'
|
||||||
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user