#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta

from calibre.web.feeds.news import BasicNewsRecipe

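# Parse an ISO-8601 timestamp from the article JSON (dropping the trailing
# character, normally 'Z'), adjust it by the local UTC offset and format it
# for the byline, e.g. 'Jan 02, 2025, 03:04 PM'.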
def p_dt(x):
    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
    return dt.strftime('%b %d, %Y, %I:%M %p')

class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = True

    extra_css = '''
        .label, .auth { font-size:small; color:#202020; }
        .desc { font-style: italic; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    '''

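    # User-tweakable options that calibre surfaces for this recipe; the values
    # arrive as strings via self.recipe_specific_options and are read in
    # __init__(), parse_index() and preprocess_raw_html() below.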
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5 gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480',
        },
        'spr': {
            'short': 'Include Sports sections?',
            'long': 'Yes/No',
            'default': 'No',
        },
    }

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

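    # Build the feed list from Reuters' mobile JSON endpoints: the menu API
    # supplies the section hierarchy, then each section is fetched for its
    # current stories.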
    def parse_index(self):
        index = 'https://www.reuters.com'
        today = datetime.now()

        sections = []

        sec_api = json.loads(
            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
        )

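        # Collect top-level sections and their sub-sections as
        # (display name, section id) pairs.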
        for s in sec_api[0]['data']['hierarchy']['children']:
            if s.get('type', '') == 'section':
                sections.append((s['name'], s['id']))
                sections.extend(
                    (s['name'] + ' - ' + s2['name'], s2['id'])
                    for s2 in s.get('children', [])
                    if s2.get('type', '') == 'section'
                )

        feeds = []

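        # Fetch each section's story list; Sports sections are skipped unless
        # the 'spr' option is set to 'yes'.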
        for sec, link in sections:
            sp = self.recipe_specific_options.get('spr')
            if sp and isinstance(sp, str):
                if sp.lower().strip() != 'yes':
                    if sec.lower().startswith('sport'):
                        continue

            self.log(sec)

            articles = []

            data = json.loads(
                self.index_to_soup(
                    index + '/mobile/v1' + link + '?outputType=json', raw=True
                )
            )

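            # The section payload is a list of blocks; pull every story out of
            # each block's 'data'/'stories' list and drop anything older than
            # oldest_article.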
            for st in (
                story
                for x in data
                if isinstance(x, dict)
                for story in x.get('data', {}).get('stories', [])
            ):
                title = st['title']

                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
                    seconds=time.timezone
                )
                if (today - date) > timedelta(self.oldest_article):
                    continue

                desc = st['description']
                url = index + st['url']
                self.log(' ', title, '\n\t', desc, '\n\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((sec, articles))
        return feeds

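    # print_version() below points every article at the mobile JSON endpoint,
    # so 'raw' here is JSON rather than HTML; render it into simple HTML.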
    def preprocess_raw_html(self, raw, url):
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w

        body = ''

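        # Only the 'article_detail' block of the payload carries the story.
        # Emit the headline, description, byline and lead image first, then
        # the body content elements.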
        for det in json.loads(raw):
            if not det.get('type', '') == 'article_detail':
                continue
            data = det['data']['article']
            body += '<h1>' + data['title'] + '</h1>'
            if data.get('description'):
                body += '<p class="desc">' + data['description'] + '</p>'
            if data.get('authors'):
                body += (
                    '<p class="auth">'
                    + 'By '
                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
                    + '</p>'
                )

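            # Lead image: keep the resizer URL up to its first '&' and append
            # the requested width.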
            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
                th = data['thumbnail']
                body += '<img src="{}"><div class="figc">{}</div>'.format(
                    th['resizer_url'].split('&')[0] + res,
                    th.get('caption', ''),
                )

            body += (
                '<p class="auth">'
                + str(data.get('read_minutes', '_'))
                + ' minute read | '
                + str(data['word_count'])
                + ' words | '
                + p_dt(
                    data['updated_time']
                    if data.get('updated_time')
                    else data['display_time']
                )
                + '</p>'
            )

            if data.get('summary'):
                body += (
                    '<blockquote>'
                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
                    + '</blockquote>'
                )

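            # Body content: paragraphs, sub-headers and graphics are rendered;
            # placeholders are skipped and any other element type is logged.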
            for y in data['content_elements']:
                ty = y.get('type', '')
                if ty == 'placeholder':
                    continue

                elif ty == 'paragraph':
                    body += '<p>' + y['content'] + '</p>'
                elif ty == 'header':
                    body += '<h4>' + y['content'] + '</h4>'
                elif ty == 'graphic':
                    body += '<img src="{}"><div class="figc">{}</div>'.format(
                        y['resizer_url'].split('&')[0] + res,
                        y.get('description', ''),
                    )
                else:
                    self.log('**', ty)

            if data.get('sign_off'):
                body += '<p class="auth">' + data['sign_off'] + '</p>'

        return '<html><body><div>' + body + '</div></body></html>'

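    # Identify as the Reuters mobile app and send a neutral geo cookie; the
    # mobile JSON endpoints appear to expect both.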
    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
        return br

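    # Rewrite article URLs to the mobile JSON endpoint that
    # preprocess_raw_html() consumes.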
    def print_version(self, url):
        return (
            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
            + '?outputType=json'
        )