commit 8eae5df87d
Author: Kovid Goyal
Date:   2025-03-30 11:16:48 +05:30
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -31,10 +31,12 @@ class Reuters(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
-    ignore_duplicate_articles = {'url', 'title'}
+    ignore_duplicate_articles = {'url'}
+    remove_empty_feeds = True
     extra_css = '''
         .label, .auth { font-size:small; color:#202020; }
+        .desc { font-style: italic; }
         .figc { font-size:small; }
         img {display:block; margin:0 auto;}
     '''
@@ -48,8 +50,8 @@ class Reuters(BasicNewsRecipe):
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
             'long': 'This is useful for non e-ink devices',
-            'default': '480'
-        }
+            'default': '480',
+        },
     }

     def __init__(self, *args, **kwargs):
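The hunk above only adds trailing commas to the 'res' option definition. For context, that option is consumed in preprocess_raw_html further down, where everything after the first '&' in a Reuters resizer URL is dropped and the requested width is appended. A minimal sketch of that rewrite, with an invented sample URL:

    # Sketch only: the URL below is made up; the split/append logic mirrors the recipe.
    resizer_url = 'https://www.reuters.com/resizer/v2/ABC.jpg?auth=xyz&width=640'
    res = '&width=' + '1080'  # built from recipe_specific_options['res']
    print(resizer_url.split('&')[0] + res)
    # https://www.reuters.com/resizer/v2/ABC.jpg?auth=xyz&width=1080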
@@ -61,58 +63,55 @@ class Reuters(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://www.reuters.com'
         today = datetime.now()
-        feed_api = (
-            index
-            + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
-        )
-        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
-        sections = [
-            'world',
-            'business',
-            'markets',
-            'sustainability',
-            'legal',
-            'breakingviews',
-            'technology',
-            # 'sports',
-            'science',
-            'lifestyle',
-        ]
+
+        sections = []
+
+        sec_api = json.loads(
+            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
+        )
+
+        for s in sec_api[0]['data']['hierarchy']['children']:
+            if s.get('type', '') == 'section':
+                sections.append((s['name'], s['id']))
+                sections.extend(
+                    (s['name'] + ' - ' + s2['name'], s2['id'])
+                    for s2 in s.get('children', [])
+                    if s2.get('type', '') == 'section'
+                )

         feeds = []

-        for sec in sections:
-            section = sec.capitalize()
-            self.log(section)
+        for sec, link in sections:
+            self.log(sec)

             articles = []

-            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
-                'wireitems'
-            ]
+            data = json.loads(
+                self.index_to_soup(
+                    index + '/mobile/v1' + link + '?outputType=json', raw=True
+                )
+            )

-            for x in data:
-                if x.get('wireitem_type', '') == 'story':
-                    for y in x['templates']:
-                        if y.get('type', '') == 'story':
-                            title = y['story']['hed']
-
-                            date = datetime.fromisoformat(
-                                y['story']['updated_at'][:-1]
-                            ) + timedelta(seconds=time.timezone)
-                            if (today - date) > timedelta(self.oldest_article):
-                                continue
-
-                            desc = y['story']['lede']
-                            path = y['template_action']
-                            if path.get('type', '') == 'article':
-                                url = path_api.format(path['api_path_native'])
-                            self.log(' ', title, '\n\t', desc)
-                            articles.append(
-                                {'title': title, 'description': desc, 'url': url}
-                            )
+            for st in (
+                story
+                for x in data
+                if isinstance(x, dict)
+                for story in x.get('data', {}).get('stories', [])
+            ):
+                title = st['title']
+                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
+                    seconds=time.timezone
+                )
+                if (today - date) > timedelta(self.oldest_article):
+                    continue
+                desc = st['description']
+                url = index + st['url']
+                self.log(' ', title, '\n\t', desc, '\n\t', url)
+                articles.append({'title': title, 'description': desc, 'url': url})
             if articles:
-                feeds.append((section, articles))
+                feeds.append((sec, articles))
         return feeds

     def preprocess_raw_html(self, raw, url):
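For reference, a minimal sketch of the menu payload shape the rewritten parse_index assumes, with the flattening walk lifted from the diff above; the section names and ids here are invented:

    # Sketch only: field names mirror the recipe, sample values are made up.
    menu = [{
        'data': {
            'hierarchy': {
                'children': [
                    {
                        'type': 'section', 'name': 'World', 'id': '/world/',
                        'children': [
                            {'type': 'section', 'name': 'Africa', 'id': '/world/africa/'},
                            {'type': 'interstitial'},  # non-section nodes are skipped
                        ],
                    },
                ]
            }
        }
    }]

    sections = []
    for s in menu[0]['data']['hierarchy']['children']:
        if s.get('type', '') == 'section':
            sections.append((s['name'], s['id']))
            sections.extend(
                (s['name'] + ' - ' + s2['name'], s2['id'])
                for s2 in s.get('children', [])
                if s2.get('type', '') == 'section'
            )

    print(sections)  # [('World', '/world/'), ('World - Africa', '/world/africa/')]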
@@ -120,75 +119,84 @@ class Reuters(BasicNewsRecipe):
         w = self.recipe_specific_options.get('res')
         if w and isinstance(w, str):
             res = '&width=' + w
-        js = json.loads(raw)
-        data = js['wireitems']
         body = ''
-        for x in data:
-            if x.get('wireitem_type', '') == 'story':
-                for y in x['templates']:
-                    if 'label' in y['cid']:
-                        body += '<div class="label">' + y['title'] + '</div>'
-                        break
-                for y in x['templates']:
-                    if 'title' in y['cid']:
-                        body += (
-                            '<h1 title="{}">'.format(js['share_url'])
-                            + y['content']
-                            + '</h1>'
-                        )
-                        break
-                for y in x['templates']:
-                    if 'author' in y['cid']:
-                        body += '<p>'
-                        auths = list(y.get('authors_names', []))
-                        if auths:
-                            body += (
-                                '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
-                            )
-                        break
-                for y in x['templates']:
-                    if 'datetime' in y['cid']:
-                        body += (
-                            '<div class="auth">'
-                            + str(y['read_minutes'])
-                            + ' minute read | '
-                            + p_dt(y['display_time'])
-                            + '</div>'
-                        )
-                        body += '</p>'
-                        break
-                for y in x['templates']:
-                    if 'paragraph' in y['cid']:
-                        body += '<p>' + y['content'] + '</p>'
-                    if 'header' in y['cid']:
-                        body += '<h4>' + y['content'] + '</h4>'
-                    if 'image' in y['cid']:
-                        if 'renditions' in y['image']:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'].split('&')[0] + res,
-                                y['image']['caption'],
-                            )
-                        else:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'], y['image']['caption']
-                            )
-                    if 'gallery' in y['cid']:
-                        for imgs in y['images']:
-                            if 'renditions' in imgs:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'].split('&')[0] + res,
-                                    imgs['caption'],
-                                )
-                            else:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'], imgs['caption']
-                                )
-                    if 'video' in y['cid']:
-                        body += '<img src="{}"><div class="figc">{}</div>'.format(
-                            y['video']['thumbnail']['url'],
-                            y['video']['thumbnail']['caption'],
-                        )
+
+        for det in json.loads(raw):
+            if not det.get('type', '') == 'article_detail':
+                continue
+            data = det['data']['article']
+            body += '<h1>' + data['title'] + '</h1>'
+            if data.get('description'):
+                body += '<p class="desc">' + data['description'] + '</p>'
+            if data.get('authors'):
+                body += (
+                    '<p class="auth">'
+                    + 'By '
+                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
+                    + '</p>'
+                )
+
+            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
+                th = data['thumbnail']
+                body += '<img src="{}"><div class="figc">{}</div>'.format(
+                    th['resizer_url'].split('&')[0] + res,
+                    th.get('caption', ''),
+                )
+
+            body += (
+                '<p class="auth">'
+                + str(data['read_minutes'])
+                + ' minute read | '
+                + str(data['word_count'])
+                + ' words | '
+                + p_dt(
+                    data['updated_time']
+                    if data.get('updated_time')
+                    else data['display_time']
+                )
+                + '</p>'
+            )
+
+            if data.get('summary'):
+                body += (
+                    '<blockquote>'
+                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
+                    + '</blockquote>'
+                )
+
+            for y in data['content_elements']:
+                ty = y.get('type', '')
+                if ty == 'placeholder':
+                    continue
+                elif ty == 'paragraph':
+                    body += '<p>' + y['content'] + '</p>'
+                elif ty == 'header':
+                    body += '<h4>' + y['content'] + '</h4>'
+                elif ty == 'graphic':
+                    body += '<img src="{}"><div class="figc">{}</div>'.format(
+                        y['resizer_url'].split('&')[0] + res,
+                        y.get('description', ''),
+                    )
+                else:
+                    self.log('**', ty)
+
+            if data.get('sign_off'):
+                body += '<p class="auth">' + data['sign_off'] + '</p>'
+
         return '<html><body><div>' + body + '</div></body></html>'

-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = (
+            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
+        )
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}')]
+        return br
+
+    def print_version(self, url):
+        return (
+            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
+            + '?outputType=json'
+        )
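To make the new data flow concrete: print_version maps each story URL to the mobile JSON endpoint (for example, https://www.reuters.com/world/some-story-2025-03-30/ becomes https://www.reuters.com/mobile/v1/world/some-story-2025-03-30/?outputType=json), and preprocess_raw_html keeps only the 'article_detail' blocks from that response. A hedged sketch of the assumed payload shape, with key names mirroring the code above and every value invented:

    # Sketch only: keys match what preprocess_raw_html reads; values are made up.
    payload = [
        {'type': 'navigation'},  # non-article blocks are skipped
        {
            'type': 'article_detail',
            'data': {
                'article': {
                    'title': 'Sample headline',
                    'description': 'Sample standfirst',
                    'authors': [{'byline': 'Jane Doe'}],
                    'read_minutes': 3,
                    'word_count': 612,
                    'display_time': '2025-03-30T05:46:48Z',
                    'content_elements': [
                        {'type': 'paragraph', 'content': 'Body text.'},
                    ],
                }
            },
        },
    ]

    for det in payload:
        if det.get('type', '') == 'article_detail':
            print(det['data']['article']['title'])  # Sample headline

Fields such as 'thumbnail', 'summary', 'updated_time', and 'sign_off' are read with .get() in the recipe, so they can be absent from a given article.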