From ee066587e55fe5df163d3dc494923fb011693654 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 30 Mar 2025 11:14:27 +0530
Subject: [PATCH] Update reuters.recipe

---
 recipes/reuters.recipe | 234 +++++++++++++++++++++--------------------
 1 file changed, 121 insertions(+), 113 deletions(-)

diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe
index 282b39e869..1da3208d1f 100644
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@@ -31,10 +31,12 @@ class Reuters(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
-    ignore_duplicate_articles = {'url', 'title'}
+    ignore_duplicate_articles = {'url'}
+    remove_empty_feeds = True
     extra_css = '''
         .label, .auth { font-size:small; color:#202020; }
+        .desc { font-style: italic; }
         .figc { font-size:small; }
         img {display:block; margin:0 auto;}
     '''

@@ -48,8 +50,8 @@ class Reuters(BasicNewsRecipe):
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
             'long': 'This is useful for non e-ink devices',
-            'default': '480'
-        }
+            'default': '480',
+        },
     }

     def __init__(self, *args, **kwargs):
@@ -61,58 +63,55 @@
     def parse_index(self):
         index = 'https://www.reuters.com'
         today = datetime.now()
-        feed_api = (
-            index
-            + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
+
+        sections = []
+
+        sec_api = json.loads(
+            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
         )
-        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
-        sections = [
-            'world',
-            'business',
-            'markets',
-            'sustainability',
-            'legal',
-            'breakingviews',
-            'technology',
-            # 'sports',
-            'science',
-            'lifestyle',
-        ]
+
+        for s in sec_api[0]['data']['hierarchy']['children']:
+            if s.get('type', '') == 'section':
+                sections.append((s['name'], s['id']))
+                sections.extend(
+                    (s['name'] + ' - ' + s2['name'], s2['id'])
+                    for s2 in s.get('children', [])
+                    if s2.get('type', '') == 'section'
+                )

         feeds = []

-        for sec in sections:
-            section = sec.capitalize()
-            self.log(section)
+        for sec, link in sections:
+            self.log(sec)

             articles = []

-            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
-                'wireitems'
-            ]
+            data = json.loads(
+                self.index_to_soup(
+                    index + '/mobile/v1' + link + '?outputType=json', raw=True
+                )
+            )

-            for x in data:
-                if x.get('wireitem_type', '') == 'story':
-                    for y in x['templates']:
-                        if y.get('type', '') == 'story':
-                            title = y['story']['hed']
-
-                            date = datetime.fromisoformat(
-                                y['story']['updated_at'][:-1]
-                            ) + timedelta(seconds=time.timezone)
-                            if (today - date) > timedelta(self.oldest_article):
-                                continue
-
-                            desc = y['story']['lede']
-                            path = y['template_action']
-                            if path.get('type', '') == 'article':
-                                url = path_api.format(path['api_path_native'])
-                                self.log('  ', title, '\n\t', desc)
-                                articles.append(
-                                    {'title': title, 'description': desc, 'url': url}
-                                )
+            for st in (
+                story
+                for x in data
+                if isinstance(x, dict)
+                for story in x.get('data', {}).get('stories', [])
+            ):
+                title = st['title']
+
+                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
+                    seconds=time.timezone
+                )
+                if (today - date) > timedelta(self.oldest_article):
+                    continue
+
+                desc = st['description']
+                url = index + st['url']
+                self.log('  ', title, '\n\t', desc, '\n\t', url)
+                articles.append({'title': title, 'description': desc, 'url': url})

             if articles:
-                feeds.append((section, articles))
+                feeds.append((sec, articles))
         return feeds

     def preprocess_raw_html(self, raw, url):
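Reviewer note (not part of the patch): parse_index no longer hardcodes the section list; it discovers sections from the mobile app's menu endpoint and then pulls each section's stories from its /mobile/v1 JSON feed. A rough sketch of the payload shapes the new code assumes follows; the key names are exactly the ones the code reads, while the structure around them and all values are invented placeholders:

    # Hypothetical sketch of the menu payload consumed by parse_index.
    # GET https://www.reuters.com/mobile/api/v1/menu/?outputType=json
    menu = [{
        'data': {
            'hierarchy': {
                'children': [
                    {
                        'type': 'section', 'name': 'World', 'id': '/world/',
                        'children': [
                            {'type': 'section', 'name': 'Africa', 'id': '/world/africa/'},
                        ],
                    },
                ],
            },
        },
    }]
    # Each (name, id) pair becomes a feed; its stories are then fetched from
    # GET https://www.reuters.com/mobile/v1/world/?outputType=json
    section_feed = [{
        'data': {
            'stories': [
                {
                    'title': 'Headline',
                    'description': 'One-line standfirst',
                    'url': '/world/example-story-2025-03-30/',
                    'display_time': '2025-03-30T05:44:27Z',
                },
            ],
        },
    }]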
@@ -120,75 +119,84 @@
         w = self.recipe_specific_options.get('res')
         if w and isinstance(w, str):
             res = '&width=' + w
-        js = json.loads(raw)
-        data = js['wireitems']
+
         body = ''
-        for x in data:
-            if x.get('wireitem_type', '') == 'story':
-                for y in x['templates']:
-                    if 'label' in y['cid']:
-                        body += '<div class="label">' + y['title'] + '</div>'
-                        break
-                for y in x['templates']:
-                    if 'title' in y['cid']:
-                        body += (
-                            '<h1 title="{}">'.format(js['share_url'])
-                            + y['content']
-                            + '</h1>'
-                        )
-                        break
-                for y in x['templates']:
-                    if 'author' in y['cid']:
-                        body += '<p>'
-                        auths = list(y.get('authors_names', []))
-                        if auths:
-                            body += (
-                                '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
-                            )
-                        break
-                for y in x['templates']:
-                    if 'datetime' in y['cid']:
-                        body += (
-                            '<div class="time">'
-                            + str(y['read_minutes'])
-                            + ' minute read | '
-                            + p_dt(y['display_time'])
-                            + '</div>'
-                        )
-                        body += '</p>'
-                        break
-                for y in x['templates']:
-                    if 'paragraph' in y['cid']:
-                        body += '<p>' + y['content'] + '</p>'
-                    if 'header' in y['cid']:
-                        body += '<h4>' + y['content'] + '</h4>'
-                    if 'image' in y['cid']:
-                        if 'renditions' in y['image']:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'].split('&')[0] + res,
-                                y['image']['caption'],
-                            )
-                        else:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'], y['image']['caption']
-                            )
-                    if 'gallery' in y['cid']:
-                        for imgs in y['images']:
-                            if 'renditions' in imgs:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'].split('&')[0] + res,
-                                    imgs['caption'],
-                                )
-                            else:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'], imgs['caption']
-                                )
-                    if 'video' in y['cid']:
-                        body += '<img src="{}"><div class="figc">{}</div>'.format(
-                            y['video']['thumbnail']['url'],
-                            y['video']['thumbnail']['caption'],
-                        )
+
+        for det in json.loads(raw):
+            if det.get('type', '') != 'article_detail':
+                continue
+            data = det['data']['article']
+            body += '<h1>' + data['title'] + '</h1>'
+            if data.get('description'):
+                body += '<p class="desc">' + data['description'] + '</p>'
+            if data.get('authors'):
+                body += (
+                    '<p class="auth">'
+                    + 'By '
+                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
+                    + '</p>'
+                )
+
+            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
+                th = data['thumbnail']
+                body += '<img src="{}"><div class="figc">{}</div>'.format(
+                    th['resizer_url'].split('&')[0] + res,
+                    th.get('caption', ''),
+                )
+
+            body += (
+                '<p class="time">'
+                + str(data['read_minutes'])
+                + ' minute read | '
+                + str(data['word_count'])
+                + ' words | '
+                + p_dt(
+                    data['updated_time']
+                    if data.get('updated_time')
+                    else data['display_time']
+                )
+                + '</p>'
+            )
+
+            if data.get('summary'):
+                body += (
+                    '<ul>'
+                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
+                    + '</ul>'
+                )
+
+            for y in data['content_elements']:
+                ty = y.get('type', '')
+                if ty == 'placeholder':
+                    continue
+
+                elif ty == 'paragraph':
+                    body += '<p>' + y['content'] + '</p>'
+                elif ty == 'header':
+                    body += '<h4>' + y['content'] + '</h4>'
+                elif ty == 'graphic':
+                    body += '<img src="{}"><div class="figc">{}</div>'.format(
+                        y['resizer_url'].split('&')[0] + res,
+                        y.get('description', ''),
+                    )
+                else:
+                    self.log('**', ty)
+
+            if data.get('sign_off'):
+                body += '<p class="auth">' + data['sign_off'] + '</p>'
+        return '<html><body><div>' + body + '</div></body></html>'

-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = (
+            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
+        )
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
+        return br
+
+    def print_version(self, url):
+        return (
+            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
+            + '?outputType=json'
+        )
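Reviewer note (not part of the patch): print_version remaps each article URL onto the /mobile/v1 JSON mirror, and preprocess_raw_html rebuilds the page from the 'article_detail' entry; the app user agent and neutral reuters-geo cookie set in get_browser appear to be what makes those endpoints respond. A rough sketch of the article payload the code walks; key names are the ones the code reads, everything else is an invented placeholder:

    # Hypothetical sketch of the payload behind
    # https://www.reuters.com/mobile/v1/world/example-story-2025-03-30/?outputType=json
    article = [{
        'type': 'article_detail',
        'data': {
            'article': {
                'title': 'Headline',
                'description': 'Standfirst rendered below the title',
                'authors': [{'byline': 'Jane Doe'}, {'byline': 'John Roe'}],
                'thumbnail': {
                    'type': 'image',
                    # split('&')[0] keeps everything up to the first '&', then the
                    # recipe appends its own &width=<res> parameter
                    'resizer_url': 'https://www.reuters.com/resizer/v2/ABC123?auth=token&width=640',
                    'caption': 'Lead image caption',
                },
                'read_minutes': 4,
                'word_count': 812,
                'display_time': '2025-03-30T05:44:27Z',
                'updated_time': '2025-03-30T06:10:00Z',
                'summary': [{'description': 'Key point'}],
                'content_elements': [
                    {'type': 'paragraph', 'content': 'Body text.'},
                    {'type': 'header', 'content': 'A subheading'},
                    {'type': 'graphic',
                     'resizer_url': 'https://www.reuters.com/resizer/v2/XYZ789?auth=token&width=640',
                     'description': 'Chart caption'},
                    {'type': 'placeholder'},  # e.g. an embed; skipped by the recipe
                ],
                'sign_off': 'Reporting by Jane Doe; Editing by John Roe',
            },
        },
    }]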