#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta, timezone

from calibre.web.feeds.news import BasicNewsRecipe


def _parse_utc(x):
    """Parse an API timestamp string into a timezone-aware UTC datetime.

    The last character of *x* is dropped before parsing.
    NOTE(review): presumably the API sends ISO-8601 timestamps with a
    trailing 'Z' (i.e. UTC) -- confirm against a live response.
    """
    return datetime.fromisoformat(x[:-1]).replace(tzinfo=timezone.utc)


def p_dt(x):
    """Return timestamp *x* formatted in the local timezone.

    The result looks like 'Jan 05, 2025, 03:04 PM'. The conversion uses an
    aware datetime and ``astimezone()`` rather than adding ``time.timezone``:
    that constant is the offset in seconds *west* of UTC and ignores DST, so
    adding it to a UTC time shifts it in the wrong direction.
    """
    return _parse_utc(x).astimezone().strftime('%b %d, %Y, %I:%M %p')


class Reuters(BasicNewsRecipe):
    """Recipe that builds Reuters sections and articles from the mobile JSON API."""

    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = True

    # The class names below are the ones emitted by preprocess_raw_html().
    extra_css = '''
        .label, .auth { font-size:small; color:#202020; }
        .desc { font-style: italic; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    '''

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480',
        },
        'spr': {
            'short': 'Include Sports sections?',
            'long': 'Yes/No',
            'default': 'No',
        },
    }

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply the 'days' option when it is a string."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # Only a non-empty string overrides the class default; the class-level
        # declaration above maps 'days' to a dict, which is skipped here.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    def parse_index(self):
        """Return ``[(section_title, [article_dict, ...]), ...]`` for the recipe.

        Sections come from the menu endpoint (top-level sections plus their
        direct child sections). Each section's stories are filtered by age
        against ``self.oldest_article`` (in days).
        """
        index = 'https://www.reuters.com'
        # Compare aware UTC datetimes so the age filter does not depend on
        # the local UTC offset or DST.
        now = datetime.now(timezone.utc)
        max_age = timedelta(self.oldest_article)

        # Loop-invariant: decide once whether sports sections are skipped.
        # NOTE(review): when the option is not supplied as a string, sports
        # sections are kept even though the declared default is 'No' --
        # confirm this is intended.
        sp = self.recipe_specific_options.get('spr')
        skip_sports = (
            bool(sp) and isinstance(sp, str) and sp.lower().strip() != 'yes'
        )

        sec_api = json.loads(
            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
        )
        sections = []
        for s in sec_api[0]['data']['hierarchy']['children']:
            if s.get('type', '') != 'section':
                continue
            sections.append((s['name'], s['id']))
            sections.extend(
                (s['name'] + ' - ' + s2['name'], s2['id'])
                for s2 in s.get('children', [])
                if s2.get('type', '') == 'section'
            )

        feeds = []
        for sec, link in sections:
            if skip_sports and sec.lower().startswith('sport'):
                continue
            self.log(sec)

            data = json.loads(
                self.index_to_soup(
                    index + '/mobile/v1' + link + '?outputType=json', raw=True
                )
            )
            articles = []
            for x in data:
                # Non-dict entries in the response carry no stories.
                if not isinstance(x, dict):
                    continue
                for st in x.get('data', {}).get('stories', []):
                    title = st['title']
                    if (now - _parse_utc(st['display_time'])) > max_age:
                        continue
                    desc = st['description']
                    url = index + st['url']
                    self.log(' ', title, '\n\t', desc, '\n\t', url)
                    articles.append(
                        {'title': title, 'description': desc, 'url': url}
                    )
            if articles:
                feeds.append((sec, articles))
        return feeds

    def preprocess_raw_html(self, raw, url):
        """Convert the article JSON in *raw* into an HTML document string.

        Only entries whose ``type`` is ``'article_detail'`` contribute output.
        Image URLs are rebuilt from ``resizer_url`` with the width taken from
        the 'res' option (480 when the option is not a non-empty string).

        NOTE(review): the markup literals in this method are a reconstruction;
        the class names match ``extra_css``, but the exact tags (e.g. ``<ul>``
        for the summary, ``<h4>`` for headers) should be confirmed.
        """
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w

        img_tmpl = '<img src="{}"><div class="figc">{}</div>'
        parts = []
        for det in json.loads(raw):
            if det.get('type', '') != 'article_detail':
                continue
            data = det['data']['article']

            parts.append('<h1>' + data['title'] + '</h1>')
            if data.get('description'):
                parts.append('<p class="desc">' + data['description'] + '</p>')
            if data.get('authors'):
                parts.append(
                    '<p class="auth">'
                    + 'By '
                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
                    + '</p>'
                )

            th = data.get('thumbnail')
            if th and th.get('type', '') == 'image':
                # Keep only the part of the URL before the first '&' and
                # append the requested width.
                parts.append(
                    img_tmpl.format(
                        th['resizer_url'].split('&')[0] + res,
                        th.get('caption', ''),
                    )
                )

            parts.append(
                '<p class="label">'
                + str(data.get('read_minutes', '_'))
                + ' minute read | '
                + str(data['word_count'])
                + ' words | '
                # Prefer the update time; fall back to the display time.
                + p_dt(data.get('updated_time') or data['display_time'])
                + '</p>'
            )

            if data.get('summary'):
                parts.append(
                    '<ul>'
                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
                    + '</ul>'
                )

            for y in data['content_elements']:
                ty = y.get('type', '')
                if ty == 'placeholder':
                    continue
                elif ty == 'paragraph':
                    parts.append('<p>' + y['content'] + '</p>')
                elif ty == 'header':
                    parts.append('<h4>' + y['content'] + '</h4>')
                elif ty == 'graphic':
                    parts.append(
                        img_tmpl.format(
                            y['resizer_url'].split('&')[0] + res,
                            y.get('description', ''),
                        )
                    )
                else:
                    # Unknown element types are logged and otherwise ignored.
                    self.log('**', ty)

            if data.get('sign_off'):
                parts.append('<p class="auth">' + data['sign_off'] + '</p>')

        return '<html><body><div>' + ''.join(parts) + '</div></body></html>'

    def get_browser(self, *args, **kwargs):
        """Return a browser with the mobile-app user agent and a geo cookie header."""
        kwargs['user_agent'] = (
            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
        return br

    def print_version(self, url):
        """Map a reuters.com article URL to its mobile JSON endpoint."""
        return (
            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
            + '?outputType=json'
        )