calibre/recipes/reuters.recipe
#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta
from calibre.web.feeds.news import BasicNewsRecipe
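

# Format an ISO-8601 timestamp from the Reuters API, shifted by the local UTC
# offset, for display in the article header.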
def p_dt(x):
    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
    return dt.strftime('%b %d, %Y, %I:%M %p')


class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        "Reuters, the news and media division of Thomson Reuters, is the world's largest multimedia news provider, "
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        "news to professionals via desktop terminals, the world's media organizations, industry events and directly to consumers."
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = True

    extra_css = '''
        .label, .auth { font-size:small; color:#202020; }
        .desc { font-style: italic; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    '''

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480',
        },
        'spr': {
            'short': 'Include Sports sections?',
            'long': 'Yes/No',
            'default': 'No',
        },
    }
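
    # Apply the optional 'days' override to oldest_article before downloads begin.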
    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)
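
    # Build the feed list from Reuters' mobile JSON menu API: each top-level
    # section and its immediate sub-sections become feeds, with stories filtered
    # by age and by the optional 'spr' (sports) setting.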
    def parse_index(self):
        index = 'https://www.reuters.com'
        today = datetime.now()
        sections = []
        sec_api = json.loads(
            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
        )
        for s in sec_api[0]['data']['hierarchy']['children']:
            if s.get('type', '') == 'section':
                sections.append((s['name'], s['id']))
                sections.extend(
                    (s['name'] + ' - ' + s2['name'], s2['id'])
                    for s2 in s.get('children', [])
                    if s2.get('type', '') == 'section'
                )
        feeds = []
        for sec, link in sections:
            sp = self.recipe_specific_options.get('spr')
            if sp and isinstance(sp, str):
                if sp.lower().strip() != 'yes':
                    if sec.lower().startswith('sport'):
                        continue
            self.log(sec)
            articles = []
            data = json.loads(
                self.index_to_soup(
                    index + '/mobile/v1' + link + '?outputType=json', raw=True
                )
            )
            for st in (
                story
                for x in data
                if isinstance(x, dict)
                for story in x.get('data', {}).get('stories', [])
            ):
                title = st['title']
                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
                    seconds=time.timezone
                )
                if (today - date) > timedelta(self.oldest_article):
                    continue
                desc = st['description']
                url = index + st['url']
                self.log(' ', title, '\n\t', desc, '\n\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((sec, articles))
        return feeds
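
    # The downloaded "page" is the article's JSON payload (see print_version);
    # rebuild it as HTML: headline, description, byline, lead image, read-time
    # line, summary bullets, then the body content elements.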
    def preprocess_raw_html(self, raw, url):
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w
        body = ''
        for det in json.loads(raw):
            if not det.get('type', '') == 'article_detail':
                continue
            data = det['data']['article']
            body += '<h1>' + data['title'] + '</h1>'
            if data.get('description'):
                body += '<p class="desc">' + data['description'] + '</p>'
            if data.get('authors'):
                body += (
                    '<p class="auth">'
                    + 'By '
                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
                    + '</p>'
                )
            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
                th = data['thumbnail']
                body += '<img src="{}"><div class="figc">{}</div>'.format(
                    th['resizer_url'].split('&')[0] + res,
                    th.get('caption', ''),
                )
            body += (
                '<p class="auth">'
                + str(data.get('read_minutes', '_'))
                + ' minute read | '
                + str(data['word_count'])
                + ' words | '
                + p_dt(
                    data['updated_time']
                    if data.get('updated_time')
                    else data['display_time']
                )
                + '</p>'
            )
            if data.get('summary'):
                body += (
                    '<blockquote>'
                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
                    + '</blockquote>'
                )
            for y in data['content_elements']:
                ty = y.get('type', '')
                if ty == 'placeholder':
                    continue
                elif ty == 'paragraph':
                    body += '<p>' + y['content'] + '</p>'
                elif ty == 'header':
                    body += '<h4>' + y['content'] + '</h4>'
                elif ty == 'graphic':
                    body += '<img src="{}"><div class="figc">{}</div>'.format(
                        y['resizer_url'].split('&')[0] + res,
                        y.get('description', ''),
                    )
                else:
                    self.log('**', ty)
            if data.get('sign_off'):
                body += '<p class="auth">' + data['sign_off'] + '</p>'
        return '<html><body><div>' + body + '</div></body></html>'
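
    # Identify as the Reuters mobile app and send a neutral geo cookie.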
    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
        return br
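
    # Fetch each article from the mobile JSON endpoint instead of the web page.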
    def print_version(self, url):
        return (
            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
            + '?outputType=json'
        )