mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
195 lines
7.6 KiB
Python
195 lines
7.6 KiB
Python
#!/usr/bin/env python
|
||
# vim:fileencoding=utf-8
|
||
import json
|
||
import time
|
||
from datetime import datetime, timedelta
|
||
|
||
from calibre.web.feeds.news import BasicNewsRecipe
|
||
|
||
|
||
def p_dt(x):
    """Format a UTC ISO-8601 timestamp as a local-time display string.

    :param x: timestamp string such as ``'2024-01-15T12:00:00Z'``. A trailing
        ``Z`` is stripped before parsing; a value without any offset is
        treated as UTC.
    :return: the moment expressed in the machine's local timezone, formatted
        like ``'Jan 15, 2024, 07:00 AM'``.
    :raises ValueError: if the string is not a valid ISO-8601 timestamp.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Strip the 'Z' suffix explicitly; fromisoformat() only accepts it
    # natively on newer Python versions.
    stamp = x[:-1] if x.endswith('Z') else x
    dt = datetime.fromisoformat(stamp)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    # time.timezone is seconds *west* of UTC and ignores DST, so adding it to
    # a UTC value moves the wrong way; astimezone() applies the real local
    # offset for that instant.
    return dt.astimezone().strftime('%b %d, %Y, %I:%M %p')
|
||
|
||
|
||
class Reuters(BasicNewsRecipe):
    """Calibre recipe that assembles Reuters articles from the site's mobile JSON feeds.

    ``parse_index`` queries one JSON feed per section and keeps stories newer
    than ``oldest_article`` days. ``preprocess_raw_html`` converts each
    article's JSON payload into a small HTML document, and
    ``populate_article_metadata`` restores the public article URL that
    ``preprocess_raw_html`` stashed in the ``<h1 title="...">`` attribute.
    """

    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url', 'title'}

    extra_css = """
        .label, .auth { font-size:small; color:#202020; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    """

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480'
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply a string 'days' option to ``oldest_article``."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # The class-level entry for 'days' is a dict, so the isinstance check
        # only passes when the value is a plain string.
        # NOTE(review): presumably calibre substitutes the user's chosen value
        # as a string here - confirm against BasicNewsRecipe.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    @staticmethod
    def _utc_datetime(stamp):
        """Parse a feed timestamp into a timezone-aware datetime (UTC when no offset is given)."""
        # Local import: the module-level import only brings in datetime/timedelta.
        from datetime import timezone

        if stamp.endswith('Z'):
            stamp = stamp[:-1]
        parsed = datetime.fromisoformat(stamp)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    @staticmethod
    def _figure(src, caption):
        """Return the HTML for one image followed by its caption block."""
        return '<img src="{}"><div class="figc">{}</div>'.format(src, caption)

    def parse_index(self):
        """Build the list of ``(section title, articles)`` feeds.

        Each article is a dict with ``title``, ``description`` and ``url``
        keys; ``url`` points at the JSON endpoint for the article, which
        ``preprocess_raw_html`` later turns into HTML. Stories whose
        ``updated_at`` is older than ``oldest_article`` days are skipped, and
        sections with no remaining articles are omitted.
        """
        from datetime import timezone

        index = 'https://www.reuters.com'
        # Compare ages using aware UTC datetimes. Shifting the feed time by
        # time.timezone (seconds *west* of UTC) and comparing against a naive
        # local now() skews the cutoff by twice the local UTC offset.
        now = datetime.now(timezone.utc)
        max_age = timedelta(self.oldest_article)
        feed_api = (
            index
            + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
        )
        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
        sections = [
            'world',
            'business',
            'markets',
            'sustainability',
            'legal',
            'breakingviews',
            'technology',
            # 'sports',
            'science',
            'lifestyle',
        ]

        feeds = []

        for sec in sections:
            section = sec.capitalize()
            self.log(section)

            articles = []

            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
                'wireitems'
            ]

            for item in data:
                if item.get('wireitem_type', '') != 'story':
                    continue
                for tmpl in item['templates']:
                    if tmpl.get('type', '') != 'story':
                        continue
                    story = tmpl['story']
                    title = story['hed']

                    if now - self._utc_datetime(story['updated_at']) > max_age:
                        continue

                    desc = story['lede']
                    path = tmpl['template_action']
                    if path.get('type', '') == 'article':
                        url = path_api.format(path['api_path_native'])
                        self.log(' ', title, '\n\t', desc)
                        articles.append(
                            {'title': title, 'description': desc, 'url': url}
                        )
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_raw_html(self, raw, url):
        """Convert an article's JSON payload into a minimal HTML document.

        :param raw: JSON text with a ``wireitems`` list and a ``share_url``.
        :param url: the URL the payload was fetched from (unused here).
        :return: an HTML string containing, for every ``story`` wire item, its
            label, title, byline, read-time/date line, and then its
            paragraphs, headers, images, galleries and video thumbnails in
            template order.
        """
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w

        def sized(img):
            # Images that list renditions get their query string after the
            # first '&' replaced by the requested width; others are used as-is.
            if 'renditions' in img:
                return img['url'].split('&')[0] + res
            return img['url']

        js = json.loads(raw)
        parts = []
        for item in js['wireitems']:
            if item.get('wireitem_type', '') != 'story':
                continue
            templates = item['templates']

            # Only the first matching template is used for each header piece.
            for y in templates:
                if 'label' in y['cid']:
                    parts.append('<div class="label">' + y['title'] + '</div>')
                    break

            for y in templates:
                if 'title' in y['cid']:
                    # The share URL rides along in the title attribute so that
                    # populate_article_metadata can recover it.
                    parts.append(
                        '<h1 title="{}">'.format(js['share_url'])
                        + y['content']
                        + '</h1>'
                    )
                    break

            # The byline <p> is opened by the author template. Track it so it
            # is always closed exactly once, even if the datetime template is
            # missing, and never closed when it was not opened.
            byline_open = False
            for y in templates:
                if 'author' in y['cid']:
                    parts.append('<p>')
                    byline_open = True
                    auths = list(y.get('authors_names', []))
                    if auths:
                        parts.append(
                            '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
                        )
                    break

            for y in templates:
                if 'datetime' in y['cid']:
                    parts.append(
                        '<div class="auth">'
                        + str(y['read_minutes'])
                        + ' minute read | '
                        + p_dt(y['display_time'])
                        + '</div>'
                    )
                    break
            if byline_open:
                parts.append('</p>')

            # Body content: the checks are independent substring tests on the
            # cid, so a single template may match more than one of them.
            for y in templates:
                cid = y['cid']
                if 'paragraph' in cid:
                    parts.append('<p>' + y['content'] + '</p>')
                if 'header' in cid:
                    parts.append('<h4>' + y['content'] + '</h4>')
                if 'image' in cid:
                    parts.append(self._figure(sized(y['image']), y['image']['caption']))
                if 'gallery' in cid:
                    for img in y['images']:
                        parts.append(self._figure(sized(img), img['caption']))
                if 'video' in cid:
                    thumb = y['video']['thumbnail']
                    parts.append(self._figure(thumb['url'], thumb['caption']))
        return '<html><body><div>' + ''.join(parts) + '</div></body></html>'

    def populate_article_metadata(self, article, soup, first):
        """Point ``article.url`` at the URL stored in the ``<h1 title>`` attribute.

        ``preprocess_raw_html`` writes the payload's ``share_url`` into that
        attribute. Articles without such a heading keep their existing URL
        instead of raising ``TypeError`` on a missing ``<h1>``.
        """
        heading = soup.find('h1')
        if heading is not None and heading.get('title'):
            article.url = heading['title']
|