diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe
index ae52d6f406..d5e8eed538 100644
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@@ -1,114 +1,134 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2020, Kovid Goyal
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
 import json
+import time
+from datetime import datetime, timedelta
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
-country = 'us'
-country_defs = {
-    'us': ('www.reuters.com', {
-        'World': 'world',
-        'Business': 'business',
-        'Markets': 'markets',
-        'Tech': 'technology',
-        # 'Sports': 'lifestyle/sports',
-        'Wealth': 'markets/wealth',
-    })
-}
-
-
-def prefixed_classes(classes):
-    q = frozenset(classes.split(' '))
-
-    def matcher(x):
-        if x:
-            for candidate in frozenset(x.split()):
-                for x in q:
-                    if candidate.startswith(x):
-                        return True
-        return False
-    return {'attrs': {'class': matcher}}
-
-
-def extract_article_list(raw):
-    if isinstance(raw, bytes):
-        raw = raw.decode('utf-8')
-    # open('/t/raw.html', 'w').write(raw)
-    idx = raw.index(';Fusion.globalContent={')
-    d = raw[idx:]
-    d = d[d.index('{'):]
-    data = json.JSONDecoder().raw_decode(d)[0]
-    # from pprint import pformat
-    # print(pformat(data), file=open('/t/raw.py', 'w'))
-    k = 'arcResult' if 'arcResult' in data else 'result'
-    for article in data[k]['articles']:
-        yield {'title': article['title'], 'description': article['description'], 'url': article['canonical_url']}
-
-
-# if __name__ == '__main__':
-#     print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read())))
-
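+
+
+# Feed timestamps are ISO-8601 UTC with a trailing 'Z'; strip the suffix and
+# adjust by the local UTC offset before formatting for display.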
+def p_dt(x):
+    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
+    return dt.strftime('%b %d, %Y, %I:%M %p')
+
+
 class Reuters(BasicNewsRecipe):
     title = 'Reuters'
-    description = 'News from all over'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'unkn0wn'
+    description = (
+        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
+        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
+        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
+    )
+    masthead_url = 'https://www.reutersprofessional.com/wp-content/uploads/2024/03/primary-logo.svg'
     language = 'en'
-
-
-    keep_only_tags = [
-        prefixed_classes('article-body__container__ article-header__container__'),
-    ]
-    remove_tags = [
-        prefixed_classes(
-            'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
-            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
-            ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
-        ),
-        dict(name=['button', 'link', 'svg']),
-    ]
+    encoding = 'utf-8'
+    oldest_article = 2  # days
+    no_javascript = True
+    no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
+    resolve_internal_links = True
+    ignore_duplicate_articles = {'url', 'title'}
+
     extra_css = '''
-        img { max-width: 100%; }
-        [class^="article-header__tags__"],
-        [class^="author-bio__author-card__"],
-        [class^="article-header__author-date__"] {
-            font-size:small;
-        }
-        [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
+        .label, .auth { font-size:small; color:#202020; }
+        .figc { font-size:small; text-align:center; }
+        img { display:block; margin:0 auto; }
     '''
 
     def parse_index(self):
-        base, sections = country_defs[country]
-        ans = []
+        index = 'https://www.reuters.com'
+        today = datetime.now()
+        feed_api = index + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
+        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
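+        # Reuters' "outboundfeeds" JSON API: feed_api lists up to 50 recent
+        # stories for a section (from=0&size=50); path_api fetches a single
+        # article, also as JSON, by its native path.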
+        sections = [
+            'world', 'business', 'markets', 'sustainability', 'legal',
+            'breakingviews', 'technology', 'sports', 'science', 'lifestyle'
+        ]
-        for section_title in sections:
-            slug = sections[section_title]
-            self.log(section_title)
-            articles = list(self.parse_reuters_section(base, slug))
+        feeds = []
+
+        for sec in sections:
+            section = sec.capitalize()
+            self.log(section)
+
+            articles = []
+
+            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))['wireitems']
+
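+            # Wire items of type 'story' carry a list of 'templates'; the one
+            # whose type is 'story' holds the headline ('hed'), summary ('lede'),
+            # update time and the action linking to the article JSON.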
+            for x in data:
+                if x.get('wireitem_type', '') == 'story':
+                    for y in x['templates']:
+                        if y.get('type', '') == 'story':
+                            title = y['story']['hed']
+
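+                            # Skip stories older than `oldest_article` days.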
+                            date = datetime.fromisoformat(y['story']['updated_at'][:-1]) + timedelta(seconds=time.timezone)
+                            if (today - date) > timedelta(self.oldest_article):
+                                continue
+
+                            desc = y['story']['lede']
+                            path = y['template_action']
+                            if path.get('type', '') == 'article':
+                                url = path_api.format(path['api_path_native'])
+                                self.log(' ', title, '\n\t', desc)
+                                articles.append({'title': title, 'description': desc, 'url': url})
             if articles:
-                ans.append((section_title, articles))
-            if self.test and len(ans) >= self.test[0]:
-                break
-        return ans
+                feeds.append((section, articles))
+        return feeds
 
-    def parse_reuters_section(self, base, slug):
-        url = 'https://' + base + '/' + slug
-        raw = self.index_to_soup(url, raw=True)
-        for article in extract_article_list(raw):
-            article['url'] = 'https://{}{}'.format(base, article['url'])
-            yield article
-            self.log('\t', article['title'], article['url'])
+    def preprocess_raw_html(self, raw, url):
+        js = json.loads(raw)
+        data = js['wireitems']
+        body = ''
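+        # The article endpoint returns JSON rather than HTML, so the page is
+        # rebuilt here from the story 'templates': label, headline, authors and
+        # dateline first, then the body blocks.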
+        for x in data:
+            if x.get('wireitem_type', '') == 'story':
+                for y in x['templates']:
+                    if 'label' in y['cid']:
+                        body += '<div class="label">' + y['title'] + '</div>'
+                        break
+                for y in x['templates']:
+                    if 'title' in y['cid']:
+                        body += '<h1 title="{}">'.format(js['share_url']) + y['content'] + '</h1>'
+                        break
+                for y in x['templates']:
+                    if 'author' in y['cid']:
+                        body += '<p class="auth">'
+                        auths = [x for x in y.get('authors_names', [])]
+                        if auths:
+                            body += 'By ' + ', '.join(auths) + '</p>'
+                        break
+                for y in x['templates']:
+                    if 'datetime' in y['cid']:
+                        body += '<p class="auth">' + str(y['read_minutes']) \
+                            + ' minute read | ' + p_dt(y['display_time']) + '</p>'
+                        body += '<hr>'
+                        break
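+                # Body blocks, in story order: paragraphs, sub-headings,
+                # images, galleries and video thumbnails.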
+                for y in x['templates']:
+                    if 'paragraph' in y['cid']:
+                        body += '<p>' + y['content'] + '</p>'
+                    if 'header' in y['cid']:
+                        body += '<h4>' + y['content'] + '</h4>'
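+                    # Where 'renditions' exist, request a 480px-wide rendition
+                    # by trimming the URL query down to a single width param.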
+                    if 'image' in y['cid']:
+                        if 'renditions' in y['image']:
+                            body += '<img src="{}"><div class="figc">{}</div>'.format(
+                                y['image']['url'].split('&')[0] + '&width=480', y['image']['caption']
+                            )
+                        else:
+                            body += '<img src="{}"><div class="figc">{}</div>'.format(
+                                y['image']['url'], y['image']['caption']
+                            )
+                    if 'gallery' in y['cid']:
+                        for imgs in y['images']:
+                            if 'renditions' in imgs:
+                                body += '<img src="{}"><div class="figc">{}</div>'.format(
+                                    imgs['url'].split('&')[0] + '&width=480', imgs['caption']
+                                )
+                            else:
+                                body += '<img src="{}"><div class="figc">{}</div>'.format(
+                                    imgs['url'], imgs['caption']
+                                )
+                    if 'video' in y['cid']:
+                        body += '<img src="{}"><div class="figc">{}</div>'.format(
+                            y['video']['thumbnail']['url'], y['video']['thumbnail']['caption']
+                        )
+        return BeautifulSoup('<div>' + body + '</div>').prettify()
 
-    def preprocess_html(self, soup):
-        for noscript in soup.findAll('noscript'):
-            if noscript.findAll('img'):
-                noscript.name = 'div'
-        for img in soup.findAll('img', attrs={'srcset':True}):
-            img['src'] = img['srcset'].split()[0]
-        return soup
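+    # preprocess_raw_html stashes the canonical share URL in the <h1> 'title'
+    # attribute; recover it here so article links point at reuters.com rather
+    # than the JSON API.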
+    def populate_article_metadata(self, article, soup, first):
+        article.url = soup.find('h1')['title']