#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe


def p_dt(x):
    # Convert an ISO-8601 timestamp ending in 'Z' into a local-time display string.
    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
    return dt.strftime('%b %d, %Y, %I:%M %p')


class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
    )
    masthead_url = 'https://www.reutersagency.com/wp-content/uploads/2024/06/reuters-logo.png'
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url', 'title'}

    extra_css = '''
        .label, .auth { font-size:small; color:#202020; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    '''

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days',
            'long': 'For example, 0.5 gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    def parse_index(self):
        index = 'https://www.reuters.com'
        today = datetime.now()

        # Arc "outboundfeeds" JSON endpoints: one lists the latest stories in a
        # section, the other fetches a single article by its native API path.
        feed_api = index + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'

        sections = [
            'world', 'business', 'markets', 'sustainability', 'legal',
            'breakingviews', 'technology', 'sports', 'science', 'lifestyle'
        ]

        feeds = []
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            articles = []
            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))['wireitems']
            for x in data:
                if x.get('wireitem_type', '') == 'story':
                    for y in x['templates']:
                        if y.get('type', '') == 'story':
                            title = y['story']['hed']
                            date = datetime.fromisoformat(y['story']['updated_at'][:-1]) + timedelta(seconds=time.timezone)
                            if (today - date) > timedelta(self.oldest_article):
                                continue
                            desc = y['story']['lede']
                            path = y['template_action']
                            if path.get('type', '') == 'article':
                                url = path_api.format(path['api_path_native'])
                                self.log(' ', title, '\n\t', desc)
                                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_raw_html(self, raw, url):
        # The article arrives as JSON; rebuild it as simple HTML, one template
        # ("wireitem") at a time.
        js = json.loads(raw)
        data = js['wireitems']
        body = ''
        for x in data:
            if x.get('wireitem_type', '') == 'story':
                for y in x['templates']:
                    if 'label' in y['cid']:
                        body += '<div class="label">' + y['title'] + '</div>'
                        break
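                # Stash the canonical share URL in the <h1> title attribute;
                # populate_article_metadata() reads it back once the article is downloaded.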
                for y in x['templates']:
                    if 'title' in y['cid']:
                        body += '<h1 title="{}">'.format(js['share_url']) + y['content'] + '</h1>'
                        break
                for y in x['templates']:
                    if 'author' in y['cid']:
                        body += '<p>'
                        auths = [a for a in y.get('authors_names', [])]
                        if auths:
                            body += '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
                        break
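                # The dateline closes the <p> opened by the author block above, so the
                # byline and read-time line render as a single block.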
                for y in x['templates']:
                    if 'datetime' in y['cid']:
                        body += '<div class="auth">' + str(y['read_minutes']) \
                            + ' minute read | ' + p_dt(y['display_time']) + '</div>'
                        body += '</p>'
                        break
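                # The remaining templates carry the story body and are rendered in
                # order: paragraphs, sub-headers, images, galleries and videos.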
                for y in x['templates']:
                    if 'paragraph' in y['cid']:
                        body += '<p>' + y['content'] + '</p>'
                    if 'header' in y['cid']:
                        body += '<h4>' + y['content'] + '</h4>'
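                    # When renditions are available, strip the extra query parameters
                    # and re-request the image at 480px wide to keep downloads small.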
                    if 'image' in y['cid']:
                        if 'renditions' in y['image']:
                            body += '<img src="{}"><div class="figc">{}</div>'.format(
                                y['image']['url'].split('&')[0] + '&width=480', y['image']['caption']
                            )
                        else:
                            body += '<img src="{}"><div class="figc">{}</div>'.format(
                                y['image']['url'], y['image']['caption']
                            )
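                    # Galleries are flattened into a run of captioned images, sized the
                    # same way as single images.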
                    if 'gallery' in y['cid']:
                        for imgs in y['images']:
                            if 'renditions' in imgs:
                                body += '<img src="{}"><div class="figc">{}</div>'.format(
                                    imgs['url'].split('&')[0] + '&width=480', imgs['caption']
                                )
                            else:
                                body += '<img src="{}"><div class="figc">{}</div>'.format(
                                    imgs['url'], imgs['caption']
                                )
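                    # Video cannot be embedded in an e-book, so keep only the thumbnail
                    # still and its caption.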
                    if 'video' in y['cid']:
                        body += '<img src="{}"><div class="figc">{}</div>'.format(
                            y['video']['thumbnail']['url'], y['video']['thumbnail']['caption']
                        )
        return BeautifulSoup('<html><body><div>' + body + '</div></body></html>').prettify()

    def populate_article_metadata(self, article, soup, first):
        # The share URL was stashed in the <h1> title attribute by preprocess_raw_html().
        article.url = soup.find('h1')['title']
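
# A quick way to test this recipe from the command line (a sketch, assuming a
# working calibre install; the output filename is just an example):
#   ebook-convert Reuters.recipe reuters.epub --test
# --test restricts the run to a couple of feeds and articles, so it finishes fast.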