calibre/recipes/reuters.recipe
#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta
from calibre.web.feeds.news import BasicNewsRecipe
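

# Format an ISO-8601 timestamp from the Reuters API, shifted by the local UTC
# offset, for display in the article header.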
def p_dt(x):
    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
    return dt.strftime('%b %d, %Y, %I:%M %p')


class Reuters(BasicNewsRecipe):
    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        "Reuters, the news and media division of Thomson Reuters, is the world's largest multimedia news provider, "
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        "news to professionals via desktop terminals, the world's media organizations, industry events and directly to consumers."
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = True

    extra_css = '''
        .label, .auth { font-size:small; color:#202020; }
        .desc { font-style: italic; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    '''

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480',
        },
        'spr': {
            'short': 'Include Sports sections?',
            'long': 'Yes/No',
            'default': 'No',
        },
    }
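
    # Apply the optional 'days' override to oldest_article before downloads begin.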
    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)
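
    # Build the feed list from Reuters' mobile JSON menu API: each top-level
    # section and its immediate sub-sections become feeds, with stories filtered
    # by age and by the optional 'spr' (sports) setting.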
    def parse_index(self):
        index = 'https://www.reuters.com'
        today = datetime.now()
        sections = []
        sec_api = json.loads(
            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
        )
        for s in sec_api[0]['data']['hierarchy']['children']:
            if s.get('type', '') == 'section':
                sections.append((s['name'], s['id']))
                sections.extend(
                    (s['name'] + ' - ' + s2['name'], s2['id'])
                    for s2 in s.get('children', [])
                    if s2.get('type', '') == 'section'
                )
        feeds = []
        for sec, link in sections:
            sp = self.recipe_specific_options.get('spr')
            if sp and isinstance(sp, str):
                if sp.lower().strip() != 'yes':
                    if sec.lower().startswith('sport'):
                        continue
            self.log(sec)
            articles = []
            data = json.loads(
                self.index_to_soup(
                    index + '/mobile/v1' + link + '?outputType=json', raw=True
                )
            )
            for st in (
                story
                for x in data
                if isinstance(x, dict)
                for story in x.get('data', {}).get('stories', [])
            ):
                title = st['title']
                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
                    seconds=time.timezone
                )
                if (today - date) > timedelta(self.oldest_article):
                    continue
                desc = st['description']
                url = index + st['url']
                self.log(' ', title, '\n\t', desc, '\n\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((sec, articles))
        return feeds
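
    # The downloaded "page" is the article's JSON payload (see print_version);
    # rebuild it as HTML: headline, description, byline, lead image, read-time
    # line, summary bullets, then the body content elements.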
    def preprocess_raw_html(self, raw, url):
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w
        body = ''
        for det in json.loads(raw):
            if not det.get('type', '') == 'article_detail':
                continue
            data = det['data']['article']
            body += '<h1>' + data['title'] + '</h1>'
            if data.get('description'):
                body += '<p class="desc">' + data['description'] + '</p>'
            if data.get('authors'):
                body += (
                    '<p class="auth">'
                    + 'By '
                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
                    + '</p>'
                )
            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
                th = data['thumbnail']
                body += '<img src="{}"><div class="figc">{}</div>'.format(
                    th['resizer_url'].split('&')[0] + res,
                    th.get('caption', ''),
                )
            body += (
                '<p class="auth">'
                + str(data.get('read_minutes', '_'))
                + ' minute read | '
                + str(data['word_count'])
                + ' words | '
                + p_dt(
                    data['updated_time']
                    if data.get('updated_time')
                    else data['display_time']
                )
                + '</p>'
            )
            if data.get('summary'):
                body += (
                    '<blockquote>'
                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
                    + '</blockquote>'
                )
            for y in data['content_elements']:
                ty = y.get('type', '')
                if ty == 'placeholder':
                    continue
                elif ty == 'paragraph':
                    body += '<p>' + y['content'] + '</p>'
                elif ty == 'header':
                    body += '<h4>' + y['content'] + '</h4>'
                elif ty == 'graphic':
                    body += '<img src="{}"><div class="figc">{}</div>'.format(
                        y['resizer_url'].split('&')[0] + res,
                        y.get('description', ''),
                    )
                else:
                    self.log('**', ty)
            if data.get('sign_off'):
                body += '<p class="auth">' + data['sign_off'] + '</p>'
        return '<html><body><div>' + body + '</div></body></html>'
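
    # Identify as the Reuters mobile app and send a neutral geo cookie.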
    def get_browser(self, *args, **kwargs):
        kwargs['user_agent'] = (
            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
        )
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}=')]
        return br
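
    # Fetch each article from the mobile JSON endpoint instead of the web page.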
    def print_version(self, url):
        return (
            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
            + '?outputType=json'
        )