calibre/recipes/reuters.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
import json
import time
from datetime import datetime, timedelta, timezone

from calibre.web.feeds.news import BasicNewsRecipe


def p_dt(x):
    """Format a UTC ISO-8601 timestamp as a local-time display string.

    ``x`` is a timestamp string such as ``'2024-01-01T00:00:00Z'``. A trailing
    ``Z`` is stripped before parsing; a value without any UTC offset is taken
    to be UTC. The result is converted to the system's local timezone and
    returned in the form ``'Jan 01, 2024, 05:30 AM'``.
    """
    stamp = x[:-1] if x.endswith('Z') else x
    dt = datetime.fromisoformat(stamp)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    # time.timezone is the offset *west* of UTC and ignores DST, so adding it
    # to a UTC value moves the clock the wrong way; astimezone() applies the
    # correct local offset (including DST) for the given instant.
    return dt.astimezone().strftime('%b %d, %Y, %I:%M %p')


class Reuters(BasicNewsRecipe):
    """Recipe that builds a Reuters issue from the site's mobile JSON feeds.

    ``parse_index`` reads one JSON feed per section and collects recent
    stories, ``preprocess_raw_html`` converts each article's JSON payload into
    HTML, and ``populate_article_metadata`` replaces the article URL with the
    share URL stored on the generated ``<h1>``.
    """

    title = 'Reuters'
    __author__ = 'unkn0wn'
    description = (
        'Reuters, the news and media division of Thomson Reuters, is the world’s largest multimedia news provider, '
        'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
        'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
    )
    masthead_url = (
        'https://upload.wikimedia.org/wikipedia/commons/9/9e/Reuters_logo_2024.svg'
    )
    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
    language = 'en'
    encoding = 'utf-8'
    oldest_article = 1.2  # days
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True
    ignore_duplicate_articles = {'url', 'title'}

    extra_css = """
        .label, .auth { font-size:small; color:#202020; }
        .figc { font-size:small; }
        img {display:block; margin:0 auto;}
    """

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        },
        'res': {
            'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
            'long': 'This is useful for non e-ink devices',
            'default': '480'
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the base recipe, then apply a user-supplied 'days' value.

        Only a string value is used: the class-level entry for 'days' is a
        dict describing the option, which the ``isinstance`` check skips.
        """
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    @staticmethod
    def _parse_utc(stamp):
        """Parse a feed timestamp into a timezone-aware datetime.

        A trailing ``Z`` is stripped before parsing and a value that carries
        no UTC offset is taken to be UTC.
        """
        parsed = datetime.fromisoformat(stamp[:-1] if stamp.endswith('Z') else stamp)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    @staticmethod
    def _find_template(templates, keyword):
        """Return the first template whose ``cid`` contains ``keyword``, else None."""
        for tmpl in templates:
            if keyword in (tmpl.get('cid') or ''):
                return tmpl
        return None

    @staticmethod
    def _figure_html(src, caption):
        """Return ``<img>`` plus caption markup; a missing caption renders empty."""
        return '<img src="{}"><div class="figc">{}</div>'.format(src, caption or '')

    def _image_html(self, image, res):
        """Return figure markup for an image dict.

        When the dict has a ``renditions`` key, everything from the first
        ``&`` in the URL onwards is replaced with the ``res`` width parameter.
        """
        src = image['url']
        if 'renditions' in image:
            src = src.split('&')[0] + res
        return self._figure_html(src, image.get('caption'))

    def _header_html(self, templates, js):
        """Return the label, headline and byline fragments for one story."""
        out = []

        label = self._find_template(templates, 'label')
        if label is not None:
            out.append('<div class="label">' + label['title'] + '</div>')

        headline = self._find_template(templates, 'title')
        if headline is not None:
            # The share URL rides along on the <h1> so that
            # populate_article_metadata can read it back later.
            out.append(
                '<h1 title="{}">'.format(js['share_url'])
                + headline['content']
                + '</h1>'
            )

        # Byline blocks are emitted as sibling <div>s. Wrapping <div>s in a
        # <p> is invalid nesting and left the tags unbalanced whenever either
        # the author or the datetime template was absent.
        for tmpl in templates:
            if 'author' not in (tmpl.get('cid') or ''):
                continue
            names = tmpl.get('authors_names') or []
            if names:
                out.append('<div class="auth">' + 'By ' + ', '.join(names) + '</div>')
                break

        stamp = self._find_template(templates, 'datetime')
        if stamp is not None:
            out.append(
                '<div class="auth">'
                + str(stamp['read_minutes'])
                + ' minute read | '
                + p_dt(stamp['display_time'])
                + '</div>'
            )
        return out

    def _content_html(self, tmpl, res):
        """Return the HTML fragments produced by a single body template.

        The checks are independent (not ``elif``) so a ``cid`` matching more
        than one keyword contributes every matching fragment.
        """
        cid = tmpl.get('cid') or ''
        out = []
        if 'paragraph' in cid:
            out.append('<p>' + tmpl['content'] + '</p>')
        if 'header' in cid:
            out.append('<h4>' + tmpl['content'] + '</h4>')
        if 'image' in cid:
            out.append(self._image_html(tmpl['image'], res))
        if 'gallery' in cid:
            out.extend(self._image_html(img, res) for img in tmpl['images'])
        if 'video' in cid:
            # Videos are represented by their thumbnail, used at its given URL.
            thumb = tmpl['video']['thumbnail']
            out.append(self._figure_html(thumb['url'], thumb.get('caption')))
        return out

    def parse_index(self):
        """Return the issue index as ``[(section title, [article, ...]), ...]``.

        Each article is a dict with ``title``, ``description`` and ``url``
        keys, where ``url`` is the JSON endpoint of the article. Stories whose
        ``updated_at`` is more than ``self.oldest_article`` days old are
        skipped, and sections that end up with no articles are left out.
        """
        index = 'https://www.reuters.com'
        feed_api = (
            index
            + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
        )
        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
        sections = [
            'world',
            'business',
            'markets',
            'sustainability',
            'legal',
            'breakingviews',
            'technology',
            # 'sports',
            'science',
            'lifestyle',
        ]

        # Ages are computed between timezone-aware instants. Shifting the UTC
        # timestamp by time.timezone (seconds *west* of UTC) and comparing it
        # with naive local time skewed every age by twice the local offset.
        now = datetime.now(timezone.utc)
        max_age = timedelta(self.oldest_article)

        feeds = []
        for sec in sections:
            section = sec.capitalize()
            self.log(section)

            raw = self.index_to_soup(feed_api.format(sec), raw=True)
            articles = []
            for item in json.loads(raw)['wireitems']:
                if item.get('wireitem_type', '') != 'story':
                    continue
                for tmpl in item['templates']:
                    if tmpl.get('type', '') != 'story':
                        continue
                    story = tmpl['story']
                    title = story['hed']
                    if now - self._parse_utc(story['updated_at']) > max_age:
                        continue
                    desc = story['lede']
                    action = tmpl['template_action']
                    if action.get('type', '') != 'article':
                        continue
                    url = path_api.format(action['api_path_native'])
                    self.log('            ', title, '\n\t', desc)
                    articles.append(
                        {'title': title, 'description': desc, 'url': url}
                    )
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_raw_html(self, raw, url):
        """Convert an article's JSON payload (``raw``) into an HTML document.

        For every ``story`` wire item the label, headline, byline and then the
        body templates (paragraphs, headers, images, galleries, video
        thumbnails) are rendered in order. Image widths follow the 'res'
        option when the user supplied one, otherwise 480. ``url`` is unused.
        """
        res = '&width=480'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '&width=' + w

        js = json.loads(raw)
        parts = []
        for item in js['wireitems']:
            if item.get('wireitem_type', '') != 'story':
                continue
            templates = item['templates']
            parts.extend(self._header_html(templates, js))
            for tmpl in templates:
                parts.extend(self._content_html(tmpl, res))
        return '<html><body><div>' + ''.join(parts) + '</div></body></html>'

    def populate_article_metadata(self, article, soup, first):
        """Point ``article.url`` at the share URL stored on the ``<h1>``.

        If the document has no ``<h1>`` or it carries no ``title`` attribute,
        the article keeps the URL it already has instead of raising.
        """
        heading = soup.find('h1')
        if heading is not None and heading.get('title'):
            article.url = heading['title']