calibre/recipes/reuters.recipe

#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json
from calibre.web.feeds.news import BasicNewsRecipe

# Edition to fetch; used as the lookup key into country_defs.
country = 'us'

# Per-edition settings: edition code -> (site host name, sections), where
# sections maps the section title shown in the output to the URL path of
# that section on the site.
country_defs = {
    'us': (
        'www.reuters.com',
        {
            'World': 'world',
            'Business': 'business',
            'Markets': 'markets',
            'Tech': 'technology',
            # 'Sports': 'lifestyle/sports',
            'Wealth': 'markets/wealth',
        },
    ),
}


def prefixed_classes(classes):
    """Build a tag selector that matches elements by class-name prefix.

    ``classes`` is a whitespace-separated string of prefixes. The return
    value is a dict of the form ``{'attrs': {'class': matcher}}`` where
    ``matcher`` takes a class attribute value (a string, or a falsy value
    when there is none) and returns True if any whitespace-separated class
    name in it starts with one of the prefixes, False otherwise.
    """
    # split() rather than split(' '): a leading, trailing or doubled space
    # must not produce an empty prefix. Every string starts with '', so an
    # empty prefix would make the matcher accept any element with a class.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(name.startswith(prefixes) for name in x.split())

    return {'attrs': {'class': matcher}}


def extract_article_list(raw):
    """Yield the articles listed in the JSON embedded in a section page.

    ``raw`` is the page source, either as text or as UTF-8 encoded bytes.
    The JSON object assigned to ``Fusion.globalContent`` is decoded and the
    entries of its ``articles`` list (under ``arcResult`` when that key is
    present, otherwise under ``result``) are yielded one at a time as dicts
    with the keys ``title``, ``description`` and ``url``.

    This is a generator, so nothing is parsed until the first item is
    requested. ``str.index`` raises ValueError if the marker is absent and
    missing JSON keys raise KeyError.
    """
    html = raw.decode('utf-8') if isinstance(raw, bytes) else raw
    # Debugging aid: open('/t/raw.html', 'w').write(html)
    marker_pos = html.index(';Fusion.globalContent={')
    json_start = html.index('{', marker_pos)
    # raw_decode() parses one JSON value from the start of the string and
    # tolerates the trailing script text that follows it.
    content, _end = json.JSONDecoder().raw_decode(html[json_start:])
    # Debugging aid:
    # from pprint import pformat
    # print(pformat(content), file=open('/t/raw.py', 'w'))
    if 'arcResult' in content:
        container = content['arcResult']
    else:
        container = content['result']
    for item in container['articles']:
        yield {
            'title': item['title'],
            'description': item['description'],
            'url': item['canonical_url'],
        }


# if __name__ == '__main__':
#     print(list(extract_article_list(open('/t/World News _ Latest Top Stories _ Reuters.html').read())))


class Reuters(BasicNewsRecipe):
    """Recipe that builds its index from the Reuters section pages.

    The site host and the sections to fetch come from the module-level
    ``country_defs`` table, selected by ``country``.
    """

    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    # The site's class names carry a generated suffix, so elements are
    # selected by class-name prefix rather than by exact class.
    keep_only_tags = [
        prefixed_classes('article-body__container__ article-header__container__'),
    ]
    remove_tags = [
        prefixed_classes(
            'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
            ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
        ),
        dict(name=['button', 'link', 'svg']),
    ]
    remove_attributes = ['style', 'height', 'width']

    extra_css = '''
        img { max-width: 100%; }
        [class^="article-header__tags__"],
        [class^="author-bio__author-card__"],
        [class^="article-header__author-date__"] {
            font-size:small;
        }
        [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
    '''

    def parse_index(self):
        """Return the index as a list of ``(section title, articles)`` tuples.

        Sections for which no articles were found are left out. When
        ``self.test`` is truthy, its first element caps the number of
        sections collected.
        """
        base, sections = country_defs[country]
        ans = []

        for section_title, slug in sections.items():
            self.log(section_title)
            articles = list(self.parse_reuters_section(base, slug))
            if articles:
                ans.append((section_title, articles))
            if self.test and len(ans) >= self.test[0]:
                break
        return ans

    def parse_reuters_section(self, base, slug):
        """Yield the article dicts of one section with absolute URLs.

        ``base`` is the site host name and ``slug`` the section's URL path.
        The section page is downloaded, its embedded article list is
        extracted and each article's ``url`` is prefixed with
        ``https://<base>``.
        """
        url = 'https://' + base + '/' + slug
        raw = self.index_to_soup(url, raw=True)
        for article in extract_article_list(raw):
            article['url'] = 'https://{}{}'.format(base, article['url'])
            # Log before yielding so the line is written even if the
            # consumer stops iterating after this article.
            self.log('\t', article['title'], article['url'])
            yield article

    def preprocess_html(self, soup):
        """Make images that lack a direct ``src`` usable and return ``soup``.

        ``<noscript>`` elements that contain images are renamed to ``<div>``,
        and every ``<img>`` with a ``srcset`` gets its ``src`` set to the
        first URL listed there.
        """
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                noscript.name = 'div'
        for img in soup.findAll('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            # An empty or whitespace-only srcset has no URL to take; leave
            # the tag as it is instead of failing with IndexError.
            if candidates:
                img['src'] = candidates[0]
        return soup