#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


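# Convert site-relative hrefs into absolute https://www.si.com URLs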
def absolutize(href):
    if href.startswith('/'):
        href = 'https://www.si.com' + href
    return href


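# Build a BeautifulSoup attrs matcher that selects tags carrying any of the given CSS classes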
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class SI(BasicNewsRecipe):
    title = 'Sports Illustrated'
    __author__ = 'Kovid Goyal'
    language = 'en'
    encoding = 'utf-8'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    remove_attributes = ['style']

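    # Limit each article to its header and body, then strip embedded video, ad slots and unwanted tags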
    keep_only_tags = [
        classes('m-detail-header m-detail--body'),
    ]
    remove_tags = [
        classes('media-video OUTBRAIN m-in-content-ad-row'),
        dict(name=['button', 'meta', 'source']),
    ]

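    # Request English content from the site via the Accept-Language header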
    def get_browser(self, *args, **kwargs):
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('Accept-Language', 'en')]
        return br

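    # Lazy-loaded images keep their URL in data-src; convert such tags into plain <img> elements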
    def preprocess_html(self, soup, *a):
        for tag in soup.findAll(attrs={'data-src': True}):
            tag.name = 'img'
            del tag.contents[:]
            tag['src'] = tag['data-src']
            tag['height'] = tag['width'] = ''
        return soup

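    # Scrape the SI homepage and group article links into feeds by their section label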
    def parse_index(self):
        soup = self.index_to_soup('https://www.si.com/')
        # from calibre.utils.ipython import ipython
        # ipython({'soup': soup})
        cats = {}
        for ps in soup.findAll('phoenix-super-link'):
            h2 = ps.find('h2')
            title = self.tag_to_string(h2)
            label = ps.find(attrs={'phx-track-id': 'Label'})
            category = self.tag_to_string(label) if label is not None else 'Features'
            url = absolutize(ps['href'])
            arts = cats.setdefault(category, [])
            arts.append({'title': title, 'url': url})
            self.log('Found article', title)
        ans = []
        for key in sorted(cats, key=lambda x: x.lower()):
            ans.append((key, cats[key]))
        return ans