#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


def absolutize(href):
    """Return *href* as an absolute URL on www.si.com.

    Protocol-relative links (``//host/path``) get an ``https:`` scheme,
    site-relative links (``/path``) get the ``https://www.si.com`` origin,
    and anything else is returned unchanged.
    """
    # Must be tested before the single-slash case, which would otherwise
    # turn '//host/path' into 'https://www.si.com//host/path'.
    if href.startswith('//'):
        return 'https:' + href
    if href.startswith('/'):
        return 'https://www.si.com' + href
    return href


def classes(classes):
    """Build a keyword dict matching tags that carry any of the given classes.

    *classes* is a space separated string of CSS class names. The returned
    dict has a single ``attrs`` entry whose ``class`` matcher is truthy when
    the tag's class attribute shares at least one name with *classes*.
    """
    wanted = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(wanted)})


class SI(BasicNewsRecipe):
    """Recipe that builds its article list from the www.si.com front page."""

    title = 'Sports Illustrated'
    __author__ = 'Kovid Goyal'
    language = 'en'
    encoding = 'utf-8'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    remove_attributes = ['style']

    keep_only_tags = [
        classes('m-detail-header m-detail--body'),
    ]
    remove_tags = [
        classes('media-video OUTBRAIN m-in-content-ad-row'),
        dict(name=['button', 'meta', 'source']),
    ]

    def get_browser(self, *args, **kwargs):
        """Return the base class browser with an ``Accept-Language: en`` header added."""
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('Accept-Language', 'en')]
        return br

    def preprocess_html(self, soup, *a):
        """Rewrite every element that has a ``data-src`` attribute as an image.

        Each such tag is renamed to ``img``, emptied of its children, given
        ``src`` equal to its ``data-src`` value, and has ``height`` and
        ``width`` set to the empty string. Returns the modified *soup*.
        """
        # NOTE(review): presumably these are lazy-loaded image placeholders;
        # confirm against the live page markup.
        for tag in soup.findAll(attrs={'data-src': True}):
            tag.name = 'img'
            del tag.contents[:]
            tag['src'] = tag['data-src']
            tag['height'] = tag['width'] = ''
        return soup

    def parse_index(self):
        """Collect articles from the front page, grouped by category label.

        Every ``phoenix-super-link`` element is treated as a candidate
        article: its ``<h2>`` supplies the title, its ``href`` the URL and a
        descendant with ``phx-track-id="Label"`` the category (``'Features'``
        when there is no label). Candidates without an ``href`` or without a
        title are skipped.

        Returns a list of ``(category, articles)`` tuples ordered by
        case-insensitive category name, where ``articles`` is a list of
        ``{'title': ..., 'url': ...}`` dicts in page order.
        """
        soup = self.index_to_soup('https://www.si.com/')
        # from calibre.utils.ipython import ipython
        # ipython({'soup': soup})
        cats = {}
        for ps in soup.findAll('phoenix-super-link'):
            # A link without href used to raise KeyError and abort the whole
            # index; such an element cannot be downloaded anyway.
            href = ps.get('href')
            if not href:
                continue
            h2 = ps.find('h2')
            title = self.tag_to_string(h2) if h2 is not None else ''
            if not title:
                # Links that carry no headline would show up as untitled
                # entries, so leave them out.
                continue
            label = ps.find(attrs={'phx-track-id': 'Label'})
            category = self.tag_to_string(label) if label is not None else 'Features'
            url = absolutize(href)
            cats.setdefault(category, []).append({'title': title, 'url': url})
            self.log('Found article', title)
        return [(key, cats[key]) for key in sorted(cats, key=lambda x: x.lower())]