Update Sports Illustrated

This commit is contained in:
Kovid Goyal 2020-06-20 10:49:08 +05:30
parent 14c8099abb
commit bacb0b78a8
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,6 +7,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
def absolutize(href):
    """Return *href* as an absolute URL on www.si.com.

    Root-relative paths (``/nfl/...``) are prefixed with the site origin and
    protocol-relative references (``//host/path``) are given the ``https``
    scheme. Anything else is returned unchanged.
    """
    if href.startswith('//'):
        # Protocol-relative: the host is already present, so only the scheme
        # is missing. Prepending the site origin would yield a bogus URL.
        return 'https:' + href
    if href.startswith('/'):
        return 'https://www.si.com' + href
    return href
def classes(classes): def classes(classes):
q = frozenset(classes.split(' ')) q = frozenset(classes.split(' '))
return dict(attrs={ return dict(attrs={
@ -25,11 +31,11 @@ class SI(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
keep_only_tags = [ keep_only_tags = [
classes('headline article-content'), classes('m-detail-header m-detail--body'),
] ]
remove_tags = [ remove_tags = [
classes('media-video OUTBRAIN'), classes('media-video OUTBRAIN m-in-content-ad-row'),
dict(name='meta'), dict(name=['button', 'meta', 'source']),
] ]
def get_browser(self, *args, **kwargs): def get_browser(self, *args, **kwargs):
@ -42,20 +48,24 @@ class SI(BasicNewsRecipe):
tag.name = 'img' tag.name = 'img'
del tag.contents[:] del tag.contents[:]
tag['src'] = tag['data-src'] tag['src'] = tag['data-src']
tag['height'] = tag['width'] = ''
return soup return soup
def parse_index(self):
    """Build the article index by scraping https://www.si.com/.

    Every ``phoenix-super-link`` element on the page is treated as one
    article. Articles are grouped by the text of the descendant element
    carrying ``phx-track-id="Label"``; links without such a label are
    filed under ``'Features'``.

    Returns a list of ``(category, articles)`` tuples ordered
    case-insensitively by category name, where each article is a dict
    with ``'title'`` and ``'url'`` keys.
    """
    soup = self.index_to_soup('https://www.si.com/')
    cats = {}
    for ps in soup.findAll('phoenix-super-link'):
        href = ps.get('href')
        if not href:
            # Nothing to download without a target; indexing ps['href']
            # here would raise KeyError and abort the whole index.
            continue
        title = self.tag_to_string(ps.find('h2'))
        label = ps.find(attrs={'phx-track-id': 'Label'})
        category = self.tag_to_string(label) if label is not None else 'Features'
        cats.setdefault(category, []).append({'title': title, 'url': absolutize(href)})
        self.log('Found article', title)
    return [(key, cats[key]) for key in sorted(cats, key=lambda x: x.lower())]