mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
80 lines
2.5 KiB
Plaintext
80 lines
2.5 KiB
Plaintext
import re
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
class GrantLand(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping the Grantland index pages."""

    # --- publication metadata ---
    title = u"Grantland"
    description = 'Writings on Sports & Pop Culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'

    # --- download / formatting options ---
    max_articles_per_feed = 100
    no_stylesheets = True
    # auto_cleanup is left off: it is sometimes too aggressive and produces
    # blank articles
    auto_cleanup = False
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 90

    # --- artwork ---
    cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
    masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

    # Site root; each category page is addressed as "<INDEX>/<url suffix>".
    INDEX = 'http://www.grantland.com'

    # One entry per generated feed:
    #   (user friendly name, url suffix, max number of articles to load)
    # Comment out the second entry if you don't want older articles.
    CATEGORIES = [
        ('Today in Grantland', '', 20),
        ('In Case You Missed It', 'incaseyoumissedit', 35),
    ]

    # Elements stripped from every downloaded article.
    remove_tags = [
        {'name': ['style', 'aside', 'nav', 'footer', 'script']},
        {'name': 'h1', 'text': 'Grantland'},
        {'id': ['header', 'col-right']},
        {'class': ['connect_widget']},
        {'name': 'section', 'class': re.compile(r'\b(ad|module)\b')},
    ]

    preprocess_regexps = [
        # remove blog banners: any <a href="/blog/...">...</a> element is
        # replaced with the empty string
        (
            re.compile(
                r'<a href="/blog/(?:(?!</a>).)+</a>',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda m: '',
        ),
    ]

    def parse_index(self):
        """Scrape every page listed in ``CATEGORIES`` for article links.

        Returns a list of ``(category name, articles)`` tuples, where
        ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        A URL is listed at most once across all categories, and a category
        that yields no articles is left out of the result.
        """
        # Links to articles contain "story/_/id/<digits>" or
        # "post/_/id/<digits>" in their href.
        article_href = re.compile(r'(story|post)/_/id/\d+')
        seen_urls = set()
        feeds = []

        for cat_name, url_suffix, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            found = []

            soup = self.index_to_soup("%s/%s" % (self.INDEX, url_suffix))

            # Search only the main column when the page has one; otherwise
            # fall back to the whole document.
            main_column = soup.find('div', id='col-main')
            container = soup if main_column is None else main_column

            for anchor in container.findAll('a', href=article_href):
                url = anchor['href']
                # Skip links already collected, and links with a blank
                # title (probably <a href=".."><img /></a>).
                if url in seen_urls or not anchor.string:
                    continue
                title = anchor.string

                self.log('\tFound article:', title)
                self.log('\t', url)
                found.append({'title': title, 'url': url})
                seen_urls.add(url)

                # Stop scanning this page once the category quota is met.
                if len(found) >= max_articles:
                    break

            if found:
                feeds.append((cat_name, found))

        return feeds
|