From f998c8bf4bbd170d7376253ae97d0ac9179a7bda Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 22 Jan 2012 10:08:26 +0530 Subject: [PATCH] Update grantland.com --- recipes/grantland.recipe | 148 ++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 89 deletions(-) diff --git a/recipes/grantland.recipe b/recipes/grantland.recipe index e169f87f25..2cee9b2077 100644 --- a/recipes/grantland.recipe +++ b/recipes/grantland.recipe @@ -2,105 +2,75 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class GrantLand(BasicNewsRecipe): - title = u"Grantland" - description = 'Writings on Sports & Pop Culture' - language = 'en' - __author__ = 'barty on mobileread.com forum' - max_articles_per_feed = 100 - no_stylesheets = False - # auto_cleanup is too aggressive sometimes and we end up with blank articles - auto_cleanup = False - timefmt = ' [%a, %d %b %Y]' - oldest_article = 365 + title = u"Grantland" + description = 'Writings on Sports & Pop Culture' + language = 'en' + __author__ = 'barty on mobileread.com forum' + max_articles_per_feed = 100 + no_stylesheets = True + # auto_cleanup is too aggressive sometimes and we end up with blank articles + auto_cleanup = False + timefmt = ' [%a, %d %b %Y]' + oldest_article = 90 - cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg' - masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg' + cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg' + masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg' - INDEX = 'http://www.grantland.com' - CATEGORIES = [ - # comment out categories you don't want - # (user friendly name, url suffix, max number of articles to load) - ('Today in Grantland','',20), - ('In Case You Missed It','incaseyoumissedit',35), - ] + INDEX = 'http://www.grantland.com' + CATEGORIES = [ + # comment out second line if you don't want older articles + # (user friendly name, url suffix, max number of articles to load) + ('Today in Grantland','',20), + ('In Case You Missed It','incaseyoumissedit',35), + ] - remove_tags = [ - {'name':['head','style','script']}, - {'id':['header']}, - {'class':re.compile(r'\bside|\bad\b|floatright|tags')} - ] - remove_tags_before = {'class':'wrapper'} - remove_tags_after = [{'id':'content'}] + remove_tags = [ + {'name':['style','aside','nav','footer','script']}, + {'name':'h1','text':'Grantland'}, + {'id':['header','col-right']}, + {'class':['connect_widget']}, + {'name':'section','class':re.compile(r'\b(ad|module)\b')}, + ] - preprocess_regexps = [ - #
tags with an img inside are just blog banners, don't need them - # note: there are other useful
tags so we don't want to just strip all of them - (re.compile(r'
.+?
', re.DOTALL|re.IGNORECASE),lambda m: ''), - # delete everything between the *last*
and - (re.compile(r'
', re.DOTALL|re.IGNORECASE),lambda m: '
'), - ] - extra_css = """cite, time { font-size: 0.8em !important; margin-right: 1em !important; } - img + cite { display:block; text-align:right}""" + preprocess_regexps = [ + # remove blog banners + (re.compile(r'. skip + if not title: + continue + self.log('\tFound article:', title) + self.log('\t', url) + articles.append({'title':title,'url':url}) + seen_urls.add(url) - if len(articles) >= max_articles: - break + if len(articles) >= max_articles: + break - if articles: - feeds.append((cat_name, articles)) + if articles: + feeds.append((cat_name, articles)) - return feeds - - def print_version(self, url): - return url+'?view=print' + return feeds