import re

from calibre.web.feeds.news import BasicNewsRecipe


class GrantLand(BasicNewsRecipe):
    """Calibre news recipe that collects articles from grantland.com.

    One feed is produced per entry in ``CATEGORIES``; the articles of each
    feed are discovered by scanning the category's index page for links
    (see ``parse_index``).
    """

    title = u"Grantland"
    description = 'Writings on Sports & Pop Culture'
    language = 'en'
    __author__ = 'barty on mobileread.com forum'
    max_articles_per_feed = 100
    no_stylesheets = True
    # auto_cleanup is too aggressive sometimes and we end up with blank
    # articles
    auto_cleanup = False
    timefmt = ' [%a, %d %b %Y]'
    oldest_article = 90

    cover_url = 'http://cdn0.sbnation.com/imported_assets/740965/blog_grantland_grid_3.jpg'
    masthead_url = 'http://a1.espncdn.com/prod/assets/grantland/grantland-logo.jpg'

    INDEX = 'http://www.grantland.com'
    CATEGORIES = [
        # comment out second line if you don't want older articles
        # (user friendly name, url suffix, max number of articles to load)
        ('Today in Grantland', '', 20),
        ('In Case You Missed It', 'incaseyoumissedit', 35),
    ]

    remove_tags = [
        {'name': ['style', 'aside', 'nav', 'footer', 'script']},
        {'name': 'h1', 'text': 'Grantland'},
        {'id': ['header', 'col-right']},
        {'class': ['connect_widget']},
        {'name': 'section', 'class': re.compile(r'\b(ad|module)\b')},
    ]

    preprocess_regexps = [
        # remove blog banners
        # NOTE(review): the pattern below was lost from the damaged source
        # (only "re.compile(r'" survived) and has been reconstructed --
        # verify it against the upstream recipe before relying on it.
        (
            re.compile(r'<a href="/blog/(?:(?!</a>).)+</a>',
                       re.DOTALL | re.IGNORECASE),
            lambda m: '',
        ),
    ]

    def parse_index(self):
        """Build the list of feeds, one per entry in ``CATEGORIES``.

        Returns:
            A list of ``(category name, articles)`` tuples, where
            ``articles`` is a list of dicts with ``'title'`` and ``'url'``
            keys. A category that yields no articles is left out.
        """
        feeds = []
        # Shared by all categories, so an article linked from several
        # category pages is only listed under the first one that has it.
        seen_urls = set()

        # NOTE(review): from here down to the "end of reconstructed region"
        # marker the original text was lost from the damaged source. The
        # loop structure follows from the names the surviving tail uses
        # (cat_name, max_articles, articles, url, title, seen_urls); the
        # 'col-left' container and the article-link pattern are a best
        # reconstruction and must be confirmed against the upstream recipe.
        article_href = re.compile(r'(story|post)/_/id/\d+')

        for cat_name, url_suffix, max_articles in self.CATEGORIES:
            self.log('Reading category:', cat_name)
            articles = []

            soup = self.index_to_soup('%s/%s' % (self.INDEX, url_suffix))

            # Prefer the main column; fall back to the whole page.
            main = soup.find('div', id='col-left')
            if main is None:
                main = soup

            for link in main.findAll('a', href=article_href):
                url = link['href']
                if url in seen_urls:
                    continue
                title = link.string
                # NOTE(review): end of reconstructed region.

                # blank title probably means <a href=".."><img /></a>. skip
                if not title:
                    continue
                self.log('\tFound article:', title)
                self.log('\t', url)
                articles.append({'title': title, 'url': url})
                seen_urls.add(url)

                # Honour the per-category article limit from CATEGORIES.
                if len(articles) >= max_articles:
                    break

            if articles:
                feeds.append((cat_name, articles))

        return feeds