From 5416312344f82fdffac56a0fa15c9647f0b6ae9a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 13 Jan 2012 08:15:59 +0530 Subject: [PATCH] Update grantland.com --- recipes/grantland.recipe | 48 ++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/recipes/grantland.recipe b/recipes/grantland.recipe index 03e6deb238..e169f87f25 100644 --- a/recipes/grantland.recipe +++ b/recipes/grantland.recipe @@ -5,7 +5,7 @@ class GrantLand(BasicNewsRecipe): title = u"Grantland" description = 'Writings on Sports & Pop Culture' language = 'en' - __author__ = 'Barty' + __author__ = 'barty on mobileread.com forum' max_articles_per_feed = 100 no_stylesheets = False # auto_cleanup is too aggressive sometimes and we end up with blank articles @@ -57,33 +57,43 @@ class GrantLand(BasicNewsRecipe): headers = soup.findAll('h2' if tag=='' else 'h3') for header in headers: - tag = header.find('a') - if tag is None or not hasattr(tag,'href'): + tag = header.find('a',href=True) + if tag is None: continue url = tag['href'] - if url.startswith('/'): - url = self.INDEX + url if url in seen_urls: continue - seen_urls.add(url) title = self.tag_to_string(tag) if 'Podcast:' in title or 'In Case You Missed It' in title: continue desc = dt = '' - par = header.parent - #tag = par.find('cite') - #if tag is not None: - # desc = '['+self.tag_to_string(tag) + '] ' - tag = par.find('div') - if tag is not None: - desc = desc + self.tag_to_string(tag) - tag = tag.find('time') - if tag is not None: - dt = self.tag_to_string( tag) + # get at the div that contains description and other info + div = header.parent.find('div') + if div is not None: + desc = self.tag_to_string(div) + dt = div.find('time') + if dt is not None: + dt = self.tag_to_string( dt) + + # if div contains the same url that is in h2/h3 + # that means this is a series split into multiple articles + if div.find('a',href=url): + self.log('\tFound series:', title) + # grab all articles in series + for tag in div.findAll('a',href=True): + url = tag['href'] + if url in seen_urls: + continue + self.log('\t', url) + seen_urls.add(url) + articles.append({'title':title+' - '+self.tag_to_string( tag), + 'url':url,'description':desc,'date':dt}) + else: + self.log('\tFound article:', title) + self.log('\t', url) + seen_urls.add(url) + articles.append({'title':title,'url':url,'description':desc,'date':dt}) - self.log('\tFound article:', title) - self.log('\t', url) - articles.append({'title':title,'url':url,'description':desc,'date':dt}) if len(articles) >= max_articles: break