mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update grantland.com
This commit is contained in:
parent
5dd6c7d9d1
commit
5416312344
@ -5,7 +5,7 @@ class GrantLand(BasicNewsRecipe):
|
|||||||
title = u"Grantland"
|
title = u"Grantland"
|
||||||
description = 'Writings on Sports & Pop Culture'
|
description = 'Writings on Sports & Pop Culture'
|
||||||
language = 'en'
|
language = 'en'
|
||||||
__author__ = 'Barty'
|
__author__ = 'barty on mobileread.com forum'
|
||||||
max_articles_per_feed = 100
|
max_articles_per_feed = 100
|
||||||
no_stylesheets = False
|
no_stylesheets = False
|
||||||
# auto_cleanup is too aggressive sometimes and we end up with blank articles
|
# auto_cleanup is too aggressive sometimes and we end up with blank articles
|
||||||
@ -57,33 +57,43 @@ class GrantLand(BasicNewsRecipe):
|
|||||||
headers = soup.findAll('h2' if tag=='' else 'h3')
|
headers = soup.findAll('h2' if tag=='' else 'h3')
|
||||||
|
|
||||||
for header in headers:
|
for header in headers:
|
||||||
tag = header.find('a')
|
tag = header.find('a',href=True)
|
||||||
if tag is None or not hasattr(tag,'href'):
|
if tag is None:
|
||||||
continue
|
continue
|
||||||
url = tag['href']
|
url = tag['href']
|
||||||
if url.startswith('/'):
|
|
||||||
url = self.INDEX + url
|
|
||||||
if url in seen_urls:
|
if url in seen_urls:
|
||||||
continue
|
continue
|
||||||
seen_urls.add(url)
|
|
||||||
title = self.tag_to_string(tag)
|
title = self.tag_to_string(tag)
|
||||||
if 'Podcast:' in title or 'In Case You Missed It' in title:
|
if 'Podcast:' in title or 'In Case You Missed It' in title:
|
||||||
continue
|
continue
|
||||||
desc = dt = ''
|
desc = dt = ''
|
||||||
par = header.parent
|
# get at the div that contains description and other info
|
||||||
#tag = par.find('cite')
|
div = header.parent.find('div')
|
||||||
#if tag is not None:
|
if div is not None:
|
||||||
# desc = '['+self.tag_to_string(tag) + '] '
|
desc = self.tag_to_string(div)
|
||||||
tag = par.find('div')
|
dt = div.find('time')
|
||||||
if tag is not None:
|
if dt is not None:
|
||||||
desc = desc + self.tag_to_string(tag)
|
dt = self.tag_to_string( dt)
|
||||||
tag = tag.find('time')
|
|
||||||
if tag is not None:
|
|
||||||
dt = self.tag_to_string( tag)
|
|
||||||
|
|
||||||
|
# if div contains the same url that is in h2/h3
|
||||||
|
# that means this is a series split into multiple articles
|
||||||
|
if div.find('a',href=url):
|
||||||
|
self.log('\tFound series:', title)
|
||||||
|
# grab all articles in series
|
||||||
|
for tag in div.findAll('a',href=True):
|
||||||
|
url = tag['href']
|
||||||
|
if url in seen_urls:
|
||||||
|
continue
|
||||||
|
self.log('\t', url)
|
||||||
|
seen_urls.add(url)
|
||||||
|
articles.append({'title':title+' - '+self.tag_to_string( tag),
|
||||||
|
'url':url,'description':desc,'date':dt})
|
||||||
|
else:
|
||||||
self.log('\tFound article:', title)
|
self.log('\tFound article:', title)
|
||||||
self.log('\t', url)
|
self.log('\t', url)
|
||||||
|
seen_urls.add(url)
|
||||||
articles.append({'title':title,'url':url,'description':desc,'date':dt})
|
articles.append({'title':title,'url':url,'description':desc,'date':dt})
|
||||||
|
|
||||||
if len(articles) >= max_articles:
|
if len(articles) >= max_articles:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user