Fix TSN and St. Louis Post DIspatch

This commit is contained in:
Kovid Goyal 2013-01-22 09:45:10 +05:30
parent d52a659da2
commit e0d0eb1973
2 changed files with 18 additions and 27 deletions

View File

@ -7,12 +7,16 @@ class AdvancedUserRecipe1282093204(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 15
use_embedded_content = False
no_stylesheets = True
auto_cleanup = True
masthead_url = 'http://farm5.static.flickr.com/4118/4929686950_0e22e2c88a.jpg'
feeds = [
(u'News-Bill McClellan', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fcolumns%2Fbill-mclellan&f=rss&t=article'),
(u'News-Columns', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcolumns*&l=50&f=rss&t=article'),
(u'News-Crime & Courtshttp://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcrime-and-courts&l=50&f=rss&t=article'),
(u'News-Crime & Courts', 'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2Fcrime-and-courts&l=50&f=rss&t=article'),
(u'News-Deb Peterson', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fcolumns%2Fdeb-peterson&f=rss&t=article'),
(u'News-Education', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2feducation&f=rss&t=article'),
(u'News-Government & Politics', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=news%2Flocal%2fgovt-and-politics&f=rss&t=article'),
@ -62,9 +66,9 @@ class AdvancedUserRecipe1282093204(BasicNewsRecipe):
(u'Entertainment-House-O-Fun', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=entertainment%2Fhouse-o-fun&l=100&f=rss&t=article'),
(u'Entertainment-Kevin C. Johnson', u'http://www2.stltoday.com/search/?q=&d1=&d2=&s=start_time&sd=desc&c=entertainment%2Fmusic%2Fkevin-johnson&l=100&f=rss&t=article')
]
remove_empty_feeds = True
remove_tags = [dict(name='div', attrs={'id':'blox-logo'}),dict(name='a')]
keep_only_tags = [dict(name='h1'), dict(name='p', attrs={'class':'byline'}), dict(name="div", attrs={'id':'blox-story-text'})]
#remove_empty_feeds = True
#remove_tags = [dict(name='div', attrs={'id':'blox-logo'}),dict(name='a')]
#keep_only_tags = [dict(name='h1'), dict(name='p', attrs={'class':'byline'}), dict(name="div", attrs={'id':'blox-story-text'})]
extra_css = 'p {text-align: left;}'

View File

@ -7,28 +7,15 @@ class AdvancedUserRecipe1289990851(BasicNewsRecipe):
language = 'en_CA'
__author__ = 'Nexus'
no_stylesheets = True
auto_cleanup = True
use_embedded_content = False
INDEX = 'http://tsn.ca/nhl/story/?id=nhl'
keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}),
dict(name='div', attrs={'id':['tsnStory']})]
remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}),
dict(name='div', attrs={'class':'textSize'})]
def parse_index(self):
feeds = []
soup = self.index_to_soup(self.INDEX)
feed_parts = soup.findAll('div', attrs={'class': 'feature'})
for feed_part in feed_parts:
articles = []
if not feed_part.h2:
continue
feed_title = feed_part.h2.string
article_parts = feed_part.findAll('a')
for article_part in article_parts:
article_title = article_part.string
article_date = ''
article_url = 'http://tsn.ca/' + article_part['href']
articles.append({'title': article_title, 'url': article_url, 'description':'', 'date':article_date})
if articles:
feeds.append((feed_title, articles))
return feeds
#keep_only_tags = [dict(name='div', attrs={'id':['tsnColWrap']}),
#dict(name='div', attrs={'id':['tsnStory']})]
#remove_tags = [dict(name='div', attrs={'id':'tsnRelated'}),
#dict(name='div', attrs={'class':'textSize'})]
feeds = [
('News',
'http://www.tsn.ca/datafiles/rss/Stories.xml'),
]