Fix Globe and Mail recipe for updated site

This commit is contained in:
Kovid Goyal 2009-06-02 12:01:08 -07:00
parent e6728649be
commit cd648cad29

View File

@ -8,46 +8,37 @@ globeandmail.com
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class GlobeAndMail(BasicNewsRecipe): class GlobeAndMail(BasicNewsRecipe):
title = 'Globe and Mail' title = 'Globe and Mail'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal'
language = _('English') language = _('English')
oldest_article = 2.0
no_stylesheets = True
description = 'Canada\'s national newspaper' description = 'Canada\'s national newspaper'
keep_only_tags = [dict(id='content')] remove_tags_before = dict(id="article-top")
remove_tags = [dict(attrs={'class':'nav'}), dict(id=['related', 'TPphoto', 'secondaryNav', 'articleBottomToolsHolder'])] remove_tags = [
{'id':['util', 'article-tabs', 'comments', 'article-relations',
def parse_index(self): 'gallery-controls', 'video', 'galleryLoading']},
src = self.browser.open('http://www.theglobeandmail.com/frontpage/').read() ]
soup = BeautifulSoup(src) remove_tags_after = dict(id='article-content')
feeds = [] feeds = [
articles = [] ('Latest headlines', 'http://www.theglobeandmail.com/?service=rss'),
feed = 'Front Page' ('Top stories', 'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
for tag in soup.findAll(['h3', 'h4']): ('National', 'http://www.theglobeandmail.com/news/national/?service=rss'),
if tag.name == 'h3': ('Politics', 'http://www.theglobeandmail.com/news/politics/?service=rss'),
a = tag.find('a', href=True) ('World', 'http://www.theglobeandmail.com/news/world/?service=rss'),
if a is not None: ('Business', 'http://www.theglobeandmail.com/report-on-business/?service=rss'),
href = 'http://www.theglobeandmail.com' + a['href'].strip() ('Opinions', 'http://www.theglobeandmail.com/news/opinions/?service=rss'),
text = a.find(text=True) ('Columnists', 'http://www.theglobeandmail.com/news/opinions/columnists/?service=rss'),
if text: ('Globe Investor', 'http://www.theglobeandmail.com/globe-investor/?service=rss'),
text = text.strip() ('Sports', 'http://www.theglobeandmail.com/sports/?service=rss'),
desc = '' ('Technology', 'http://www.theglobeandmail.com/news/technology/?service=rss'),
summary = tag.findNextSiblings('p', attrs={'class':'summary'}, limit=1) ('Arts', 'http://www.theglobeandmail.com/news/arts/?service=rss'),
if summary: ('Life', 'http://www.theglobeandmail.com/life/?service=rss'),
desc = self.tag_to_string(summary[0], False) ('Blogs', 'http://www.theglobeandmail.com/blogs/?service=rss'),
articles.append({ ('Real Estate', 'http://www.theglobeandmail.com/real-estate/?service=rss'),
'title': text, ('Auto', 'http://www.theglobeandmail.com/auto/?service=rss'),
'url' : href, ]
'desc' : desc,
'date' : '',
})
elif tag.name == 'h4':
if articles:
feeds.append((feed, articles))
articles = []
feed = self.tag_to_string(tag, False)
return feeds