From cd648cad291e07be189ce49b6decab796eec3b0b Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 2 Jun 2009 12:01:08 -0700
Subject: [PATCH] Fix Globe and Mail recipe for updated site

---
 .../feeds/recipes/recipe_globe_and_mail.py | 65 ++++++++-----------
 1 file changed, 28 insertions(+), 37 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py b/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py
index c50405096a..6214fa0578 100644
--- a/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py
+++ b/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py
@@ -8,46 +8,37 @@ globeandmail.com
 '''
 
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
 class GlobeAndMail(BasicNewsRecipe):
-    
+
     title = 'Globe and Mail'
     __author__ = 'Kovid Goyal'
     language = _('English')
+    oldest_article = 2.0
+    no_stylesheets = True
     description = 'Canada\'s national newspaper'
-    keep_only_tags = [dict(id='content')]
-    remove_tags = [dict(attrs={'class':'nav'}), dict(id=['related', 'TPphoto', 'secondaryNav', 'articleBottomToolsHolder'])]
-
-    def parse_index(self):
-        src = self.browser.open('http://www.theglobeandmail.com/frontpage/').read()
-        soup = BeautifulSoup(src)
-
-        feeds = []
-        articles = []
-        feed = 'Front Page'
-        for tag in soup.findAll(['h3', 'h4']):
-            if tag.name == 'h3':
-                a = tag.find('a', href=True)
-                if a is not None:
-                    href = 'http://www.theglobeandmail.com' + a['href'].strip()
-                    text = a.find(text=True)
-                    if text:
-                        text = text.strip()
-                        desc = ''
-                        summary = tag.findNextSiblings('p', attrs={'class':'summary'}, limit=1)
-                        if summary:
-                            desc = self.tag_to_string(summary[0], False)
-                        articles.append({
-                            'title': text,
-                            'url' : href,
-                            'desc' : desc,
-                            'date' : '',
-                        })
-            elif tag.name == 'h4':
-                if articles:
-                    feeds.append((feed, articles))
-                    articles = []
-                feed = self.tag_to_string(tag, False)
-
-        return feeds
\ No newline at end of file
+    remove_tags_before = dict(id="article-top")
+    remove_tags = [
+        {'id':['util', 'article-tabs', 'comments', 'article-relations',
+            'gallery-controls', 'video', 'galleryLoading']},
+    ]
+    remove_tags_after = dict(id='article-content')
+
+    feeds = [
+        ('Latest headlines', 'http://www.theglobeandmail.com/?service=rss'),
+        ('Top stories', 'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
+        ('National', 'http://www.theglobeandmail.com/news/national/?service=rss'),
+        ('Politics', 'http://www.theglobeandmail.com/news/politics/?service=rss'),
+        ('World', 'http://www.theglobeandmail.com/news/world/?service=rss'),
+        ('Business', 'http://www.theglobeandmail.com/report-on-business/?service=rss'),
+        ('Opinions', 'http://www.theglobeandmail.com/news/opinions/?service=rss'),
+        ('Columnists', 'http://www.theglobeandmail.com/news/opinions/columnists/?service=rss'),
+        ('Globe Investor', 'http://www.theglobeandmail.com/globe-investor/?service=rss'),
+        ('Sports', 'http://www.theglobeandmail.com/sports/?service=rss'),
+        ('Technology', 'http://www.theglobeandmail.com/news/technology/?service=rss'),
+        ('Arts', 'http://www.theglobeandmail.com/news/arts/?service=rss'),
+        ('Life', 'http://www.theglobeandmail.com/life/?service=rss'),
+        ('Blogs', 'http://www.theglobeandmail.com/blogs/?service=rss'),
+        ('Real Estate', 'http://www.theglobeandmail.com/real-estate/?service=rss'),
+        ('Auto', 'http://www.theglobeandmail.com/auto/?service=rss'),
+    ]