Fix Globe and Mail recipe for updated site

2025-07-09 03:04:10 -04:00 · 2009-06-02 12:01:08 -07:00 · 2009-06-02 12:01:08 -07:00 · cd648cad29
commit cd648cad29
parent e6728649be
1 changed file with 28 additions and 37 deletions
--- a/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py
+++ b/src/calibre/web/feeds/recipes/recipe_globe_and_mail.py
@@ -8,46 +8,37 @@ globeandmail.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 class GlobeAndMail(BasicNewsRecipe):
-    
+
    title = 'Globe and Mail'
    __author__ = 'Kovid Goyal'
    language = _('English')
    oldest_article = 2.0
    no_stylesheets = True
    description = 'Canada\'s national newspaper'
-    keep_only_tags = [dict(id='content')]
+    remove_tags_before = dict(id="article-top")
-    remove_tags    = [dict(attrs={'class':'nav'}), dict(id=['related', 'TPphoto', 'secondaryNav', 'articleBottomToolsHolder'])]
+    remove_tags = [
-    
+            {'id':['util', 'article-tabs', 'comments', 'article-relations',
-    def parse_index(self):
+            'gallery-controls', 'video', 'galleryLoading']},
-        src = self.browser.open('http://www.theglobeandmail.com/frontpage/').read()
+            ]
-        soup =  BeautifulSoup(src)
+    remove_tags_after = dict(id='article-content')
-        
+
-        feeds = []
+    feeds = [
-        articles = []
+            ('Latest headlines', 'http://www.theglobeandmail.com/?service=rss'),
-        feed = 'Front Page'
+            ('Top stories', 'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
-        for tag in soup.findAll(['h3', 'h4']):
+            ('National', 'http://www.theglobeandmail.com/news/national/?service=rss'),
-            if tag.name == 'h3':
+            ('Politics', 'http://www.theglobeandmail.com/news/politics/?service=rss'),
-                a = tag.find('a', href=True)
+            ('World', 'http://www.theglobeandmail.com/news/world/?service=rss'),
-                if a is not None:
+            ('Business', 'http://www.theglobeandmail.com/report-on-business/?service=rss'),
-                    href = 'http://www.theglobeandmail.com' + a['href'].strip()
+            ('Opinions', 'http://www.theglobeandmail.com/news/opinions/?service=rss'),
-                    text = a.find(text=True)
+            ('Columnists', 'http://www.theglobeandmail.com/news/opinions/columnists/?service=rss'),
-                    if text:
+            ('Globe Investor', 'http://www.theglobeandmail.com/globe-investor/?service=rss'),
-                        text = text.strip()
+            ('Sports', 'http://www.theglobeandmail.com/sports/?service=rss'),
-                        desc = ''
+            ('Technology', 'http://www.theglobeandmail.com/news/technology/?service=rss'),
-                        summary = tag.findNextSiblings('p', attrs={'class':'summary'}, limit=1)
+            ('Arts', 'http://www.theglobeandmail.com/news/arts/?service=rss'),
-                        if summary:
+            ('Life', 'http://www.theglobeandmail.com/life/?service=rss'),
-                            desc = self.tag_to_string(summary[0], False)
+            ('Blogs', 'http://www.theglobeandmail.com/blogs/?service=rss'),
-                        articles.append({
+            ('Real Estate', 'http://www.theglobeandmail.com/real-estate/?service=rss'),
-                                         'title': text,
+            ('Auto', 'http://www.theglobeandmail.com/auto/?service=rss'),
-                                         'url'  : href,
+            ]
-                                         'desc' : desc,
-                                         'date' : '', 
-                                         })
-            elif tag.name == 'h4':
-                if articles:
-                    feeds.append((feed, articles))
-                articles = []
-                feed = self.tag_to_string(tag, False)
-        return feeds