Update Globe and Mail. Fix #405 (New news feed)

2025-07-09 03:04:10 -04:00 · 2010-10-14 17:39:00 -06:00 · 2010-10-14 17:39:00 -06:00 · d5462c8d00
commit d5462c8d00
parent 7e6c93504c
2 changed files with 28 additions and 42 deletions
--- a/resources/recipes/globe_and_mail.recipe
+++ b/resources/recipes/globe_and_mail.recipe
@ -26,31 +26,12 @@ class GlobeAndMail(BasicNewsRecipe):
    #credit {margin-top:0px;}
    .tag {font-size: 22pt;}'''
    description = 'Canada\'s national newspaper'
-    remove_tags_before = dict(id="article-top")
+    keep_only_tags = [dict(name='article')]
-    remove_tags = [
+    remove_tags = [dict(name='aside'),
-		{'id':['util', 'article-tabs', 'comments', 'article-relations',
+                   dict(name='footer'),
-		'gallery-controls', 'video', 'galleryLoading','deck','header',
+                   dict(name='div', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articlecommentcountholder' in x.split(' '))}),
-        'toolsBottom'] },
+                   dict(name='ul', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articletoolbar' in x.split(' '))}),
-		{'class':['credit','inline-img-caption','tab-pointer'] },
+                  ]
 		dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}),
 		dict(name='div', attrs={'class':'right'}),
 		dict(name='div', attrs={'id':'footer'}),
 		dict(name='div', attrs={'id':'beta-msg'}),
 		dict(name='img', attrs={'class':'headshot'}),
 		dict(name='div', attrs={'class':'brand'}),
 		dict(name='div', attrs={'id':'nav-wrap'}),
 		dict(name='div', attrs={'id':'featureTopics'}),
 		dict(name='div', attrs={'id':'videoNav'}),
 		dict(name='div', attrs={'id':'blog-header'}),
 		dict(name='div', attrs={'id':'right-rail'}),
 		dict(name='div', attrs={'id':'group-footer-container'}),
 		dict(name=['iframe', 'style'])
 		]
    remove_attributes = ['style']
    remove_tags_after = [{'id':['article-content']},
 		{'class':['pull','inline-img'] },
 		dict(name='img', attrs={'class':'inline-media-embed'}),
 		]
    feeds = [
            (u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'),
            (u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
--- a/resources/recipes/volksrant.recipe
+++ b/resources/recipes/volksrant.recipe
@ -11,6 +11,7 @@ __docformat__ = 'restructuredtext en'
 on 10/10/10 to include function to grab print version of articles
 '''
 from datetime import date
 from calibre.web.feeds.news import BasicNewsRecipe
 '''
 added by Tony Stegall
@ -27,7 +28,6 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
    no_stylesheets = True
    language = 'nl'
    extra_css      = '''
                        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
                        h1{font-size:large;}
@ -43,14 +43,16 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        print 'THE CURRENT URL IS: ', url
        br.open(url)
        year = date.today().year
        try:
-         response = br.follow_link(url_regex='.*?(2010)(\\/)(article)(\\/)(print)(\\/)', nr = 0)
+          response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
-         html = response.read()
+          html = response.read()
        except:
-         response = br.open(url)
+          response = br.open(url)
-         html = response.read()
+          html = response.read()
        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
@ -59,19 +61,22 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
   ###############################################################################################################
-    feeds          = [
+    '''
-                      (u'Laatste Nieuws', u'http://volkskrant.nl/rss/laatstenieuws.rss'),
+      Change Log:
-                      (u'Binnenlands nieuws', u'http://volkskrant.nl/rss/nederland.rss'),
+       Date: 10/15/2010
-                      (u'Buitenlands nieuws', u'http://volkskrant.nl/rss/internationaal.rss'),
+       Feeds updated by Martin Tarenskeen
-                      (u'Economisch nieuws', u'http://volkskrant.nl/rss/economie.rss'),
+    '''
-                      (u'Sportnieuws', u'http://volkskrant.nl/rss/sport.rss'),
+
-                      (u'Kunstnieuws', u'http://volkskrant.nl/rss/kunst.rss'),
+    feeds          = [
                      (u'Laatste Nieuws', u'http://www.volkskrant.nl/rss/laatstenieuws.rss'),
                      (u'Binnenland', u'http://www.volkskrant.nl/rss/nederland.rss'),
                      (u'Buitenland', u'http://www.volkskrant.nl/rss/internationaal.rss'),
                      (u'Economie', u'http://www.volkskrant.nl/rss/economie.rss'),
                      (u'Sport', u'http://www.volkskrant.nl/rss/sport.rss'),
                      (u'Cultuur', u'http://www.volkskrant.nl/rss/kunst.rss'),
                      (u'Gezondheid & Wetenschap', u'http://www.volkskrant.nl/rss/wetenschap.rss'),
                      (u'Internet & Media', u'http://www.volkskrant.nl/rss/media.rss') ]
                        #both of these rss feeds link back to the main volkskrant.nl url, i.e. they are broken
                        #If someone knows the correct paths, they can put them in here
                      #(u'Wetenschapsnieuws', u'http://feeds.feedburner.com/DeVolkskrantWetenschap'),
                      #(u'Technologienieuws', u'http://feeds.feedburner.com/vkmedia')
                      ]
 '''
 example for formatting