Update Globe and Mail. Fix #405 (New news feed)

This commit is contained in:
Kovid Goyal 2010-10-14 17:39:00 -06:00
parent 7e6c93504c
commit d5462c8d00
2 changed files with 28 additions and 42 deletions

View File

@ -26,31 +26,12 @@ class GlobeAndMail(BasicNewsRecipe):
#credit {margin-top:0px;} #credit {margin-top:0px;}
.tag {font-size: 22pt;}''' .tag {font-size: 22pt;}'''
description = 'Canada\'s national newspaper' description = 'Canada\'s national newspaper'
remove_tags_before = dict(id="article-top") keep_only_tags = [dict(name='article')]
remove_tags = [ remove_tags = [dict(name='aside'),
{'id':['util', 'article-tabs', 'comments', 'article-relations', dict(name='footer'),
'gallery-controls', 'video', 'galleryLoading','deck','header', dict(name='div', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articlecommentcountholder' in x.split(' '))}),
'toolsBottom'] }, dict(name='ul', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articletoolbar' in x.split(' '))}),
{'class':['credit','inline-img-caption','tab-pointer'] }, ]
dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}),
dict(name='div', attrs={'class':'right'}),
dict(name='div', attrs={'id':'footer'}),
dict(name='div', attrs={'id':'beta-msg'}),
dict(name='img', attrs={'class':'headshot'}),
dict(name='div', attrs={'class':'brand'}),
dict(name='div', attrs={'id':'nav-wrap'}),
dict(name='div', attrs={'id':'featureTopics'}),
dict(name='div', attrs={'id':'videoNav'}),
dict(name='div', attrs={'id':'blog-header'}),
dict(name='div', attrs={'id':'right-rail'}),
dict(name='div', attrs={'id':'group-footer-container'}),
dict(name=['iframe', 'style'])
]
remove_attributes = ['style']
remove_tags_after = [{'id':['article-content']},
{'class':['pull','inline-img'] },
dict(name='img', attrs={'class':'inline-media-embed'}),
]
feeds = [ feeds = [
(u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'), (u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'),
(u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'), (u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'),

View File

@ -11,6 +11,7 @@ __docformat__ = 'restructuredtext en'
on 10/10/10 to include function to grab print version of articles on 10/10/10 to include function to grab print version of articles
''' '''
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
''' '''
added by Tony Stegall added by Tony Stegall
@ -27,7 +28,6 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
no_stylesheets = True no_stylesheets = True
language = 'nl' language = 'nl'
extra_css = ''' extra_css = '''
body{font-family:Arial,Helvetica,sans-serif; font-size:small;} body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
h1{font-size:large;} h1{font-size:large;}
@ -43,14 +43,16 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
br = self.get_browser() br = self.get_browser()
print 'THE CURRENT URL IS: ', url
br.open(url) br.open(url)
year = date.today().year
try: try:
response = br.follow_link(url_regex='.*?(2010)(\\/)(article)(\\/)(print)(\\/)', nr = 0) response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0)
html = response.read() html = response.read()
except: except:
response = br.open(url) response = br.open(url)
html = response.read() html = response.read()
self.temp_files.append(PersistentTemporaryFile('_fa.html')) self.temp_files.append(PersistentTemporaryFile('_fa.html'))
self.temp_files[-1].write(html) self.temp_files[-1].write(html)
@ -59,19 +61,22 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe):
############################################################################################################### ###############################################################################################################
feeds = [ '''
(u'Laatste Nieuws', u'http://volkskrant.nl/rss/laatstenieuws.rss'), Change Log:
(u'Binnenlands nieuws', u'http://volkskrant.nl/rss/nederland.rss'), Date: 10/15/2010
(u'Buitenlands nieuws', u'http://volkskrant.nl/rss/internationaal.rss'), Feeds updated by Martin Tarenskeen
(u'Economisch nieuws', u'http://volkskrant.nl/rss/economie.rss'), '''
(u'Sportnieuws', u'http://volkskrant.nl/rss/sport.rss'),
(u'Kunstnieuws', u'http://volkskrant.nl/rss/kunst.rss'), feeds = [
(u'Laatste Nieuws', u'http://www.volkskrant.nl/rss/laatstenieuws.rss'),
(u'Binnenland', u'http://www.volkskrant.nl/rss/nederland.rss'),
(u'Buitenland', u'http://www.volkskrant.nl/rss/internationaal.rss'),
(u'Economie', u'http://www.volkskrant.nl/rss/economie.rss'),
(u'Sport', u'http://www.volkskrant.nl/rss/sport.rss'),
(u'Cultuur', u'http://www.volkskrant.nl/rss/kunst.rss'),
(u'Gezondheid & Wetenschap', u'http://www.volkskrant.nl/rss/wetenschap.rss'),
(u'Internet & Media', u'http://www.volkskrant.nl/rss/media.rss') ]
#both of these rss feeds link back to the main volkskrant.nl url, i.e. they are broken
#If someone happens to know the correct paths then they can put them in here
#(u'Wetenschapsnieuws', u'http://feeds.feedburner.com/DeVolkskrantWetenschap'),
#(u'Technologienieuws', u'http://feeds.feedburner.com/vkmedia')
]
''' '''
example for formatting example for formatting