Improved Globe and Mail

This commit is contained in:
Kovid Goyal 2010-11-15 11:13:27 -07:00
parent e9bebad70d
commit e4d29be176

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2010, Szing'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
''' '''
@ -10,49 +10,52 @@ globeandmail.com
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class GlobeAndMail(BasicNewsRecipe): class AdvancedUserRecipe1287083651(BasicNewsRecipe):
title = u'Globe and Mail' title = u'Globe & Mail'
language = 'en_CA' __license__ = 'GPL v3'
__author__ = 'Szing'
__author__ = 'Kovid Goyal'
oldest_article = 2 oldest_article = 2
max_articles_per_feed = 10
no_stylesheets = True no_stylesheets = True
extra_css = ''' max_articles_per_feed = 100
h3 {font-size: 22pt; font-weight:bold; margin:0px; padding:0px 0px 8pt 0px;} encoding = 'utf8'
h4 {margin-top: 0px;} publisher = 'Globe & Mail'
#byline { font-family: monospace; font-weight:bold; } language = 'en_CA'
#placeline {font-weight:bold;} extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'
#credit {margin-top:0px;}
.tag {font-size: 22pt;}'''
description = 'Canada\'s national newspaper'
keep_only_tags = [dict(name='article')]
remove_tags = [dict(name='aside'),
dict(name='footer'),
dict(name='div', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articlecommentcountholder' in x.split(' '))}),
dict(name='ul', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articletoolbar' in x.split(' '))}),
]
feeds = [ feeds = [
(u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'), (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
(u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'),
(u'National', u'http://www.theglobeandmail.com/news/national/?service=rss'),
(u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
(u'World', u'http://www.theglobeandmail.com/news/world/?service=rss'),
(u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'), (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
(u'Opinions', u'http://www.theglobeandmail.com/news/opinions/?service=rss'), (u'Commentary', u'http://www.theglobeandmail.com/report-on-business/commentary/?service=rss'),
(u'Columnists', u'http://www.theglobeandmail.com/news/opinions/columnists/?service=rss'), (u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
(u'Globe Investor', u'http://www.theglobeandmail.com/globe-investor/?service=rss'), (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
(u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
(u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'), (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
(u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
(u'Top Political Stories', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
(u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'), (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
(u'Life', u'http://www.theglobeandmail.com/life/?service=rss'), (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
(u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
(u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'), (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
(u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss') (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
(u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss')
] ]
def get_article_url(self, article): keep_only_tags = [
url = BasicNewsRecipe.get_article_url(self, article) dict(name='h1'),
if '/video/' not in url: dict(name='h2', attrs={'id':'articletitle'}),
return url dict(name='p', attrs={'class':['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
dict(name='div', attrs={'class':['news','articlemeta','articlecopy']}),
dict(name='id', attrs={'class':'article'}),
dict(name='table', attrs={'class':'todays-market'}),
dict(name='header', attrs={'id':'leadheader'})
]
remove_tags = [
dict(name='div', attrs={'id':['tabInside', 'ShareArticles', 'topStories']})
]
#this has to be here or the text in the article appears twice.
remove_tags_after = [dict(id='article')]
#Use the mobile version rather than the web version
def print_version(self, url):
    """Return the URL of the mobile version of an article.

    The mobile version is requested by adding the ``service=mobile``
    query parameter to *url*.

    :param url: Article URL, with or without an existing query string.
    :return: *url* with ``service=mobile`` appended as a query parameter.
    """
    # Use '&' only when a query string is already present; otherwise the
    # parameter must start the query with '?', or the result is malformed.
    separator = '&' if '?' in url else '?'
    return url + separator + 'service=mobile'