Add recipe for the Globe and Mail

This commit is contained in:
Kovid Goyal 2008-05-02 08:43:12 -07:00
parent d196a0066e
commit 395fcc0ae0
2 changed files with 53 additions and 1 deletions

View File

@ -7,7 +7,7 @@ Builtin recipes.
# Names of the builtin recipes.
# NOTE(review): 'wired' appears on two consecutive lines below - this looks
# like the removed and the added line of a diff shown together; confirm that
# only the line that also lists 'globe_and_mail' is meant to remain.
recipes = [
'newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio',
'nytimes', 'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
'wired',
'wired', 'globe_and_mail',
]
import re, imp, inspect, time

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
globeandmail.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class GlobeAndMail(BasicNewsRecipe):
    '''
    Recipe that builds its feeds by scraping the Globe and Mail front page.

    Every ``h4`` heading on the front page starts a new section (feed) and
    every ``h3`` heading that contains a link contributes one article to the
    section that is current at that point.
    '''

    title = 'Globe and Mail'
    __author__ = 'Kovid Goyal'
    description = 'Canada\'s national newspaper'
    # Tag specifications: the element with id "content" is the one to keep,
    # the navigation/related/photo/tool elements listed below are removed.
    keep_only_tags = [dict(id='content')]
    remove_tags = [dict(attrs={'class':'nav'}), dict(id=['related', 'TPphoto', 'secondaryNav', 'articleBottomToolsHolder'])]

    def parse_index(self):
        '''
        Scrape the front page and group its articles into sections.

        :return: A list of ``(section title, articles)`` tuples. Each article
                 is a dictionary with the keys ``title``, ``url``, ``desc``,
                 ``description`` and ``date`` (``date`` is always empty).
        '''
        site = 'http://www.theglobeandmail.com'
        src = self.browser.open(site + '/frontpage/').read()
        soup = BeautifulSoup(src)
        feeds = []
        articles = []
        # Articles found before the first h4 heading go into this section.
        feed = 'Front Page'
        for tag in soup.findAll(['h3', 'h4']):
            if tag.name == 'h4':
                # A new section starts: store the one collected so far.
                if articles:
                    feeds.append((feed, articles))
                articles = []
                feed = self.tag_to_string(tag, False)
                continue

            # Only h3 tags reach this point (findAll returns h3/h4 only).
            a = tag.find('a', href=True)
            if a is None:
                continue
            text = a.find(text=True)
            if not text:
                continue
            text = text.strip()

            href = a['href'].strip()
            # Only site-relative links need the host prepended; prefixing an
            # already absolute link would produce an unusable URL.
            if not (href.startswith('http://') or href.startswith('https://')):
                href = site + href

            desc = ''
            summary = tag.findNextSiblings('p', attrs={'class':'summary'}, limit=1)
            if summary:
                desc = self.tag_to_string(summary[0], False)
            articles.append({
                'title': text,
                'url': href,
                # 'desc' is kept for backward compatibility; 'description'
                # is the key documented for parse_index() article dicts.
                'desc': desc,
                'description': desc,
                'date': '',
            })

        # The last section is not followed by another h4 heading, so it has
        # to be stored explicitly or it would be silently dropped.
        if articles:
            feeds.append((feed, articles))
        return feeds