Add recipe for the Sydney Morning Herald

This commit is contained in:
Kovid Goyal 2008-05-16 07:58:45 -07:00
parent dc214c984a
commit b3052188d2
3 changed files with 56 additions and 2 deletions

View File

@ -7,7 +7,7 @@ Builtin recipes.
# Identifiers of the builtin recipes (presumably each one matches a recipe
# module name -- confirm against the loader).
# NOTE(review): this listing comes from a diff hunk, so the old
# "'wired', 'globe_and_mail'," line and its replacement (which appends 'smh')
# both appear below.
recipes = [
'newsweek', 'atlantic', 'economist', 'portfolio',
'nytimes', 'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
'wired', 'globe_and_mail',
'wired', 'globe_and_mail', 'smh',
]
import re, imp, inspect, time

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
smh.com.au
'''
import time
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class SMH(BasicNewsRecipe):
    '''
    Recipe for the Sydney Morning Herald, built from the text index page at
    http://www.smh.com.au/text/.
    '''

    title = 'Sydney Morning Herald'
    description = 'Business News, World News and Breaking News in Australia'
    __author__ = 'Kovid Goyal'

    def get_browser(self):
        '''
        Return the base recipe's browser with refresh handling switched off.
        '''
        # NOTE(review): called without ``self``; presumably the base class
        # exposes get_browser in a form that allows this -- confirm.
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Build the list of feeds from the text index page.

        Every ``<h3>`` starts a new section and every following ``<a>`` with
        a non-empty ``href`` becomes an article of that section. Links that
        appear before the first ``<h3>`` are ignored, and sections without
        any article are left out.

        :return: A list of ``(section title, articles)`` tuples, where
                 ``articles`` is a list of dictionaries with the keys
                 ``title``, ``url``, ``date``, ``description`` and
                 ``content``.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.smh.com.au/text/').read())

        feeds, articles = [], []
        feed = None
        # The same date string is used for every article, so compute it once.
        date = time.strftime('%a, %d %b')

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                # A new section begins: store the one collected so far.
                if articles:
                    feeds.append((feed, articles))
                    articles = []
                feed = self.tag_to_string(tag)
            elif feed is not None and tag.has_key('href') and tag['href'].strip():
                url = tag['href'].strip()
                if url.startswith('/'):
                    # Site-relative link: make it absolute.
                    url = 'http://www.smh.com.au' + url
                articles.append({
                    'title': self.tag_to_string(tag),
                    'url': url,
                    'date': date,
                    'description': '',
                    'content': '',
                })

        # The last section is not followed by another <h3>, so it has to be
        # stored here; otherwise its articles would be silently dropped.
        if articles:
            feeds.append((feed, articles))

        return feeds

View File

@ -140,7 +140,7 @@ class RecursiveFetcher(object, LoggingInterface):
except urllib2.URLError, err:
if hasattr(err, 'code') and responses.has_key(err.code):
raise FetchError, responses[err.code]
if err.reason[0] == 104: # Connection reset by peer
if getattr(err, 'reason', [0])[0] == 104: # Connection reset by peer
self.log_debug('Connection reset by peer retrying in 1 second.')
time.sleep(1)
f = self.browser.open(url)