Improved recipe for the Sydney Morning Herald

This commit is contained in:
Kovid Goyal 2009-11-20 21:53:55 -07:00
parent 7f4dcb8827
commit 79f223845d

View File

@@ -6,51 +6,86 @@ __docformat__ = 'restructuredtext en'
'''
smh.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class SMH(BasicNewsRecipe):
    """calibre news recipe for the Sydney Morning Herald (smh.com.au).

    The class declares the recipe metadata, the HTML clean-up selectors, the
    extra stylesheet and the RSS feed list, and overrides a few
    ``BasicNewsRecipe`` hooks.

    NOTE(review): both ``feeds`` and ``parse_index`` are defined here.
    Presumably ``BasicNewsRecipe`` prefers an overridden ``parse_index`` over
    ``feeds``, which would leave ``feeds`` and ``get_article_url`` unused --
    confirm which index source is intended and drop the other one.
    """

    title = 'Sydney Morning Herald'
    description = 'Business News, World News and Breaking News in Australia'
    # The source assigned __author__ twice; only the later value took effect.
    __author__ = 'Kovid Goyal and Sujata Raman'
    language = 'en_AU'

    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    no_javascript = True
    timefmt = ' [%A, %d %B, %Y]'
    encoding = 'utf-8'

    # Tag selectors, expressed as keyword dicts (name + attrs).
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

    remove_tags = [
        dict(name='div', attrs={'align': 'right'}),
        dict(name='p', attrs={'class': 'comments'}),
        dict(name='a', attrs={'class': ['more-photos', 'performerpromo']}),
        dict(name='img', attrs={'alt': 'aap'}),
        dict(name='div', attrs={'id': ['googleAds', 'moreGoogleAds', 'comments', 'footer', 'sidebar', 'austereopuff', 'adSpotIsland']}),
        dict(name='div', attrs={'class': ['article-links', 'wof', 'articleTools top', 'cN-multimediaGroup cfix', 'articleTools bottom']}),
        dict(name='div', attrs={'class': ['clear', 'adSpot-textboxgr1', 'adSpot-textBox', 'articleTools-c3 cfix', 'articleExtras-bottom', 'span-16 last']}),
        dict(name='div', attrs={'class': ['sidebar span-5', 'cT-socialCommenting', 'cN-linkList', 'cN-topicSelector', 'cT-storyTools cfix', 'cT-imageMultimedia']}),
        dict(name='iframe'),
    ]

    extra_css = '''
        h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
        .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
        .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
        .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
        .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
        #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
        .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
        h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
        h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
        h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
        h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
        body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
    '''

    # (section title, RSS URL) pairs.
    feeds = [
        ('Top Stories', 'http://feeds.smh.com.au/rssheadlines/top.xml'),
        ('National', 'http://feeds.smh.com.au/rssheadlines/national.xml'),
        ('World', 'http://feeds.smh.com.au/rssheadlines/world.xml'),
        ('Business', 'http://www.smh.com.au/rssheadlines/business.xml'),
        ('National Times', 'http://www.smh.com.au/rssheadlines/opinion/article/rss.xml'),
        ('Entertainment', 'http://feeds.smh.com.au/rssheadlines/entertainment.xml'),
        ('Technology', 'http://feeds.smh.com.au/rssheadlines/technology.xml'),
        ('Sport', 'http://feeds.smh.com.au/rssheadlines/sport.xml'),
    ]

    def preprocess_html(self, soup):
        """Turn a non-standard ``<bod>`` element into a ``<div>`` inside ``#content``.

        If the page contains a ``<bod>`` tag it is renamed to ``div`` and,
        when an element with id ``content`` exists, moved to the end of that
        element. The (possibly modified) soup is returned in every case.
        """
        bod = soup.find('bod')
        if bod is None:
            return soup
        # Beautiful Soup renames an element through ``.name``; the previous
        # ``bod.tag = 'div'`` only set an unrelated attribute.
        bod.name = 'div'
        container = soup.find(id='content')
        if container is not None:
            # Detach first so the element is not present twice in the tree.
            bod.extract()
            container.insert(len(container), bod)
        return soup

    def get_browser(self):
        """Return the base recipe browser with refresh handling disabled."""
        # NOTE(review): the base method is called without ``self``; that only
        # works if BasicNewsRecipe.get_browser is a class/static method in the
        # calibre version in use -- confirm.
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

    def get_article_url(self, article):
        """Return ``article.link``, or ``''`` when the link contains 'media'."""
        url = article.link
        if 'media' in url:
            # Presumably multimedia pages that should be skipped -- an empty
            # URL is returned for them.
            return ''
        return url

    def parse_index(self):
        """Build the index from the text-only front page.

        Every ``<h3>`` starts a new section; every following ``<a>`` with a
        non-empty ``href`` becomes an article of that section. Links that
        appear before the first ``<h3>`` are ignored.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``date``,
        ``description`` and ``content``.
        """
        raw = self.browser.open('http://www.smh.com.au/text/').read()
        soup = BeautifulSoup(raw)
        feeds, articles = [], []
        feed = None
        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                # A new heading closes the section collected so far.
                if articles:
                    feeds.append((feed, articles))
                    articles = []
                feed = self.tag_to_string(tag)
                continue
            if feed is None:
                continue
            url = (tag.get('href') or '').strip()
            if not url:
                continue
            if url.startswith('/'):
                # Site-relative link: make it absolute.
                url = 'http://www.smh.com.au' + url
            articles.append({
                'title': self.tag_to_string(tag),
                'url': url,
                'date': strftime('%a, %d %b'),
                'description': '',
                'content': '',
            })
        # Flush the last section; previously its articles were dropped because
        # sections were only stored when the *next* heading was seen.
        if articles:
            feeds.append((feed, articles))
        return feeds