__license__ = 'GPL v3'
__copyright__ = '2011, M. Ching modified from work 2009-2011 Darko Miletic '
'''
staradvertiser.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Starbulletin(BasicNewsRecipe):
    """Recipe for the Honolulu Star-Advertiser (staradvertiser.com).

    Logs in with the configured username/password when both are set, pulls
    articles from the RSS feeds listed in ``feeds``, and strips each page
    down to the title, date stamp, credit line, deck and story text.
    """

    title = 'Honolulu Star-Advertiser'
    __author__ = 'Darko Miletic'
    description = 'Latest national and local Hawaii sports news'
    publisher = 'Honolulu Star-Advertiser'
    category = 'news, Honolulu, Hawaii'
    oldest_article = 2
    needs_subscription = True
    max_articles_per_feed = 100
    language = 'en'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf8'
    publication_type = 'newspaper'
    masthead_url = 'http://media.staradvertiser.com/designimages/star-advertiser-logo-small.gif'
    # extra_css = """
    #               body{font-family: Verdana,Arial,Helvetica,sans-serif}
    #               h1,.brown,.hsa_postCredit{color: #663300}
    #               .storyDeck{font-size: 1.2em; font-weight: bold}
    #               img{display: block}
    #             """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    # Elements kept from each article page; the "article-important" variants
    # are listed separately because they are matched as whole attribute values.
    keep_only_tags = [
        dict(attrs={'id': 'hsa_storyTitle'}),
        dict(attrs={'id': 'hsa_storyTitle article-important'}),
        dict(attrs={'class': ['hsa_dateStamp', 'hsa_postCredit', 'storyDeck']}),
        dict(name='span', attrs={'class': ['hsa_dateStamp', 'hsa_postCredit']}),
        dict(name='span', attrs={'class': ['hsa_dateStamp article-important',
                                           'hsa_postCredit article-important']}),
        dict(name='div', attrs={'class': 'storytext article-important'}),
        dict(name='div', attrs={'class': 'storytext'}),
    ]

    remove_tags = [
        # removed 'span' from this list to permit keeping of author and timestamp
        dict(name=['object', 'link', 'script', 'meta', 'base', 'iframe']),
        dict(attrs={'class': ['insideStoryImage', 'insideStoryAd']}),
        dict(attrs={'name': 'fb_share'}),
    ]

    def get_browser(self):
        """Return the browser used for fetching, logged in when possible.

        The login form is submitted only when both ``self.username`` and
        ``self.password`` are set; otherwise the plain browser is returned.
        """
        # The base implementation must be called with ``self`` explicitly;
        # the unbound call without it fails when get_browser is an
        # instance method.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.staradvertiser.com/manage/Login/')
            br.select_form(name='loginForm')
            br['email'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    feeds = [
        (u'Breaking News', u'http://www.staradvertiser.com/news/breaking/index.rss'),
        (u'News', u'http://www.staradvertiser.com/newspremium/index.rss'),
        (u'Business', u'http://www.staradvertiser.com/businesspremium/index.rss'),
        (u'Sports', u'http://www.staradvertiser.com/sportspremium/index.rss'),
        (u'Features', u'http://www.staradvertiser.com/featurespremium/index.rss'),
    ]

    def preprocess_html(self, soup):
        """Clean up a parsed article page and return the same soup object.

        * Inline ``style`` attributes are removed from every element.
        * Links are flattened: a link with a single string child is replaced
          by that string, a link wrapping an image becomes an attribute-less
          ``div``, and any other link is replaced by its text content.
        * Images without an ``alt`` attribute get ``alt="image"``.
        """
        for item in soup.findAll(style=True):
            del item['style']

        for item in soup.findAll('a'):
            if item.string is not None:
                item.replaceWith(item.string)
            elif item.find('img'):
                # Keep the image but drop the hyperlink around it.
                item.name = 'div'
                # BeautifulSoup 3 stores attrs as a list and BeautifulSoup 4
                # as a dict; reset to an empty container of the same type so
                # the tag stays valid with either parser version.
                item.attrs = type(item.attrs)()
            else:
                item.replaceWith(self.tag_to_string(item))

        for item in soup.findAll('img'):
            # Tag.get() exists in both BeautifulSoup 3 and 4, unlike has_key().
            if item.get('alt') is None:
                item['alt'] = 'image'
        return soup