#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1'
__date__ = '16, October 2010'
__docformat__ = 'English'

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 interpreter
    from urlparse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class MalaysianMirror(BasicNewsRecipe):
    """Calibre news recipe that scrapes section pages of malaysianmirror.com.

    The feeds are not read from RSS: ``parse_index`` walks a fixed list of
    section front pages and ``make_links`` pulls the article links out of
    each one.
    """

    title = 'MalaysianMirror'
    __author__ = 'Tonythebookworm'
    description = 'The Pulse of the Nation'
    language = 'en'
    no_stylesheets = True
    publisher = 'Tonythebookworm'
    category = 'news'
    use_embedded_content = False
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True
    conversion_options = {'linearize_tables': True}

    extra_css = '''
    #content_heading{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    td{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
    #content_body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    # Only the article table is kept; the table holding the button row is
    # removed from it.
    keep_only_tags = [dict(name='table', attrs={'class': ['contentpaneopen']})]
    remove_tags = [dict(name='table', attrs={'class': ['buttonheading']})]

    max_articles_per_feed = 10

    # Base address of the site. The links found on the section pages do not
    # include it, so it is used as the base when resolving them.
    INDEX = 'http://www.malaysianmirror.com'

    def parse_index(self):
        """Build the feed list from the hard-coded section pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list produced by ``make_links``. Sections
            for which no article was found are left out.
        """
        feeds = []
        for title, url in [
            (u"Media Buzz", u"http://www.malaysianmirror.com/media-buzz-front"),
            (u"Life Style", u"http://www.malaysianmirror.com/lifestylefront"),
            (u"Features", u"http://www.malaysianmirror.com/featurefront"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the article links listed on one section page.

        Args:
            url: Address of the section page to scan.

        Returns:
            A list of dicts with the keys ``'title'``, ``'url'``,
            ``'description'`` and ``'date'`` (the last two always empty),
            one for every ``<div class="contentheading">`` whose first
            anchor carries an ``href``.
        """
        current_articles = []
        soup = self.index_to_soup(url)
        for item in soup.findAll('div', attrs={'class': 'contentheading'}):
            link = item.find('a')
            if link is None:
                continue
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be downloaded; skip it
                # instead of failing the whole section with a KeyError.
                continue
            current_articles.append({
                'title': self.tag_to_string(link),
                # urljoin leaves absolute hrefs untouched and inserts the
                # missing slash for relative ones, unlike plain string
                # concatenation with INDEX.
                'url': urljoin(self.INDEX, href),
                'description': '',
                'date': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Remove every inline ``style`` attribute from the parsed page.

        Args:
            soup: Parsed document of a downloaded article.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup