#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1'
__date__ = '16, October 2010'
__docformat__ = 'English'

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 interpreter
    from urlparse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class MalaysianMirror(BasicNewsRecipe):
    """Recipe that scrapes article links from MalaysianMirror section pages.

    The feeds are not read from RSS: ``parse_index`` fetches a fixed list of
    section front pages and ``make_links`` extracts the article links found
    in their ``div.contentheading`` elements.
    """

    title = 'MalaysianMirror'
    __author__ = 'Tonythebookworm'
    description = 'The Pulse of the Nation'
    language = 'en'
    no_stylesheets = True
    publisher = 'Tonythebookworm'
    category = 'news'
    use_embedded_content = False
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True
    conversion_options = {'linearize_tables': True}
    extra_css = '''
    #content_heading{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    td{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
    #content_body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    keep_only_tags = [
        dict(name='table', attrs={'class': ['contentpaneopen']}),
    ]
    remove_tags = [
        dict(name='table', attrs={'class': ['buttonheading']}),
    ]

    max_articles_per_feed = 10

    # Base URL of the site. Article links on the section pages do not include
    # the site root, so they are resolved against this value.
    INDEX = 'http://www.malaysianmirror.com'

    def parse_index(self):
        """Build the feed list from the hard-coded section front pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is the list produced by ``make_links``. Sections for which no
            article was found are left out.
        """
        feeds = []
        for title, url in [
            (u"Media Buzz", u"http://www.malaysianmirror.com/media-buzz-front"),
            (u"Life Style", u"http://www.malaysianmirror.com/lifestylefront"),
            (u"Features", u"http://www.malaysianmirror.com/featurefront"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the articles linked from one section front page.

        Args:
            url: Address of the section page to fetch and scan.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``description``
            and ``date`` (the last two are always empty strings), one per
            ``div.contentheading`` element that contains a link.
        """
        current_articles = []
        soup = self.index_to_soup(url)
        for item in soup.findAll('div', attrs={'class': 'contentheading'}):
            # Only anchors that actually carry an href are usable; headings
            # without one are skipped instead of raising KeyError.
            link = item.find('a', href=True)
            if not link:
                continue
            current_articles.append({
                'title': self.tag_to_string(link),
                # urljoin yields the same result as INDEX + href for
                # root-relative links, and also copes with absolute hrefs
                # and hrefs that lack a leading slash.
                'url': urljoin(self.INDEX, link['href']),
                'description': '',
                'date': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute from the article markup.

        Args:
            soup: Parsed document of the downloaded article.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup