calibre/recipes/malaysian_mirror.recipe

#!/usr/bin/env python
# Module-level metadata describing this recipe: license, authorship,
# copyright notice, recipe version and the date it was written.
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1'
__date__ = '16, October 2010'
# NOTE(review): __docformat__ conventionally names a docstring markup format
# (e.g. 'restructuredtext'); 'English' is kept unchanged here -- confirm
# whether anything reads this value before altering it.
__docformat__ = 'English'


from calibre.web.feeds.news import BasicNewsRecipe


class MalaysianMirror(BasicNewsRecipe):
    """Recipe that collects articles from sections of malaysianmirror.com.

    The section front pages are scraped directly in :meth:`parse_index`:
    each page is fetched, the links found inside its
    ``<div class="contentheading">`` elements become the article list for
    that section, and sections that yield no links are left out.

    For every downloaded article only the ``table.contentpaneopen`` element
    is kept, ``table.buttonheading`` elements are dropped, and inline
    ``style`` attributes are stripped in :meth:`preprocess_html`.
    """

    title = 'MalaysianMirror'
    __author__ = 'Tonythebookworm'
    description = 'The Pulse of the Nation'
    language = 'en'
    publisher = 'Tonythebookworm'
    category = 'news'

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    oldest_article = 24
    max_articles_per_feed = 10

    conversion_options = {'linearize_tables': True}
    extra_css = '''
                    #content_heading{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}

                    td{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}

                    #content_body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                '''

    # Article pages: keep only the content table, drop the button table.
    keep_only_tags = [dict(name='table', attrs={'class': ['contentpaneopen']})]
    remove_tags = [dict(name='table', attrs={'class': ['buttonheading']})]

    # Base URL of the site. The links scraped from the section pages do not
    # include it, so it is used to turn them into full article URLs.
    INDEX = 'http://www.malaysianmirror.com'

    # (section title, path of the section front page relative to INDEX)
    SECTIONS = (
        (u"Media Buzz", u"/media-buzz-front"),
        (u"Life Style", u"/lifestylefront"),
        (u"Features", u"/featurefront"),
    )

    def parse_index(self):
        """Build the list of sections and their articles.

        :return: list of ``(section title, list of article dicts)`` tuples,
                 one per section in ``SECTIONS`` that yielded at least one
                 article link.
        """
        feeds = []
        for title, path in self.SECTIONS:
            articles = self.make_links(self.INDEX + path)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the articles linked from one section front page.

        :param url: URL of the section page to fetch and scan.
        :return: list of dicts with the keys ``title``, ``url``,
                 ``description`` and ``date``; the last two are always
                 empty strings.
        """
        # Imported locally so the recipe stays self-contained; the fallback
        # covers interpreters that only provide the Python 2 module name.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        soup = self.index_to_soup(url)
        current_articles = []
        for heading in soup.findAll('div', attrs={'class': 'contentheading'}):
            link = heading.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Heading without a usable link: nothing to download.
                continue
            current_articles.append({
                'title': self.tag_to_string(link),
                # urljoin yields the same result as INDEX + href for the
                # usual root-relative links, and additionally copes with
                # absolute links and links lacking a leading slash.
                'url': urljoin(self.INDEX, href),
                'description': '',
                'date': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Remove every inline ``style`` attribute from the parsed page.

        :param soup: parsed document of a downloaded page.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup