calibre/recipes/the_manila_bulletin.recipe

import time
from calibre.web.feeds.recipes import BasicNewsRecipe


class TheManilaBulletin(BasicNewsRecipe):
    """Recipe that assembles The Manila Bulletin from its RSS feeds.

    The class is purely declarative: it only sets configuration attributes
    consumed by the ``BasicNewsRecipe`` base class and overrides no methods.
    """

    # ---- Identity and metadata -------------------------------------------
    title = u'The Manila Bulletin'
    # Title with a timestamp appended; computed once, when the class body
    # is executed.
    custom_title = (
        "The Manila Bulletin - " + time.strftime('%d %b %Y %I:%M %p')
    )
    __author__ = 'jde'
    __date__ = '06 June 2012'
    __version__ = '1.0'
    description = (
        "The Manila Bulletin, (also known as the Bulletin and previously "
        "known as the Manila Daily Bulletin and the Bulletin Today) is the "
        "Philippines' largest broadsheet newspaper by circulation."
    )
    language = 'en_PH'
    publisher = 'The Manila Bulletin'
    category = 'news, Philippines'
    tags = 'news, Philippines'
    publication_type = 'newspaper'
    timefmt = ' [%a, %d %b %Y %I:%M %p]'

    # ---- Artwork ---------------------------------------------------------
    # The same logo image serves as both cover and masthead.
    cover_url = 'http://www.mb.com.ph/sites/default/files/mb_logo.jpg'
    masthead_url = cover_url

    # ---- Fetch limits and download settings ------------------------------
    oldest_article = 1.5  # days
    max_articles_per_feed = 25
    simultaneous_downloads = 20
    recursions = 0
    encoding = None
    needs_subscription = False
    use_embedded_content = False
    remove_empty_feeds = True

    # ---- Page clean-up ---------------------------------------------------
    auto_cleanup = False
    no_stylesheets = True
    remove_javascript = True

    # <div> containers to retain, matched by their class attribute.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'article node'}},
        {'name': 'div', 'attrs': {'class': 'label'}},
        {'name': 'div', 'attrs': {'class': 'content clear-block'}},
    ]

    # Elements to strip: print/mail list items, the article sidebar and
    # the attachments table.
    remove_tags = [
        {'name': 'li', 'attrs': {'class': 'print_html'}},
        {'name': 'li', 'attrs': {'class': 'print_html first'}},
        {'name': 'li', 'attrs': {'class': 'print_mail'}},
        {'name': 'li', 'attrs': {'class': 'print_mail last'}},
        {'name': 'div', 'attrs': {'class': 'article-sidebar'}},
        {'name': 'table', 'attrs': {'id': 'attachments'}},
    ]

    # ---- Conversion settings, built from the metadata above --------------
    conversion_options = dict(
        title=custom_title,
        comments=description,
        tags=tags,
        language=language,
        publisher=publisher,
        authors=publisher,
        smarten_punctuation=True,
    )

    # ---- Feeds as (section title, feed URL) pairs, in listed order -------
    feeds = [
        (u'Regional', u'http://www.mb.com.ph/feed/news/regional'),
        (u'Main News', u'http://www.mb.com.ph/feed/news/main'),
        (u'Business', u'http://www.mb.com.ph/feed/business'),
        (u'Sports', u'http://www.mb.com.ph/feed/sports'),
        (u'Entertainment', u'http://www.mb.com.ph/feed/entertainment'),
        (u'Opinion', u'http://www.mb.com.ph/feed/news/opinion'),
        (u'Agriculture', u'http://www.mb.com.ph/feed/news/agriculture'),
        (u'Technology', u'http://www.mb.com.ph/feed/lifestyle/technology'),
        (u'Lifestyle', u'http://www.mb.com.ph/feed/lifestyle'),
        (u'Drive', u'http://www.mb.com.ph/feed/lifestyle/drive'),
    ]


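#  To use the print version, convert the article URL to its print URL, e.g.
#  http://www.mb.com.ph/articles/361252/higher-power-rate-looms
#  becomes http://www.mb.com.ph/print/361252
#
#  NOTE: the disabled draft below is unverified. With the sample URL above,
#  url.split('/') puts the article id in segments[4] (segments[5] is the
#  slug), and '/'.join() applied to a single string inserts '/' between
#  its characters.
#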
#    def print_version(self,url):
#        segments = url.split('/')
#        printURL = '/'.join(segments[0:3]) + '/print/' + '/'.join(segments[5])
#        return printURL