# calibre/recipes/japan_times.recipe

__license__   = 'GPL v3'
__copyright__ = '2008-2013, Darko Miletic <darko.miletic at gmail.com>'
'''
japantimes.co.jp
'''

from calibre.web.feeds.news import BasicNewsRecipe

class JapanTimes(BasicNewsRecipe):
    """Calibre news recipe for The Japan Times (japantimes.co.jp).

    Declares the section feeds to download, the tag filters applied to each
    article page, and two hooks: one that strips query strings from article
    URLs and one that discards the original ``<head>`` content of each page.
    """

    # --- Publication metadata -------------------------------------------
    title                 = 'The Japan Times'
    __author__            = 'Darko Miletic'
    description           = "Daily news and features on Japan from the most widely read English-language newspaper in Japan. Coverage includes national news, business news, sports news, commentary and features on living in Japan, entertainment, the arts, education and more."
    language              = 'en_JP'
    category              = 'news, politics, japan'
    publisher             = 'The Japan Times'

    # --- Download behaviour ---------------------------------------------
    oldest_article        = 2
    max_articles_per_feed = 150
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf8'
    publication_type      = 'newspaper'
    masthead_url          = 'http://www.japantimes.co.jp/wp-content/themes/jt_theme/library/img/logo-japan-times.png'
    extra_css             = 'body{font-family: Geneva,Arial,Helvetica,sans-serif}'

    # Reuse the metadata declared above so the converted book carries the
    # same description, tags, publisher and language.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Article page clean-up (tag filters consumed by BasicNewsRecipe) --
    remove_tags_after = dict(name='div', attrs={'class': 'entry'})
    keep_only_tags = [dict(name='div', attrs={'class': 'padding_block'})]
    remove_tags = [
        dict(name=['iframe', 'embed', 'object', 'base']),
        dict(attrs={'class': ['meta_extras', 'related_articles']}),
        dict(attrs={'id': 'content_footer_menu'}),
    ]

    # --- Section feeds: (section title, feed URL) -------------------------
    feeds = [
        (u'News', u'http://www.japantimes.co.jp/news/feed/'),
        (u'Opinion', u'http://www.japantimes.co.jp/opinion/feed/'),
        # Previously pointed at the Opinion feed (copy-paste slip), which
        # duplicated Opinion and never fetched Life.
        (u'Life', u'http://www.japantimes.co.jp/life/feed/'),
        (u'Community', u'http://www.japantimes.co.jp/community/feed/'),
        (u'Culture', u'http://www.japantimes.co.jp/culture/feed/'),
        (u'Sports', u'http://www.japantimes.co.jp/sports/feed/'),
    ]

    def get_article_url(self, article):
        """Return the article URL with any query string removed.

        :param article: feed entry, forwarded unchanged to
            ``BasicNewsRecipe.get_article_url``.
        :return: the URL up to (not including) the first ``?``; if the base
            implementation yields no URL (``None`` or empty), that value is
            returned unchanged instead of raising.
        """
        rurl = BasicNewsRecipe.get_article_url(self, article)
        if not rurl:
            # Nothing to strip; let the caller handle the missing URL.
            return rurl
        return rurl.partition('?')[0]

    def preprocess_raw_html(self, raw, url):
        """Replace everything before ``</head>`` with a bare ``<html><head>``.

        The result is ``<html><head></head>...`` followed by the rest of the
        page, i.e. the original head content is dropped.

        :param raw: the downloaded page markup as a string.
        :param url: the page URL (unused here).
        :return: the rewritten markup, or ``raw`` unchanged when it contains
            no ``</head>`` (otherwise ``find`` returning -1 would slice the
            page down to its final character).
        """
        head_end = raw.find('</head>')
        if head_end == -1:
            return raw
        return '<html><head>' + raw[head_end:]