#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Adrian Gunawan '
__author__ = 'Adrian Gunawan'
__version__ = 'v1.0'
__date__ = '02 February 2011'

'''
http://www.thejakartapost.com/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class JakartaPost(BasicNewsRecipe):
    title = u'Jakarta Post'
    __author__ = u'Kovid Goyal'
    description = u'Indonesian Newspaper in English from Jakarta Post Online Edition'
    category = 'breaking news, national, business, international, Indonesia'
    language = 'en_ID'

    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    timefmt = ' [%A, %d %B, %Y]'
    encoding = 'utf-8'
    ignore_duplicate_articles = {'title', 'url'}

    # Keep only the headline, the byline and the main article column.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': lambda x: x and 'byline' in x.split()}),
        dict(attrs={'class': lambda x: x and 'span-13' in x.split()}),
    ]
    # Everything after the main content column is discarded.
    remove_tags_after = dict(
        attrs={'class': lambda x: x and 'span-12' in x.split()})

    def parse_jp_section(self, url):
        # Scrape one section landing page and yield an article dict
        # (title, url, description) for every story block found.
        soup = self.index_to_soup(url)
        for div in soup.findAll('div', attrs={'class': 'story'}):
            a = div.find('a', href=True)
            url = a.get('href')
            title = self.tag_to_string(a)
            p = div.find('p')
            desc = self.tag_to_string(p)
            self.log('\t', title, ' at ', url)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_index(self):
        # Build the feed list by walking each known section of the site.
        ans = []
        for sec in ('editors_choice channel/headlines channel/business channel/national channel/archipelago'
                    ' channel/jakarta channel/world channel/sports').split():
            title = (sec.partition('/')[2] or sec).replace('_', ' ').capitalize()
            articles = list(self.parse_jp_section(
                'http://www.thejakartapost.com/' + sec))
            self.log('Found section:', title)
            if articles:
                ans.append((title, articles))
            # In test builds, stop once enough sections have been collected.
            if self.test and len(ans) >= self.test[0]:
                break
        return ans