diff --git a/recipes/jakarta_post.recipe b/recipes/jakarta_post.recipe
index 7a9c810d39..a1460bcf14 100644
--- a/recipes/jakarta_post.recipe
+++ b/recipes/jakarta_post.recipe
@@ -13,10 +13,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class JakartaPost(BasicNewsRecipe):
     title = u'Jakarta Post'
-    masthead_url = 'http://www.thejakartapost.com/images/jakartapost_logo.jpg'
-    cover_url = 'http://www.thejakartapost.com/images/jakartapost_logo.jpg'
 
-    __author__ = u'Adrian Gunawan'
+    __author__ = u'Kovid Goyal'
     description = u'Indonesian Newspaper in English from Jakarta Post Online Edition'
     category = 'breaking news, national, business, international, Indonesia'
     language = 'en_ID'
@@ -24,39 +22,58 @@ class JakartaPost(BasicNewsRecipe):
     max_articles_per_feed = 100
 
     no_stylesheets = True
-    use_embedded_content = False
     no_javascript = True
     remove_empty_feeds = True
-    auto_cleanup = True
 
     timefmt = ' [%A, %d %B, %Y]'
     encoding = 'utf-8'
+    ignore_duplicate_articles = {'title', 'url'}
 
-    extra_css = '''
-    h1{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large;}
-    .cT-storyDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
-    .articleBody{font-family:Arial,Helvetica,sans-serif; color:black;font-size:small;}
-    .cT-imageLandscape{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:x-small;}
-    .source{font-family:Arial,Helvetica,sans-serif; color:#333333 ;font-size:xx-small;}
-    #content{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-    .pageprint{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-    #bylineDetails{font-family:Arial,Helvetica,sans-serif; color:#666666;font-size:x-small;}
-    .featurePic-wide{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-    #idfeaturepic{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
-    h3{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
-    h2{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
-    h4{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
-    h5{font-family:Georgia,"Times New Roman",Times,serif; font-size:small;}
-    body{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
-    '''
+    # auto_cleanup is gone, so the article content is selected by hand: keep the
+    # h1 heading and any element whose class list contains 'byline' or 'span-13'
+    keep_only_tags = [
+        dict(name='h1'),
+        dict(attrs={'class':lambda x: x and 'byline' in x.split()}),
+        dict(attrs={'class':lambda x: x and 'span-13' in x.split()}),
+    ]
+    # Discard everything after the element whose class list contains 'span-12'
+    remove_tags_after = dict(attrs={'class':lambda x: x and 'span-12' in x.split()})
 
-    feeds = [
+    def parse_jp_section(self, url):
+        '''
+        Yield an article dict (title, url, description) for every story teaser
+        found on the section page at ``url``.
+        '''
+        soup = self.index_to_soup(url)
+        for div in soup.findAll('div', attrs={'class':'story'}):
+            a = div.find('a', href=True)
+            if a is None:
+                # A teaser without a link cannot be downloaded, skip it instead
+                # of failing the whole section with an AttributeError
+                continue
+            # Separate name so the section URL argument is not overwritten
+            article_url = a.get('href')
+            title = self.tag_to_string(a)
+            desc = self.tag_to_string(div.find('p'))
+            self.log('\t', title, ' at ', article_url)
+            yield {'title':title, 'url':article_url, 'description':desc}
 
-        (u'Breaking News', u'http://www.thejakartapost.com/breaking/feed'),
-        (u'National', u'http://www.thejakartapost.com/channel/national/feed'),
-        (u'Archipelago', u'http://www.thejakartapost.com/channel/archipelago/feed'),
-        (u'Business', u'http://www.thejakartapost.com/channel/business/feed'),
-        (u'Jakarta', u'http://www.thejakartapost.com/channel/jakarta/feed'),
-        (u'World', u'http://www.thejakartapost.com/channel/world/feed'),
-        (u'Sports', u'http://www.thejakartapost.com/channel/sports/feed'),
-    ]
+    def parse_index(self):
+        '''
+        Build the index as a list of (section title, article list) tuples,
+        one per section page that yielded at least one article.
+        '''
+        ans = []
+        for sec in ('editors_choice channel/headlines channel/business channel/national channel/archipelago'
+                    ' channel/jakarta channel/world channel/sports').split():
+            # 'channel/business' -> 'Business', 'editors_choice' -> 'Editors choice'
+            title = (sec.partition('/')[2] or sec).replace('_', ' ').capitalize()
+            # Announce the section first so its article lines are logged beneath it
+            self.log('Found section:', title)
+            articles = list(self.parse_jp_section('http://www.thejakartapost.com/' + sec))
+            if articles:
+                ans.append((title, articles))
+            # When self.test is set, its first item caps the number of sections collected
+            if self.test and len(ans) >= self.test[0]:
+                break
+        return ans