#!/usr/bin/env python
'''
www.mainichi.jp/english
'''

from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class MainichiEnglishNews(BasicNewsRecipe):
    '''Recipe that builds an issue from links found on The Mainichi's
    English-language front page.
    '''

    title = u'The Mainichi'
    __author__ = 'unkn0wn'
    description = 'Japanese traditional newspaper Mainichi news in English'
    publisher = 'Mainichi News'
    publication_type = 'newspaper'
    category = 'news, japan'
    language = 'en_JP'
    # Single source of truth for the front-page URL: it is the page scraped
    # by parse_index(), the base for resolving relative hrefs, and the
    # prefix of the masthead image URL.
    index = 'https://mainichi.jp/english/'
    masthead_url = index + 'images/themainichi.png'
    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}

    def parse_index(self):
        '''Collect article links from the front page, grouped by section.

        For each entry in ``sections`` every ``<a>`` whose ``href`` contains
        that entry is considered. Anchors that wrap an image or that have no
        visible text are skipped.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
            Sections that yield no articles are omitted.
        '''
        sections = [
            'articles',
        ]
        feeds = []
        soup = self.index_to_soup(self.index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            articles = []
            # ``sec`` is bound as a default argument so the filter always
            # tests against this iteration's section fragment.
            links = soup.findAll(
                'a', attrs={'href': lambda x, sec=sec: x and sec in x}
            )
            for a in links:
                if a.find('img'):
                    continue
                title = self.tag_to_string(a).strip()
                if not title:
                    # A link without text would become an untitled article.
                    continue
                # urljoin leaves absolute URLs untouched and correctly
                # resolves both protocol-relative ('//host/path') and
                # site-relative ('/path') hrefs against the index page.
                url = urljoin(self.index, a['href'])
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds