#!/usr/bin/env python
"""
www.mainichi.jp/english
"""

from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class MainichiEnglishNews(BasicNewsRecipe):
    """Recipe that collects article links from The Mainichi's English site."""

    title = u"The Mainichi"
    __author__ = 'unkn0wn'
    description = "Japanese traditional newspaper Mainichi news in English"
    publisher = "Mainichi News"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    # Base URL used only to build the masthead image location below;
    # parse_index() fetches the front page over https on its own.
    index = "http://mainichi.jp/english/"
    masthead_url = index + "images/themainichi.png"

    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = True

    ignore_duplicate_articles = {'title', 'url'}

    def parse_index(self):
        """Build the feed list from the links found on the front page.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with ``'title'`` and ``'url'`` keys.
        A section that yields no articles is left out of the result.
        """
        index = 'https://mainichi.jp/english/'
        sections = [
            'articles',
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            articles = []
            # Keep only anchors whose href mentions the section keyword.
            anchors = soup.findAll(
                'a', attrs={'href': lambda x: x and sec in x}
            )
            for a in anchors:
                # Anchors wrapping an image are thumbnail links; skip them.
                if a.find('img'):
                    continue
                # urljoin resolves protocol-relative ("//host/path") and
                # root-relative ("/path") hrefs against the index page and
                # leaves absolute URLs unchanged.
                url = urljoin(index, a['href'])
                title = self.tag_to_string(a).strip()
                if not title:
                    # A link without visible text would show up as a
                    # blank entry in the table of contents.
                    continue
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds