#!/usr/bin/env python
'''
www.mainichi.jp
'''

from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe


class MainichiDailyNews(BasicNewsRecipe):
    '''Calibre recipe that builds its feed from links on the mainichi.jp front page.'''

    # Recipe metadata.
    title = u'\u6bce\u65e5\u65b0\u805e'
    __author__ = 'unkn0wn'
    description = 'Japanese traditional newspaper Mainichi Daily News'
    publisher = 'Mainichi News'
    publication_type = 'newspaper'
    category = 'news, japan'
    language = 'ja'

    # Download/clean-up switches understood by BasicNewsRecipe.
    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = True

    # Treat entries sharing a title or a URL as duplicates.
    ignore_duplicate_articles = {'title', 'url'}

    def parse_index(self):
        '''
        Collect article links from the front page, grouped by section.

        Returns a list of ``(section title, articles)`` tuples, where
        ``articles`` is a list of dicts with ``title`` and ``url`` keys.
        Sections for which no link is found are left out of the result.
        '''
        index = 'https://mainichi.jp'
        # Each entry is the href fragment that identifies a section's links;
        # its capitalized form is used as the section title.
        sections = [
            'articles'
        ]
        feeds = []
        soup = self.index_to_soup(index)
        for sec in sections:
            section = sec.capitalize()
            self.log(section)
            articles = []
            # Match anchors whose href contains this section's fragment
            # (anchors without an href are rejected by the `x and` guard).
            links = soup.findAll(
                'a', attrs={'href': lambda x: x and sec in x}
            )
            for a in links:
                # Skip anchors that wrap an image.
                # NOTE(review): presumably thumbnail links that duplicate a
                # text link to the same story -- confirm against the site.
                if a.find('img'):
                    continue
                title = self.tag_to_string(a).strip()
                if not title:
                    # An anchor without text gives no usable headline.
                    continue
                # Resolve protocol-relative ('//host/path'), root-relative
                # ('/path') and absolute hrefs uniformly. Plain prefixing
                # with 'https:' would turn '/path' into 'https:/path'.
                url = urljoin(index, a['href'])
                self.log('\t', title, '\n\t\t', url)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds