__license__ = 'GPL v3'
__copyright__ = '2010, Hiroshi Miura '
'''
www.toyokeizai.net
'''

from calibre.web.feeds.news import BasicNewsRecipe
import re


class Toyokeizai(BasicNewsRecipe):
    """Calibre recipe that downloads news from member.toyokeizai.net.

    The article list is scraped from the page at ``index``; when a
    username and password are supplied, ``get_browser`` submits them to
    the site's login form before any page is fetched.
    """

    title = u'ToyoKeizai News'
    __author__ = 'Hiroshi Miura'
    oldest_article = 1
    max_articles_per_feed = 50
    description = 'Japanese traditional economy and business magazine, only for advanced subscribers supported'
    publisher = 'Toyokeizai Shinbun Sha'
    category = 'economy, magazine, japan'
    language = 'ja'
    encoding = 'euc-jp'
    # Page scraped by parse_index() to build the article list.
    index = 'http://member.toyokeizai.net/news/'
    remove_javascript = True
    no_stylesheets = True
    masthead_title = u'TOYOKEIZAI'
    needs_subscription = True
    timefmt = '[%y/%m/%d]'
    # Presumably lets the downloader follow "page/N" links of multi-page
    # articles -- confirm against the BasicNewsRecipe documentation.
    recursions = 5
    match_regexps = [r'page/\d+']

    keep_only_tags = [
        dict(name='div', attrs={'class': ['news']}),
        dict(name='div', attrs={'class': ["news_cont"]}),
        dict(name='div', attrs={'class': ["news_con"]}),
        # dict(name='div', attrs={'class':["norightsMessage"]})
    ]
    remove_tags = [
        {'class': "mt35 mgz"},
        {'class': "mt20 newzia"},
        {'class': "mt20 fontS"},
        {'class': "bk_btn_m"},
        dict(id='newzia_connect_member'),
    ]

    def parse_index(self):
        """Build the feed list from the news index page.

        Looks for the ``<ul class="list6">`` element on the index page and
        turns each ``<li>`` that contains a link into an article dict with
        ``title``, ``date``, ``url`` and ``description`` keys.

        Returns:
            A list holding a single ``('news', articles)`` tuple, or an
            empty list when the ``list6`` element is not present.
        """
        feeds = []
        soup = self.index_to_soup(self.index)
        topstories = soup.find('ul', attrs={'class': 'list6'})
        if not topstories:
            return feeds

        # Compiled once; strips the "- " separator from the date text.
        date_prefix = re.compile(r"\- ")
        newsarticles = []
        for item in topstories.findAll('li'):
            link = item.find('a', href=True)
            if link is None:
                # A list item without a link cannot become an article;
                # skip it rather than abort the whole index.
                continue
            date_tag = item.find('span')
            date_text = date_tag.string if date_tag is not None else None
            # The title attribute is optional in the markup.
            description = link.get('title', '')
            newsarticles.append({
                # .string is None when the anchor wraps nested tags.
                'title': link.string or description,
                'date': date_prefix.sub("", date_text) if date_text else '',
                'url': 'http://member.toyokeizai.net' + link['href'],
                'description': description,
            })
        feeds.append(('news', newsarticles))
        return feeds

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        When both ``self.username`` and ``self.password`` are set, the
        login page is opened and its first form is submitted with those
        credentials; otherwise the plain browser is returned.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://member.toyokeizai.net/norights/form/')
            # The login form is the first form on the page.
            br.select_form(nr=0)
            br['kaiin_id'] = self.username
            br['password'] = self.password
            br.submit()
        return br