#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2022, Albert Aparicio Isarn '

'''
https://www.asahi.com/ajw/
'''

from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class AsahiShimbunEnglishNews(BasicNewsRecipe):
    """Calibre recipe for the English edition of The Asahi Shimbun.

    Builds the feed list by scraping the section index pages of
    https://www.asahi.com/ajw/ rather than RSS.
    """

    title = 'The Asahi Shimbun'
    __author__ = 'Albert Aparicio Isarn'
    description = (
        'The Asahi Shimbun is widely regarded for its journalism as the most respected daily newspaper in Japan.'
        ' The English version offers selected articles from the vernacular Asahi Shimbun, as well as extensive'
        ' coverage of cool Japan, focusing on manga, travel and other timely news.'
    )
    publisher = 'The Asahi Shimbun Company'
    publication_type = 'newspaper'
    category = 'news, japan'
    language = 'en_JP'

    index = 'https://www.asahi.com'
    masthead_url = 'https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png'

    oldest_article = 3
    max_articles_per_feed = 40

    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {'id': 'MainInner'}
    remove_tags_after = {'class': 'ArticleText'}
    remove_tags = [{'name': 'div', 'class': 'SnsUtilityArea'}]

    def _parse_dated_items(self, container):
        """Extract article dicts from every ``<li>`` under *container*.

        Each item is expected to hold a ``<p class="title">``, a
        ``<p class="date">`` whose first text node is a date such as
        ``'July 4, 2022'``, and an ``<a href>`` relative to the site root.
        Returns a list of calibre article dicts (empty description).
        """
        articles = []
        for item in container.findAll('li'):
            title = item.find('p', attrs={'class': 'title'}).string
            date_string = item.find('p', attrs={'class': 'date'}).next
            date = date_string.strip()
            articles.append(
                {
                    'title': title,
                    # Normalize 'July 4, 2022' -> '2022/07/04' for calibre.
                    'date': datetime.strptime(date, '%B %d, %Y').strftime('%Y/%m/%d'),
                    'url': self.index + item.find('a')['href'],
                    'description': '',
                }
            )
        return articles

    def get_whats_new(self):
        """Return the articles listed on the "What's New" page."""
        soup = self.index_to_soup(self.index + '/ajw/new')
        news_section = soup.find('div', attrs={'class': 'specialList'})
        return self._parse_dated_items(news_section)

    def get_top6(self, soup):
        """Return the six featured articles at the top of a section page."""
        top = soup.find('ul', attrs={'class': 'top6'})
        return self._parse_dated_items(top)

    def get_section_news(self, soup):
        """Return the regular article grid of a section page."""
        news_grid = soup.find('ul', attrs={'class': 'default'})
        return self._parse_dated_items(news_grid)

    def get_section(self, section):
        """Fetch ``/ajw/<section>`` and return its top-6 plus grid articles."""
        soup = self.index_to_soup(self.index + '/ajw/' + section)
        section_news_items = self.get_top6(soup)
        section_news_items.extend(self.get_section_news(soup))
        return section_news_items

    def get_special_section(self, section):
        """Return articles from a "special" page.

        Special items carry a description instead of a date: the anchor text
        holds the title on its first line and the description on the second.
        """
        soup = self.index_to_soup(self.index + '/ajw/' + section)
        top = soup.find('div', attrs={'class': 'Section'})

        special_news = []
        for item in top.findAll('li'):
            item_a = item.find('a')

            text_split = item_a.text.strip().split('\n')
            title = text_split[0]
            description = text_split[1].strip()

            special_news.append(
                {
                    'title': title,
                    'date': '',
                    'url': self.index + item_a['href'],
                    'description': description,
                }
            )

        return special_news

    def parse_index(self):
        """Build the list of ``(feed title, articles)`` tuples for calibre."""
        feeds = [
            ("What's New", self.get_whats_new()),
            ('National Report', self.get_section('national_report')),
            ('Politics', self.get_section('politics')),
            ('Business', self.get_section('business')),
            ('Asia & World - China', self.get_section('asia_world/china')),
            ('Asia & World - Korean Peninsula', self.get_section('asia_world/korean_peninsula')),
            ('Asia & World - Around Asia', self.get_section('asia_world/around_asia')),
            ('Asia & World - World', self.get_section('asia_world/world')),
            ('Sci & Tech', self.get_section('sci_tech')),
            ('Culture - Style', self.get_section('culture/style')),
            # ("Culture - Cooking", self.get_section("culture/cooking")),
            ('Culture - Movies', self.get_section('culture/movies')),
            ('Culture - Manga & Anime', self.get_section('culture/manga_anime')),
            ('Travel', self.get_section('travel')),
            ('Sports', self.get_section('sports')),
            ('Opinion - Editorial', self.get_section('opinion/editorial')),
            ('Opinion - Vox Populi', self.get_section('opinion/vox')),
            ('Opinion - Views', self.get_section('opinion/views')),
            ('Special', self.get_special_section('special')),
        ]

        return feeds