#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = "GPL v3"
__copyright__ = "2022, Albert Aparicio Isarn"

"""
https://www.asahi.com/ajw/
"""

from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class AsahiShimbunEnglishNews(BasicNewsRecipe):
    title = "The Asahi Shimbun"
    __author__ = "Albert Aparicio Isarn"

    description = ("The Asahi Shimbun is widely regarded for its journalism as the most respected daily newspaper in Japan."
                   " The English version offers selected articles from the vernacular Asahi Shimbun, as well as extensive"
                   " coverage of cool Japan, focusing on manga, travel and other timely news.")
    publisher = "The Asahi Shimbun Company"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    index = "https://www.asahi.com"
    masthead_url = "https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png"

    oldest_article = 3
    max_articles_per_feed = 40

    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {"id": "MainInner"}
    remove_tags_after = {"class": "ArticleText"}
    remove_tags = [{"name": "div", "class": "SnsUtilityArea"}]

    def get_whats_new(self):
        # Articles listed under "What's New" at /ajw/new
        soup = self.index_to_soup(self.index + "/ajw/new")
        news_section = soup.find("div", attrs={"class": "specialList"})

        new_news = []

        for item in news_section.findAll("li"):
            title = item.find("p", attrs={"class": "title"}).string
            date_string = item.find("p", attrs={"class": "date"}).next
            date = date_string.strip()
            url = self.index + item.find("a")["href"]

            new_news.append(
                {
                    "title": title,
                    "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
                    "url": url,
                    "description": "",
                }
            )

        return new_news

    def get_top6(self, soup):
        # The six headline articles shown at the top of a section page
        top = soup.find("ul", attrs={"class": "top6"})

        top6_news = []

        for item in top.findAll("li"):
            title = item.find("p", attrs={"class": "title"}).string
            date_string = item.find("p", attrs={"class": "date"}).next
            date = date_string.strip()
            url = self.index + item.find("a")["href"]

            top6_news.append(
                {
                    "title": title,
                    "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
                    "url": url,
                    "description": "",
                }
            )

        return top6_news

    def get_section_news(self, soup):
        # The remaining articles in a section's default listing grid
        news_grid = soup.find("ul", attrs={"class": "default"})

        news = []

        for item in news_grid.findAll("li"):
            title = item.find("p", attrs={"class": "title"}).string
            date_string = item.find("p", attrs={"class": "date"}).next
            date = date_string.strip()
            url = self.index + item.find("a")["href"]

            news.append(
                {
                    "title": title,
                    "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
                    "url": url,
                    "description": "",
                }
            )

        return news

    def get_section(self, section):
        soup = self.index_to_soup(self.index + "/ajw/" + section)

        section_news_items = self.get_top6(soup)
        section_news_items.extend(self.get_section_news(soup))

        return section_news_items

    def get_special_section(self, section):
        # "Special" pages use a different layout: title and description share the link text
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        top = soup.find("div", attrs={"class": "Section"})

        special_news = []

        for item in top.findAll("li"):
            item_a = item.find("a")

            text_split = item_a.text.strip().split("\n")
            title = text_split[0]
            description = text_split[1].strip()
            url = self.index + item_a["href"]

            special_news.append(
                {
                    "title": title,
                    "date": "",
                    "url": url,
                    "description": description,
                }
            )

        return special_news

    def parse_index(self):
        # soup = self.index_to_soup(self.index)

        feeds = [
            ("What's New", self.get_whats_new()),
            ("National Report", self.get_section("national_report")),
            ("Politics", self.get_section("politics")),
            ("Business", self.get_section("business")),
            ("Asia & World - China", self.get_section("asia_world/china")),
            ("Asia & World - Korean Peninsula", self.get_section("asia_world/korean_peninsula")),
            ("Asia & World - Around Asia", self.get_section("asia_world/around_asia")),
            ("Asia & World - World", self.get_section("asia_world/world")),
            ("Sci & Tech", self.get_section("sci_tech")),
            ("Culture - Style", self.get_section("culture/style")),
            # ("Culture - Cooking", self.get_section("culture/cooking")),
            ("Culture - Movies", self.get_section("culture/movies")),
            ("Culture - Manga & Anime", self.get_section("culture/manga_anime")),
            ("Travel", self.get_section("travel")),
            ("Sports", self.get_section("sports")),
            ("Opinion - Editorial", self.get_section("opinion/editorial")),
            ("Opinion - Vox Populi", self.get_section("opinion/vox")),
            ("Opinion - Views", self.get_section("opinion/views")),
            ("Special", self.get_special_section("special")),
        ]

        return feeds