Recipes - The Asahi Shimbun (English)

Recipe made from scratch
This commit is contained in:
Albert Aparicio 2022-05-26 09:59:20 +02:00
parent 9d712f55de
commit 358662bf3f

View File

@ -0,0 +1,165 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = "GPL v3"
__copyright__ = "2022, Albert Aparicio Isarn <aaparicio at posteo.net>"
"""
https://www.asahi.com/ajw/
"""
from datetime import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class AsahiShimbunEnglishNews(BasicNewsRecipe):
    """Calibre news recipe for the English edition of The Asahi Shimbun.

    Scrapes https://www.asahi.com/ajw/: the "What's New" page, a fixed set
    of topical sections (each with a "top6" featured list plus a regular
    article grid), and the differently structured "Special" page.
    """

    title = "The Asahi Shimbun"
    __author__ = "Albert Aparicio Isarn"
    description = (
        "The Asahi Shimbun is widely regarded for its journalism as the most "
        "respected daily newspaper in Japan. The English version offers "
        "selected articles from the vernacular Asahi Shimbun, as well as "
        "extensive coverage of cool Japan, focusing on manga, travel and "
        "other timely news"
    )
    publisher = "The Asahi Shimbun Company"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    index = "https://www.asahi.com"
    masthead_url = "https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png"

    oldest_article = 3
    max_articles_per_feed = 40

    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {"id": "MainInner"}
    remove_tags_after = {"class": "ArticleText"}
    remove_tags = [{"name": "div", "class": "SnsUtilityArea"}]

    def _parse_article_item(self, item):
        """Build one feed-article dict from a standard article-list <li>.

        Each such <li> contains a <p class="title">, a <p class="date">
        whose first text node is a date like "May 26, 2022", and an <a>
        with a site-relative href. Returns the dict shape calibre expects
        from parse_index().
        """
        title = item.find("p", attrs={"class": "title"}).string
        date_string = item.find("p", attrs={"class": "date"}).next
        date = date_string.strip()
        url = self.index + item.find("a")["href"]
        return {
            "title": title,
            # Normalize the site's "Month DD, YYYY" format to YYYY/MM/DD.
            "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
            "url": url,
            "description": "",
        }

    def get_whats_new(self):
        """Return the articles listed on the "What's New" page (/ajw/new)."""
        soup = self.index_to_soup(self.index + "/ajw/new")
        news_section = soup.find("div", attrs={"class": "specialList"})
        return [self._parse_article_item(item) for item in news_section.findAll("li")]

    def get_top6(self, soup):
        """Return the six featured articles at the top of a section page."""
        top = soup.find("ul", attrs={"class": "top6"})
        return [self._parse_article_item(item) for item in top.findAll("li")]

    def get_section_news(self, soup):
        """Return the articles from the regular grid of a section page."""
        news_grid = soup.find("ul", attrs={"class": "default"})
        return [self._parse_article_item(item) for item in news_grid.findAll("li")]

    def get_section(self, section):
        """Return all articles (featured + grid) for one /ajw/<section> page."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        section_news_items = self.get_top6(soup)
        section_news_items.extend(self.get_section_news(soup))
        return section_news_items

    def get_special_section(self, section):
        """Return articles from the "Special" page, whose markup differs
        from the regular sections (no date, description inside the link)."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        top = soup.find("div", attrs={"class": "Section"})

        special_news = []
        for item in top.findAll("li"):
            item_a = item.find("a")
            # The anchor text carries the title on its first line and the
            # description on the second.
            text_split = item_a.text.strip().split("\n")
            title = text_split[0]
            description = text_split[1].strip()
            special_news.append(
                {
                    "title": title,
                    "date": "",
                    "url": self.index + item_a["href"],
                    "description": description,
                }
            )
        return special_news

    def parse_index(self):
        """Assemble the feed list: (feed title, list of article dicts)."""
        return [
            ("What's New", self.get_whats_new()),
            ("National Report", self.get_section("national_report")),
            ("Politics", self.get_section("politics")),
            ("Business", self.get_section("business")),
            ("Asia & World - China", self.get_section("asia_world/china")),
            ("Asia & World - Korean Peninsula", self.get_section("asia_world/korean_peninsula")),
            ("Asia & World - Around Asia", self.get_section("asia_world/around_asia")),
            ("Asia & World - World", self.get_section("asia_world/world")),
            ("Sci & Tech", self.get_section("sci_tech")),
            ("Culture - Style", self.get_section("culture/style")),
            ("Culture - Cooking", self.get_section("culture/cooking")),
            ("Culture - Movies", self.get_section("culture/movies")),
            ("Culture - Manga & Anime", self.get_section("culture/manga_anime")),
            ("Travel", self.get_section("travel")),
            ("Sports", self.get_section("sports")),
            ("Opinion - Editorial", self.get_section("opinion/editorial")),
            ("Opinion - Vox Populi", self.get_section("opinion/vox")),
            ("Opinion - Views", self.get_section("opinion/views")),
            ("Special", self.get_special_section("special")),
        ]