mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Asahi Shimbun by Albert Aparicio Isarn
This commit is contained in:
commit
0f2e921ff1
167
recipes/asahi_shimbun_en.recipe
Normal file
167
recipes/asahi_shimbun_en.recipe
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
__license__ = "GPL v3"
|
||||||
|
__copyright__ = "2022, Albert Aparicio Isarn <aaparicio at posteo.net>"
|
||||||
|
|
||||||
|
"""
|
||||||
|
https://www.asahi.com/ajw/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
|
class AsahiShimbunEnglishNews(BasicNewsRecipe):
    """Fetch selected English-language articles from The Asahi Shimbun
    (https://www.asahi.com/ajw/) and assemble them into calibre feeds,
    one feed per site section."""

    title = "The Asahi Shimbun"
    __author__ = "Albert Aparicio Isarn"

    description = (
        "The Asahi Shimbun is widely regarded for its journalism as the most respected daily newspaper in Japan."
        " The English version offers selected articles from the vernacular Asahi Shimbun, as well as extensive"
        " coverage of cool Japan, focusing on manga, travel and other timely news."
    )
    publisher = "The Asahi Shimbun Company"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    # Base URL; all article and section links on the site are root-relative.
    index = "https://www.asahi.com"
    masthead_url = "https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png"

    oldest_article = 3
    max_articles_per_feed = 40
    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {"id": "MainInner"}
    remove_tags_after = {"class": "ArticleText"}
    remove_tags = [{"name": "div", "class": "SnsUtilityArea"}]

    def _parse_dated_item(self, item):
        """Build one article dict from an <li> that holds a title paragraph,
        a date paragraph and a link.

        Returns None when any expected part is missing, so one stray list
        element does not abort the whole feed with an AttributeError.
        """
        title_tag = item.find("p", attrs={"class": "title"})
        date_tag = item.find("p", attrs={"class": "date"})
        link = item.find("a")
        if title_tag is None or date_tag is None or link is None:
            return None
        date = date_tag.next.strip()
        return {
            "title": title_tag.string,
            # The site renders dates like "July 9, 2025"; normalize to "YYYY/MM/DD".
            "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
            "url": self.index + link["href"],
            "description": "",
        }

    def _parse_item_list(self, container):
        """Collect article dicts from every <li> under *container*.

        Accepts None (section markup missing) and returns an empty list, and
        silently skips malformed items.  Shared by the What's New, top6 and
        section-grid parsers, which only differ in the container they locate.
        """
        if container is None:
            return []
        parsed = (self._parse_dated_item(li) for li in container.findAll("li"))
        return [entry for entry in parsed if entry is not None]

    def get_whats_new(self):
        """Return the articles listed on the "What's New" page."""
        soup = self.index_to_soup(self.index + "/ajw/new")
        return self._parse_item_list(soup.find("div", attrs={"class": "specialList"}))

    def get_top6(self, soup):
        """Return the six featured articles at the top of a section page."""
        return self._parse_item_list(soup.find("ul", attrs={"class": "top6"}))

    def get_section_news(self, soup):
        """Return the regular article grid of a section page."""
        return self._parse_item_list(soup.find("ul", attrs={"class": "default"}))

    def get_section(self, section):
        """Return all articles (featured + grid) of one section, e.g. "politics"."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        articles = self.get_top6(soup)
        articles.extend(self.get_section_news(soup))
        return articles

    def get_special_section(self, section):
        """Return the articles of a "special" page, whose items carry a
        description blurb instead of a publication date."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        top = soup.find("div", attrs={"class": "Section"})
        if top is None:
            return []

        special_news = []
        for item in top.findAll("li"):
            item_a = item.find("a")
            if item_a is None:
                continue
            # First line of the anchor text is the title, second is the blurb.
            text_split = item_a.text.strip().split("\n")
            if len(text_split) < 2:
                continue
            special_news.append({
                "title": text_split[0],
                "date": "",
                "url": self.index + item_a["href"],
                "description": text_split[1].strip(),
            })

        return special_news

    def parse_index(self):
        """Build the full feed list: one (feed title, article list) pair per section."""
        feeds = [
            ("What's New", self.get_whats_new()),
            ("National Report", self.get_section("national_report")),
            ("Politics", self.get_section("politics")),
            ("Business", self.get_section("business")),
            ("Asia & World - China", self.get_section("asia_world/china")),
            ("Asia & World - Korean Peninsula", self.get_section("asia_world/korean_peninsula")),
            ("Asia & World - Around Asia", self.get_section("asia_world/around_asia")),
            ("Asia & World - World", self.get_section("asia_world/world")),
            ("Sci & Tech", self.get_section("sci_tech")),
            ("Culture - Style", self.get_section("culture/style")),
            ("Culture - Cooking", self.get_section("culture/cooking")),
            ("Culture - Movies", self.get_section("culture/movies")),
            ("Culture - Manga & Anime", self.get_section("culture/manga_anime")),
            ("Travel", self.get_section("travel")),
            ("Sports", self.get_section("sports")),
            ("Opinion - Editorial", self.get_section("opinion/editorial")),
            ("Opinion - Vox Populi", self.get_section("opinion/vox")),
            ("Opinion - Views", self.get_section("opinion/views")),
            ("Special", self.get_special_section("special")),
        ]

        return feeds
|
Loading…
x
Reference in New Issue
Block a user