Recipes - The Asahi Shimbun (English)

Recipe made from scratch
This commit is contained in:
Albert Aparicio 2022-05-26 09:59:20 +02:00
parent 9d712f55de
commit 358662bf3f

View File

@ -0,0 +1,165 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = "GPL v3"
__copyright__ = "2022, Albert Aparicio Isarn <aaparicio at posteo.net>"
"""
https://www.asahi.com/ajw/
"""
from datetime import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class AsahiShimbunEnglishNews(BasicNewsRecipe):
    """Calibre news recipe for the English edition of The Asahi Shimbun.

    Scrapes https://www.asahi.com/ajw/: the "What's New" page, a fixed set
    of topical sections (each with a "top6" featured list plus a regular
    article grid), and the differently structured "Special" page.
    """

    title = "The Asahi Shimbun"
    __author__ = "Albert Aparicio Isarn"
    description = (
        "The Asahi Shimbun is widely regarded for its journalism as the most "
        "respected daily newspaper in Japan. The English version offers "
        "selected articles from the vernacular Asahi Shimbun, as well as "
        "extensive coverage of cool Japan, focusing on manga, travel and "
        "other timely news"
    )
    publisher = "The Asahi Shimbun Company"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    index = "https://www.asahi.com"
    masthead_url = "https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png"

    oldest_article = 3
    max_articles_per_feed = 40

    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {"id": "MainInner"}
    remove_tags_after = {"class": "ArticleText"}
    remove_tags = [{"name": "div", "class": "SnsUtilityArea"}]

    def _parse_article_item(self, item):
        """Build one feed-article dict from a standard article-list <li>.

        Each such <li> contains a <p class="title">, a <p class="date">
        whose first text node is a date like "May 26, 2022", and an <a>
        with a site-relative href. Returns the dict shape calibre expects
        from parse_index().
        """
        title = item.find("p", attrs={"class": "title"}).string
        date_string = item.find("p", attrs={"class": "date"}).next
        date = date_string.strip()
        url = self.index + item.find("a")["href"]
        return {
            "title": title,
            # Normalize the site's "Month DD, YYYY" format to YYYY/MM/DD.
            "date": datetime.strptime(date, "%B %d, %Y").strftime("%Y/%m/%d"),
            "url": url,
            "description": "",
        }

    def get_whats_new(self):
        """Return the articles listed on the "What's New" page (/ajw/new)."""
        soup = self.index_to_soup(self.index + "/ajw/new")
        news_section = soup.find("div", attrs={"class": "specialList"})
        return [self._parse_article_item(item) for item in news_section.findAll("li")]

    def get_top6(self, soup):
        """Return the six featured articles at the top of a section page."""
        top = soup.find("ul", attrs={"class": "top6"})
        return [self._parse_article_item(item) for item in top.findAll("li")]

    def get_section_news(self, soup):
        """Return the articles from the regular grid of a section page."""
        news_grid = soup.find("ul", attrs={"class": "default"})
        return [self._parse_article_item(item) for item in news_grid.findAll("li")]

    def get_section(self, section):
        """Return all articles (featured + grid) for one /ajw/<section> page."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        section_news_items = self.get_top6(soup)
        section_news_items.extend(self.get_section_news(soup))
        return section_news_items

    def get_special_section(self, section):
        """Return articles from the "Special" page, whose markup differs
        from the regular sections (no date, description inside the link)."""
        soup = self.index_to_soup(self.index + "/ajw/" + section)
        top = soup.find("div", attrs={"class": "Section"})

        special_news = []
        for item in top.findAll("li"):
            item_a = item.find("a")
            # The anchor text carries the title on its first line and the
            # description on the second.
            text_split = item_a.text.strip().split("\n")
            title = text_split[0]
            description = text_split[1].strip()
            special_news.append(
                {
                    "title": title,
                    "date": "",
                    "url": self.index + item_a["href"],
                    "description": description,
                }
            )
        return special_news

    def parse_index(self):
        """Assemble the feed list: (feed title, list of article dicts)."""
        return [
            ("What's New", self.get_whats_new()),
            ("National Report", self.get_section("national_report")),
            ("Politics", self.get_section("politics")),
            ("Business", self.get_section("business")),
            ("Asia & World - China", self.get_section("asia_world/china")),
            ("Asia & World - Korean Peninsula", self.get_section("asia_world/korean_peninsula")),
            ("Asia & World - Around Asia", self.get_section("asia_world/around_asia")),
            ("Asia & World - World", self.get_section("asia_world/world")),
            ("Sci & Tech", self.get_section("sci_tech")),
            ("Culture - Style", self.get_section("culture/style")),
            ("Culture - Cooking", self.get_section("culture/cooking")),
            ("Culture - Movies", self.get_section("culture/movies")),
            ("Culture - Manga & Anime", self.get_section("culture/manga_anime")),
            ("Travel", self.get_section("travel")),
            ("Sports", self.get_section("sports")),
            ("Opinion - Editorial", self.get_section("opinion/editorial")),
            ("Opinion - Vox Populi", self.get_section("opinion/vox")),
            ("Opinion - Views", self.get_section("opinion/views")),
            ("Special", self.get_special_section("special")),
        ]