calibre/recipes/asahi_shimbun_en.recipe
2025-01-24 11:14:14 +01:00

168 lines
5.6 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2022, Albert Aparicio Isarn <aaparicio at posteo.net>'
'''
https://www.asahi.com/ajw/
'''
from datetime import datetime
from calibre.web.feeds.news import BasicNewsRecipe
class AsahiShimbunEnglishNews(BasicNewsRecipe):
    """Calibre recipe for the English edition of The Asahi Shimbun.

    Builds the feed list by scraping the section index pages under
    https://www.asahi.com/ajw/ rather than RSS: each section page is
    fetched with ``index_to_soup`` and its article lists are parsed into
    calibre article dicts (title, date, url, description).
    """

    title = 'The Asahi Shimbun'
    __author__ = 'Albert Aparicio Isarn'
    description = ('The Asahi Shimbun is widely regarded for its journalism as the most respected daily newspaper in Japan.'
                   ' The English version offers selected articles from the vernacular Asahi Shimbun, as well as extensive'
                   ' coverage of cool Japan,focusing on manga, travel and other timely news.')
    publisher = 'The Asahi Shimbun Company'
    publication_type = 'newspaper'
    category = 'news, japan'
    language = 'en_JP'

    index = 'https://www.asahi.com'
    masthead_url = 'https://p.potaufeu.asahi.com/ajw/css/images/en_logo@2x.png'

    oldest_article = 3
    max_articles_per_feed = 40

    no_stylesheets = True
    remove_javascript = True

    remove_tags_before = {'id': 'MainInner'}
    remove_tags_after = {'class': 'ArticleText'}
    remove_tags = [{'name': 'div', 'class': 'SnsUtilityArea'}]

    def _parse_news_list(self, container):
        """Parse the ``<li>`` article entries found inside *container*.

        Each entry is expected to hold a ``<p class="title">``, a
        ``<p class="date">`` and an ``<a>`` with a site-relative href.
        Returns a list of calibre article dicts; dates are normalized
        from e.g. "January 24, 2025" to "2025/01/24".
        """
        articles = []
        for item in container.findAll('li'):
            title = item.find('p', attrs={'class': 'title'}).string
            # .next grabs the first text node of the date paragraph,
            # skipping any nested markup.
            date_string = item.find('p', attrs={'class': 'date'}).next
            date = date_string.strip()
            url = self.index + item.find('a')['href']
            articles.append(
                {
                    'title': title,
                    'date': datetime.strptime(date, '%B %d, %Y').strftime('%Y/%m/%d'),
                    'url': url,
                    'description': '',
                }
            )
        return articles

    def get_whats_new(self):
        """Return the articles listed on the "What's New" page."""
        soup = self.index_to_soup(self.index + '/ajw/new')
        news_section = soup.find('div', attrs={'class': 'specialList'})
        return self._parse_news_list(news_section)

    def get_top6(self, soup):
        """Return the six featured articles at the top of a section page."""
        top = soup.find('ul', attrs={'class': 'top6'})
        return self._parse_news_list(top)

    def get_section_news(self, soup):
        """Return the regular article grid of a section page."""
        news_grid = soup.find('ul', attrs={'class': 'default'})
        return self._parse_news_list(news_grid)

    def get_section(self, section):
        """Fetch a section page and return its top-6 plus grid articles."""
        soup = self.index_to_soup(self.index + '/ajw/' + section)
        section_news_items = self.get_top6(soup)
        section_news_items.extend(self.get_section_news(soup))
        return section_news_items

    def get_special_section(self, section):
        """Parse a "special" page, whose entries carry a description but no date."""
        soup = self.index_to_soup(self.index + '/ajw/' + section)
        top = soup.find('div', attrs={'class': 'Section'})

        special_news = []
        for item in top.findAll('li'):
            item_a = item.find('a')

            # The anchor text holds the title on the first line and the
            # description on the second.
            text_split = item_a.text.strip().split('\n')
            title = text_split[0]
            description = text_split[1].strip()
            url = self.index + item_a['href']

            special_news.append(
                {
                    'title': title,
                    'date': '',
                    'url': url,
                    'description': description,
                }
            )
        return special_news

    def parse_index(self):
        """Assemble the full feed list: one (name, articles) tuple per section."""
        feeds = [
            ("What's New", self.get_whats_new()),
            ('National Report', self.get_section('national_report')),
            ('Politics', self.get_section('politics')),
            ('Business', self.get_section('business')),
            ('Asia & World - China', self.get_section('asia_world/china')),
            ('Asia & World - Korean Peninsula', self.get_section('asia_world/korean_peninsula')),
            ('Asia & World - Around Asia', self.get_section('asia_world/around_asia')),
            ('Asia & World - World', self.get_section('asia_world/world')),
            ('Sci & Tech', self.get_section('sci_tech')),
            ('Culture - Style', self.get_section('culture/style')),
            # ("Culture - Cooking", self.get_section("culture/cooking")),
            ('Culture - Movies', self.get_section('culture/movies')),
            ('Culture - Manga & Anime', self.get_section('culture/manga_anime')),
            ('Travel', self.get_section('travel')),
            ('Sports', self.get_section('sports')),
            ('Opinion - Editorial', self.get_section('opinion/editorial')),
            ('Opinion - Vox Populi', self.get_section('opinion/vox')),
            ('Opinion - Views', self.get_section('opinion/views')),
            ('Special', self.get_special_section('special')),
        ]

        return feeds