calibre/recipes/mainichi_en.recipe
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = "GPL v3"
__copyright__ = (
    "2010, Hiroshi Miura <miurahr@linux.com>. "
    "2021, Albert Aparicio Isarn <aaparicio at posteo.net>"
)

"""
www.mainichi.jp/english
"""

# Recipes have to run on the old Python 2 based calibre as well, so no type
# annotations.

from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class MainichiEnglishNews(BasicNewsRecipe):
title = u"The Mainichi"
__author__ = "Albert Aparicio Isarn (old version by Hiroshi Miura)"
description = "Japanese traditional newspaper Mainichi news in English"
publisher = "Mainichi News"
publication_type = "newspaper"
category = "news, japan"
language = "en_JP"
index = "http://mainichi.jp/english/"
masthead_url = index + "images/themainichi.png"
oldest_article = 2
max_articles_per_feed = 40
no_stylesheets = True
remove_javascript = True
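
    # Keep only the markup between the "main-cont" and "main-text" blocks,
    # then drop the "tools" and "sub" divs that survive inside that slice.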
remove_tags_before = {"id": "main-cont"}
remove_tags_after = {"class": "main-text"}
remove_tags = [{"name": "div", "id": "tools"}, {"name": "div", "class": "sub"}]

    def get_pickup_section(self, soup):
# Topmost story
top = soup.find("section", attrs={"class": "pickup section"})
top_link = top.find("p", attrs={"class": "midashi"}).find("a")
try:
top_date = (
soup.find("div", attrs={"id": "main"})
.find("div", attrs={"class": "date-box"})
.find("p", attrs={"class": "date"})
.string
)
top_date_formatted = datetime.strptime(top_date, "%A, %B %d, %Y").strftime("%Y/%m/%d")
except AttributeError:
# If date not present, assume it is from today
top_date_formatted = datetime.now().strftime("%Y/%m/%d")
top_description = top.find("p", attrs={"class": "txt"}).text
return [
{
"title": top_link.string,
"date": top_date_formatted,
"url": "https:" + top_link["href"],
"description": top_description,
}
]
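
    # Shared helper for the plain news lists: each <li> wraps a link whose
    # <span> is the headline and whose optional <p class="date"> is the
    # timestamp.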
def retrieve_news_from_column(self, column):
column_news = []
for item in column.findAll("li"):
if item:
itema = item.find("a")
date_item = itema.find("p", attrs={"class": "date"})
column_news.append(
{
"title": itema.find("span").string,
"date": date_item.string.strip("") if date_item else "",
"url": "https:" + itema["href"],
"description": "",
}
)
return column_news

    def get_top_stories(self, soup):
top_stories = self.get_pickup_section(soup)
news_section = soup.find("section", attrs={"class": "newslist"})
top_news = news_section.find("div", attrs={"class": "main-box"}).find("ul")
top_stories.extend(self.retrieve_news_from_column(top_news))
return top_stories

    def get_editor_picks(self, soup):
editor_picks = []
news_section = soup.find("section", attrs={"class": "newslist"})
news = news_section.find("div", attrs={"class": "sub-box"}).find("ul")
editor_picks.extend(self.retrieve_news_from_column(news))
return editor_picks

    def get_section(self, section):
        soup = self.index_to_soup(self.index + section + "/index.html")
section_news_items = self.get_pickup_section(soup)
news_columns = (
soup.find("section", attrs={"class": "newslist section"})
.find("div", attrs={"class": "col-set"})
.find("ul")
)
section_news_items.extend(self.retrieve_news_from_column(news_columns))
return section_news_items
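
    # parse_index must return a list of (section title, article list) tuples;
    # each article is a dict with "title", "url", "date" and "description"
    # keys, which is the shape the helpers above produce.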
def parse_index(self):
soup = self.index_to_soup(self.index + "index.html")
feeds = [
("Top Stories", self.get_top_stories(soup)),
("Editor's Picks", self.get_editor_picks(soup)),
# ("Latest Articles", self.get_section(self.index + "latest"+"index.html")),
("Japan", self.get_section("japan")),
("World", self.get_section("world")),
("Business", self.get_section("business")),
("Sports", self.get_section("sports")),
("Science", self.get_section("science")),
("Entertainment", self.get_section("entertainment")),
("Opinion", self.get_section("opinion")),
("Lifestyle", self.get_section("lifestyle")),
("Obituaries", self.get_section("obituaries")),
]
return feeds
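
# A quick way to smoke-test this recipe outside the calibre GUI ("--test"
# fetches only a couple of articles per feed):
#
#   ebook-convert mainichi_en.recipe .epub --test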