#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = "GPL v3"
__copyright__ = (
    "2010, Hiroshi Miura . "
    "2021, Albert Aparicio Isarn "
)

"""
www.mainichi.jp/english
"""

from datetime import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class MainichiEnglishNews(BasicNewsRecipe):
    """Calibre recipe that scrapes the English edition of The Mainichi.

    ``parse_index`` fetches the front page and one index page per section
    and returns a list of ``(feed title, articles)`` tuples.  Every article
    is a dict with the keys ``title``, ``date``, ``url`` and ``description``.

    The scraping helpers are tolerant of missing markup: when an expected
    element is absent they log a warning and contribute no article instead
    of raising, so a layout change in one place does not abort the whole
    download.
    """

    title = u"The Mainichi"
    __author__ = "Albert Aparicio Isarn (old version by Hiroshi Miura)"

    description = "Japanese traditional newspaper Mainichi news in English"
    publisher = "Mainichi News"
    publication_type = "newspaper"
    category = "news, japan"
    language = "en_JP"

    # Base URL of the English edition; every page URL below is built on it.
    index = "http://mainichi.jp/english/"
    masthead_url = index + "images/themainichi.png"

    oldest_article = 2
    max_articles_per_feed = 40
    no_stylesheets = True
    remove_javascript = True

    # Content filters for downloaded article pages (semantics are defined by
    # calibre's BasicNewsRecipe, not by this file).
    remove_tags_before = {"id": "main-cont"}
    remove_tags_after = {"class": "main-text"}
    remove_tags = [{"name": "div", "id": "tools"}, {"name": "div", "class": "sub"}]

    @staticmethod
    def _descend(node, *steps):
        """Follow a chain of ``find`` calls, stopping at the first miss.

        :param node: Starting element (or ``None``).
        :param steps: ``(tag_name, attrs_dict)`` pairs, applied in order.
        :return: The element reached by the last step, or ``None`` if the
            start node or any intermediate element is missing.
        """
        for tag_name, attrs in steps:
            if node is None:
                return None
            node = node.find(tag_name, attrs=attrs)
        return node

    def get_pickup_section(self, soup):
        """Return the highlighted ("pickup") story of a page.

        :param soup: Parsed page.
        :return: A list holding one article dict, or an empty list when the
            page has no pickup section / headline link.
        """
        # Topmost story
        top = soup.find("section", attrs={"class": "pickup section"})
        top_link = self._descend(top, ("p", {"class": "midashi"}), ("a", {}))
        if top_link is None or not top_link.get("href"):
            self.log.warn("Mainichi: pickup story not found, skipping it")
            return []

        today = datetime.now().strftime("%Y/%m/%d")
        date_tag = self._descend(
            soup,
            ("div", {"id": "main"}),
            ("div", {"class": "date-box"}),
            ("p", {"class": "date"}),
        )
        date_text = date_tag.string if date_tag is not None else None
        if date_text is None:
            # If date not present, assume it is from today
            top_date_formatted = today
        else:
            try:
                top_date_formatted = datetime.strptime(
                    date_text.strip(), "%A, %B %d, %Y"
                ).strftime("%Y/%m/%d")
            except ValueError:
                # Present but not in the expected "Weekday, Month DD, YYYY"
                # form: treat it the same way as a missing date.
                top_date_formatted = today

        teaser = top.find("p", attrs={"class": "txt"})
        top_description = teaser.text if teaser is not None else ""

        return [
            {
                "title": top_link.string,
                "date": top_date_formatted,
                # NOTE(review): hrefs are assumed to be protocol-relative
                # ("//host/path"), hence the bare scheme prefix - confirm.
                "url": "https:" + top_link["href"],
                "description": top_description,
            }
        ]

    def retrieve_news_from_column(self, column):
        """Turn the ``li`` entries of a news list into article dicts.

        :param column: Element containing the ``li`` items, or ``None``.
        :return: List of article dicts (``description`` is always empty).
            Items lacking a link, an ``href`` or a title ``span`` are
            skipped; a ``None`` column yields an empty list.
        """
        if column is None:
            self.log.warn("Mainichi: news list not found, feed left empty")
            return []

        column_news = []
        for item in column.findAll("li"):
            link = item.find("a")
            if link is None or not link.get("href"):
                continue
            title_tag = link.find("span")
            if title_tag is None:
                continue

            date_item = link.find("p", attrs={"class": "date"})
            date_text = date_item.string if date_item is not None else None

            column_news.append(
                {
                    "title": title_tag.string,
                    # The site wraps the date in parentheses.
                    "date": date_text.strip("()") if date_text else "",
                    "url": "https:" + link["href"],
                    "description": "",
                }
            )

        return column_news

    def get_top_stories(self, soup):
        """Return the pickup story followed by the front page's main news list.

        :param soup: Parsed front page.
        :return: List of article dicts.
        """
        top_stories = self.get_pickup_section(soup)

        top_news = self._descend(
            soup,
            ("section", {"class": "newslist"}),
            ("div", {"class": "main-box"}),
            ("ul", {}),
        )
        top_stories.extend(self.retrieve_news_from_column(top_news))

        return top_stories

    def get_editor_picks(self, soup):
        """Return the articles of the front page's secondary ("sub-box") list.

        :param soup: Parsed front page.
        :return: List of article dicts.
        """
        news = self._descend(
            soup,
            ("section", {"class": "newslist"}),
            ("div", {"class": "sub-box"}),
            ("ul", {}),
        )
        return list(self.retrieve_news_from_column(news))

    def get_section(self, section):
        """Download one section index page and return its articles.

        :param section: Section slug relative to ``index`` (e.g. ``"japan"``);
            leading/trailing slashes are ignored.
        :return: The section's pickup story followed by its news list.
        """
        # The slug and the file name need an explicit "/" between them;
        # plain concatenation produced ".../english/japanindex.html".
        soup = self.index_to_soup(self.index + section.strip("/") + "/index.html")

        section_news_items = self.get_pickup_section(soup)

        news_columns = self._descend(
            soup,
            ("section", {"class": "newslist section"}),
            ("div", {"class": "col-set"}),
            ("ul", {}),
        )
        section_news_items.extend(self.retrieve_news_from_column(news_columns))

        return section_news_items

    def parse_index(self):
        """Build every feed of the recipe.

        :return: List of ``(feed title, list of article dicts)`` tuples: two
            feeds taken from the front page, then one per section.
        """
        soup = self.index_to_soup(self.index + "index.html")

        # (feed title, section slug) in display order.
        # NOTE: a "Latest Articles" feed (slug "latest") was commented out in
        # the original recipe and is left disabled here.
        sections = (
            ("Japan", "japan"),
            ("World", "world"),
            ("Business", "business"),
            ("Sports", "sports"),
            ("Science", "science"),
            ("Entertainment", "entertainment"),
            ("Opinion", "opinion"),
            ("Lifestyle", "lifestyle"),
            ("Obituaries", "obituaries"),
        )

        feeds = [
            ("Top Stories", self.get_top_stories(soup)),
            ("Editor's Picks", self.get_editor_picks(soup)),
        ]
        for feed_title, slug in sections:
            feeds.append((feed_title, self.get_section(slug)))

        return feeds