From 14ce0c41f1b38de02f90319e3d7421369690c75a Mon Sep 17 00:00:00 2001
From: Albert Aparicio
Date: Fri, 18 Feb 2022 11:33:09 +0100
Subject: [PATCH] Recipes - Redo Mainichi (English version) from scratch

The present version of the Mainichi (en) recipe was not working. I remade it
from scratch, taking the current layout of the Mainichi website into account.
---
 recipes/mainichi_en.recipe | 179 +++++++++++++++++++++++++++----------
 1 file changed, 131 insertions(+), 48 deletions(-)

diff --git a/recipes/mainichi_en.recipe b/recipes/mainichi_en.recipe
index f78d262c6e..990f4e392e 100644
--- a/recipes/mainichi_en.recipe
+++ b/recipes/mainichi_en.recipe
@@ -1,64 +1,147 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, Hiroshi Miura'
-'''
-www.mainichi.jp
-'''
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
-import re
+__license__ = "GPL v3"
+__copyright__ = (
+    "2010, Hiroshi Miura. "
+    "2021, Albert Aparicio Isarn"
+)
+
+"""
+www.mainichi.jp/english
+"""
+
+from datetime import datetime
+from typing import Dict, List
+
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
 class MainichiEnglishNews(BasicNewsRecipe):
-    title = u'The Mainichi'
-    __author__ = 'Hiroshi Miura'
+    title = u"The Mainichi"
+    __author__ = "Albert Aparicio Isarn (old version by Hiroshi Miura)"
+
+    description = "Japanese traditional newspaper Mainichi news in English"
+    publisher = "Mainichi News"
+    publication_type = "newspaper"
+    category = "news, japan"
+    language = "en_JP"
+
+    index = "http://mainichi.jp/english/"
+    masthead_url = index + "images/themainichi.png"
+
     oldest_article = 2
     max_articles_per_feed = 40
-    description = 'Japanese traditional newspaper Mainichi news in English'
-    publisher = 'Mainichi News'
-    category = 'news, japan'
-    language = 'en_JP'
-    index = 'http://mainichi.jp/english/english/index.html'
+
+    no_stylesheets = True
     remove_javascript = True
-    masthead_url = 'http://mainichi.jp/english/images/themainichi.png'
 
-    remove_tags_before = {'class': "NewsTitle"}
-    remove_tags_after = {'class': "NewsBody clr"}
+    remove_tags_before = {"id": "main-cont"}
+    remove_tags_after = {"class": "main-text"}
+    remove_tags = [
+        {"name": "div", "attrs": {"id": "tools"}},
+        {"name": "div", "attrs": {"class": "sub"}},
+    ]
 
-    def parse_feeds(self):
+    def get_pickup_section(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
+        # Topmost story
+        top = soup.find("section", attrs={"class": "pickup section"})
+        top_link = top.find("p", attrs={"class": "midashi"}).find("a")
 
-        feeds = BasicNewsRecipe.parse_feeds(self)
+        try:
+            top_date = (
+                soup.find("div", attrs={"id": "main"})
+                .find("div", attrs={"class": "date-box"})
+                .find("p", attrs={"class": "date"})
+                .string
+            )
 
-        for curfeed in feeds:
-            delList = []
-            for a, curarticle in enumerate(curfeed.articles):
-                if re.search(r'pheedo.jp', curarticle.url):
-                    delList.append(curarticle)
-                if re.search(r'rssad.jp', curarticle.url):
-                    delList.append(curarticle)
-            if len(delList) > 0:
-                for d in delList:
-                    index = curfeed.articles.index(d)
-                    curfeed.articles[index:index + 1] = []
+            top_date_formatted = datetime.strptime(
+                top_date, "%A, %B %d, %Y"
+            ).strftime("%Y/%m/%d")
+        except AttributeError:
+            # If the date is not present, assume the story is from today
+            top_date_formatted = datetime.now().strftime("%Y/%m/%d")
 
-        return feeds
+        top_description = top.find("p", attrs={"class": "txt"}).text
+
+        return [
+            {
+                "title": top_link.string,
+                "date": top_date_formatted,
+                "url": "https:" + top_link["href"],
+                "description": top_description,
+            }
+        ]
+
+    def retrieve_news_from_column(self, column) -> List[Dict[str, str]]:
+        column_news = []
+
+        for item in column.findAll("li"):
+            if item:
+                itema = item.find("a")
+                date_item = itema.find("p", attrs={"class": "date"})
+
+                column_news.append(
+                    {
+                        "title": itema.find("span").string,
+                        "date": date_item.string.strip("()") if date_item else "",
+                        "url": "https:" + itema["href"],
+                        "description": "",
+                    }
+                )
+
+        return column_news
+
+    def get_top_stories(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
+        top_stories = self.get_pickup_section(soup)
+
+        news_section = soup.find("section", attrs={"class": "newslist"})
+        top_news = news_section.find("div", attrs={"class": "main-box"}).find("ul")
+
+        top_stories.extend(self.retrieve_news_from_column(top_news))
+
+        return top_stories
+
+    def get_editor_picks(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
+        editor_picks = []
+
+        news_section = soup.find("section", attrs={"class": "newslist"})
+        news = news_section.find("div", attrs={"class": "sub-box"}).find("ul")
+
+        editor_picks.extend(self.retrieve_news_from_column(news))
+
+        return editor_picks
+
+    def get_section(self, section: str) -> List[Dict[str, str]]:
+        soup: BeautifulSoup = self.index_to_soup(self.index + section + "/index.html")
+
+        section_news_items = self.get_pickup_section(soup)
+
+        news_columns = (
+            soup.find("section", attrs={"class": "newslist section"})
+            .find("div", attrs={"class": "col-set"})
+            .find("ul")
+        )
+
+        section_news_items.extend(self.retrieve_news_from_column(news_columns))
+
+        return section_news_items
 
     def parse_index(self):
-        feeds = []
-        soup = self.index_to_soup(self.index)
-        for section in soup.findAll('section'):
-            newsarticles = []
-            section_name = 'news'
-            hds = section.find('div', attrs={'class': 'CategoryHead clr'})
-            if hds:
-                section_item = hds.find('h1')
-                if section_item:
-                    section_name = section_item.find('a').string
-            items = section.find('ul', attrs={'class': 'MaiLink'})
-            for item in items.findAll('li'):
-                if item:
-                    itema = item.find('a')
-                    newsarticles.append({
-                        'title': itema.string, 'date': '', 'url': itema['href'], 'description': ''
-                    })
-            feeds.append((section_name, newsarticles))
+        soup: BeautifulSoup = self.index_to_soup(self.index + "index.html")
+
+        feeds = [
+            ("Top Stories", self.get_top_stories(soup)),
+            ("Editor's Picks", self.get_editor_picks(soup)),
+            # ("Latest Articles", self.get_section("latest")),
+            ("Japan", self.get_section("japan")),
+            ("World", self.get_section("world")),
+            ("Business", self.get_section("business")),
+            ("Sports", self.get_section("sports")),
+            ("Science", self.get_section("science")),
+            ("Entertainment", self.get_section("entertainment")),
+            ("Opinion", self.get_section("opinion")),
+            ("Lifestyle", self.get_section("lifestyle")),
+            ("Obituaries", self.get_section("obituaries")),
+        ]
+
         return feeds
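
The date normalization in get_pickup_section() can be sanity-checked in
isolation. A minimal sketch: the sample string below is a hypothetical input
in the "Weekday, Month DD, YYYY" masthead format the recipe assumes, not one
taken from the live site:

    from datetime import datetime

    # Same conversion as get_pickup_section(): masthead format -> YYYY/MM/DD
    top_date = "Friday, February 18, 2022"  # hypothetical sample input
    print(datetime.strptime(top_date, "%A, %B %d, %Y").strftime("%Y/%m/%d"))
    # -> 2022/02/18

The full recipe can be exercised against the live site with calibre's
documented recipe-development invocation, which fetches only a couple of
articles per feed and keeps the intermediate output for inspection:

    ebook-convert mainichi_en.recipe .epub --test -vv --debug-pipeline debug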