#!/usr/bin/env python
import json

from html5_parser import parse

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes


def absurl(url):
    # The site uses root-relative links; make them absolute.
    if url.startswith('/'):
        url = 'https://asia.nikkei.com' + url
    return url


class Nikkei(BasicNewsRecipe):
    title = 'Nikkei Asia Magazine'
    __author__ = 'unkn0wn'
    language = 'en'
    no_stylesheets = True
    description = (
        'The voice of the Asian century. Trusted independent journalism '
        'from Asia, the center of global growth.'
    )
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/2/2f/Nikkei_Asia_logo.svg'
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    remove_empty_feeds = True
    encoding = 'utf-8'
    use_embedded_content = False

    extra_css = """
        .subhead { font-style:italic; color:#202020; }
        em, blockquote { color:#202020; }
        .sec, .byline { font-size:small; font-weight:bold; }
        .article__image, .article__caption { font-size:small; text-align:center; }
    """

    recipe_specific_options = {
        'date': {
            'short': 'The edition date (YYYY-MM-DD format)',
            'long': 'For example, 2024-09-19',
        }
    }

    remove_tags = [dict(name='svg')]

    def parse_index(self):
        d = self.recipe_specific_options.get('date')
        if d and isinstance(d, str):
            url = 'https://asia.nikkei.com/Print-Edition/Issue-' + d
        else:
            # No date given: use the most recent issue from the archives page.
            archives = self.index_to_soup(
                'https://asia.nikkei.com/Print-Edition/Archives'
            )
            card = archives.find(
                **prefixed_classes('MagazineIssueCardArchives_magazineIssueCardContent__')
            )
            url = absurl(card.a['href'])

        self.timefmt = f' [{url.split("Issue-")[-1]}]'
        self.title = 'Nikkei Asia'
        self.log(self.title, self.timefmt)
        soup = self.index_to_soup(url)
        self.cover_url = (
            soup.find(
                **prefixed_classes('MagazineIssueCard_magazineIssueCardCoverImage__')
            )['src'].split('?')[0]
            + '?width=600&source=nar-cms'
        )

        ans = []
        grid = soup.find(**prefixed_classes('MagazineArticles_magazineArticlesGrid__'))
        # prefixed_classes matches CSS-module class names by prefix, since the
        # hashed suffix changes between site builds; a space-separated list
        # matches elements carrying either prefix.
        for a in grid.findAll(
            **prefixed_classes(
                'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardHeadline__ '
                'StreamArticleCard_streamArticleCardHeadline__'
            )
        ):
            title = self.tag_to_string(a)
            url = absurl(a.a['href'])
            desc = ''
            exc = a.findNext(
                **prefixed_classes(
                    'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardSubheadWrapper__ '
                    'StreamArticleCard_streamArticleCardSubhead__'
                )
            )
            if exc:
                desc = self.tag_to_string(exc)
            self.log(title, '\n ', desc, '\n ', url)
            ans.append({'title': title, 'url': url, 'description': desc})
        return [('Articles', ans)]

    def preprocess_raw_html(self, raw, url):
        # Articles are rendered client-side; read the article data from the
        # Next.js __NEXT_DATA__ script tag instead of scraping the DOM.
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')[0].text
        data = json.loads(script)['props']['pageProps']['data']
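        # The payload shape is an undocumented Next.js blob; the fields read
        # below (headline, subhead, byline, primaryTag, image, fullCaption,
        # body) were observed on live pages and may change without notice.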

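        # Assemble minimal HTML fragments; the class names used below are
        # styled by the rules in extra_css.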
        title = f'<h1>{data["headline"]}</h1>'

        exp = auth = image = sec = ''

        sec = f'<div class="sec">{data["primaryTag"]["name"]}</div>'
        if data.get('subhead'):
            exp = f'<p class="subhead">{data["subhead"]}</p>'

        if data.get('byline'):
            auth = f'<p class="byline">{data["byline"]}</p>'

        if data.get('image'):
            img = data['image']
            # 'imageUrl' is an assumed key name for the image's URL; the
            # original expression was lost from this fragment, so verify it
            # against the live __NEXT_DATA__ payload.
            image = (
                f'<div class="article__image"><img src="{img["imageUrl"]}"></div>'
                f'<div class="article__caption">{data.get("fullCaption", "")}</div>'
            )

        return (
            '<html><body>'
            + sec + title + exp + image + auth + data['body']
            + '</body></html>'
        )

    def preprocess_html(self, soup):
        # Strip the styling attributes listed in remove_attributes.
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr: True}):
                del x[attr]
        # Request a fixed 600px-wide rendition from the site's image CDN.
        for img in soup.findAll('img', src=True):
            img['src'] = img['src'].split('?')[0] + '?width=600&source=nar-cms'
        return soup
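

# Usage sketch, assuming this file is saved as nikkei_asia.recipe and a recent
# calibre release (recipe_specific_options needs calibre 7.x or later; the flag
# name below follows recent calibre documentation, so check
# 'ebook-convert --help' on your version):
#
#   ebook-convert nikkei_asia.recipe .epub
#   ebook-convert nikkei_asia.recipe .epub --recipe-specific-option date:2024-09-19
#
# Without the 'date' option, the latest print edition is downloaded.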