mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 15:52:25 -04:00 
			
		
		
		
	Update Nikkei Asia Magazine
Google webcache no longer works.
This commit is contained in:
		
							parent
							
								
									7e709b4fc7
								
							
						
					
					
						commit
						cb1ecb5f9e
					
				| @ -1,4 +1,7 @@ | |||||||
| from calibre.web.feeds.news import BasicNewsRecipe, classes | #!/usr/bin/env pythona | ||||||
|  | import json | ||||||
|  | from html5_parser import parse | ||||||
|  | from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def absurl(url): | def absurl(url): | ||||||
| @ -6,16 +9,17 @@ def absurl(url): | |||||||
|         url = 'https://asia.nikkei.com' + url |         url = 'https://asia.nikkei.com' + url | ||||||
|     return url |     return url | ||||||
| 
 | 
 | ||||||
| class nikkei(BasicNewsRecipe): | 
 | ||||||
|     title = 'Nikkei Asia' | class Nikkei(BasicNewsRecipe): | ||||||
|  |     title = 'Nikkei Asia Magazine' | ||||||
|     __author__ = 'unkn0wn' |     __author__ = 'unkn0wn' | ||||||
|     language = 'en' |     language = 'en' | ||||||
|     no_stylesheets = True |     no_stylesheets = True | ||||||
|     description = ( |     description = ( | ||||||
|         'Japan, China, India and Southeast Asia news and expert analysis published by Nikkei' |         'The voice of the Asian century. Trusted independent journalism ' | ||||||
|         ', an award-winning independent provider of quality journalism.' |         'from Asia, the center of global growth.' | ||||||
|     ) |     ) | ||||||
|     masthead_url = 'https://www.global-nikkei.com/22ia/images/logo/Nikkei-Asia-Logo.svg' |     masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/2/2f/Nikkei_Asia_logo.svg' | ||||||
|     remove_attributes = ['style', 'height', 'width'] |     remove_attributes = ['style', 'height', 'width'] | ||||||
|     ignore_duplicate_articles = {'url'} |     ignore_duplicate_articles = {'url'} | ||||||
|     resolve_internal_links = True |     resolve_internal_links = True | ||||||
| @ -23,46 +27,94 @@ class nikkei(BasicNewsRecipe): | |||||||
|     encoding = 'utf-8' |     encoding = 'utf-8' | ||||||
|     use_embedded_content = False |     use_embedded_content = False | ||||||
| 
 | 
 | ||||||
|     extra_css = ''' |     extra_css = """ | ||||||
|         .article-header__sub-title { font-style:italic; color:#202020; } |         .subhead { font-style:italic; color:#202020; } | ||||||
|         .article-header__details, .article__details { font-size:small; font-weight:bold; } |         em, blockquote { color:#202020; } | ||||||
|         .timestamp { color:#5c5c5c; } |         .sec, .byline { font-size:small; font-weight:bold; } | ||||||
|         .article-header__topic { font-size:small; font-weight:bold; color:#5c5c5c; } |         .article__image, .article__caption { font-size:small; text-align:center; } | ||||||
|         .article__image, .article__caption { font-size:small; text-align:center; color:#202020; }     |     """ | ||||||
|     ''' |  | ||||||
| 
 | 
 | ||||||
|     keep_only_tags = [ |     recipe_specific_options = { | ||||||
|         classes('article-header__container article') |         'date': {'short': 'The edition date (YYYY-MM-DD format)', 'long': '2024-09-19'} | ||||||
|     ] |     } | ||||||
| 
 | 
 | ||||||
|     remove_tags = [ |     remove_tags = [dict(name='svg')] | ||||||
|         dict(name='svg'), |  | ||||||
|         classes('article__advert share__container no-print') |  | ||||||
|     ] |  | ||||||
| 
 | 
 | ||||||
|     def parse_index(self): |     def parse_index(self): | ||||||
|         archives = self.index_to_soup('https://asia.nikkei.com/Print-Edition/Archives') |         d = self.recipe_specific_options.get('date') | ||||||
|         card = archives.find(attrs={'class':'card-article__body'}) |         if d and isinstance(d, str): | ||||||
|         self.title = 'Nikkei Asia: ' + self.tag_to_string(card.h4).strip() |             url = 'https://asia.nikkei.com/Print-Edition/Issue-' + d | ||||||
|         self.description = self.tag_to_string(card.p) |         else: | ||||||
|         self.timefmt = ' [' + self.tag_to_string(card.span.time).strip() + ']' |             archives = self.index_to_soup( | ||||||
|         self.log('Downloading ', self.title, self.timefmt, self.description) |                 'https://asia.nikkei.com/Print-Edition/Archives' | ||||||
|  |             ) | ||||||
|  |             card = archives.find( | ||||||
|  |                 **prefixed_classes('MagazineIssueCardArchives_magazineIssueCardContent__') | ||||||
|  |             ) | ||||||
|  |             url = absurl(card.a['href']) | ||||||
| 
 | 
 | ||||||
|         soup = self.index_to_soup(absurl(card.h4.a['href'])) |         self.timefmt = f' [{url.split("Issue-")[-1]}]' | ||||||
|         self.cover_url = soup.find(**classes('print-edition__cover-image')).img['src'] |         self.title = 'Nikkei Asia' | ||||||
|  |         self.log(self.title, self.timefmt) | ||||||
|  |         soup = self.index_to_soup(url) | ||||||
|  |         self.cover_url = ( | ||||||
|  |             soup.find( | ||||||
|  |                 **prefixed_classes('MagazineIssueCard_magazineIssueCardCoverImage__') | ||||||
|  |             )['src'].split('?')[0] | ||||||
|  |             + '?width=600&source=nar-cms' | ||||||
|  |         ) | ||||||
| 
 | 
 | ||||||
|         ans = [] |         ans = [] | ||||||
| 
 | 
 | ||||||
|         for art in soup.findAll(**classes('card-article__body')): |         grid = soup.find(**prefixed_classes('MagazineArticles_magazineArticlesGrid__')) | ||||||
|             head = art.find(**classes('card-article__headline')) |         for a in grid.findAll( | ||||||
|             title = self.tag_to_string(head).strip() |             **prefixed_classes( | ||||||
|             url = absurl(head.a['href']) |                 'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardHeadline__ ' | ||||||
|  |                 'StreamArticleCard_streamArticleCardHeadline__' | ||||||
|  |             ) | ||||||
|  |         ): | ||||||
|  |             title = self.tag_to_string(a) | ||||||
|  |             url = absurl(a.a['href']) | ||||||
|             desc = '' |             desc = '' | ||||||
|             if exc := art.find(**classes('card-article__excerpt')): |             exc = a.findNext( | ||||||
|                 desc = self.tag_to_string(exc).strip() |                 **prefixed_classes( | ||||||
|             self.log( title, '\n   ', desc,  '\n        ', url ) |                     'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardSubheadWrapper__ ' | ||||||
|  |                     'StreamArticleCard_streamArticleCardSubhead__' | ||||||
|  |                 ) | ||||||
|  |             ) | ||||||
|  |             if exc: | ||||||
|  |                 desc = self.tag_to_string(exc) | ||||||
|  |             self.log(title, '\n   ', desc, '\n        ', url) | ||||||
|             ans.append({'title': title, 'url': url, 'description': desc}) |             ans.append({'title': title, 'url': url, 'description': desc}) | ||||||
|         return [('Articles', ans)] |         return [('Articles', ans)] | ||||||
| 
 | 
 | ||||||
|     def print_version(self, url): |     def preprocess_raw_html(self, raw, url): | ||||||
|         return 'https://webcache.googleusercontent.com/search?q=cache:' + url.split('?')[0] |         root = parse(raw) | ||||||
|  |         script = root.xpath('//script[@id="__NEXT_DATA__"]')[0].text | ||||||
|  |         data = json.loads(script)['props']['pageProps']['data'] | ||||||
|  |         title = f'<h1>{data["headline"]}</h1>' | ||||||
|  |         exp = auth = image = sec = '' | ||||||
|  |         sec = f'<div class="sec">{data["primaryTag"]["name"]}</div>' | ||||||
|  |         if data.get('subhead'): | ||||||
|  |             exp = f'<p class="subhead">{data["subhead"]}</p>' | ||||||
|  |         if data.get('byline'): | ||||||
|  |             auth = f'<p class="byline">{data["byline"]}</p>' | ||||||
|  |         if data.get('image'): | ||||||
|  |             img = data['image'] | ||||||
|  |             image = ( | ||||||
|  |                 f'<div><img src="{img["imageUrl"]}"><div class="article__caption">' | ||||||
|  |                 f'{data.get("fullCaption", "")}</div></div>' | ||||||
|  |             ) | ||||||
|  |         return ( | ||||||
|  |             '<html><body>' + sec + title | ||||||
|  |             + exp + image + auth + data['body'] | ||||||
|  |             + '</body></html>' | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def preprocess_html(self, soup): | ||||||
|  |         for attr in self.remove_attributes: | ||||||
|  |             for x in soup.findAll(attrs={attr: True}): | ||||||
|  |                 del x[attr] | ||||||
|  |         for img in soup.findAll('img', src=True): | ||||||
|  |             img['src'] = img['src'].split('?')[0] + '?width=600&source=nar-cms' | ||||||
|  |         return soup | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user