Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-06-23 15:30:45 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
commit a2be9e6981
@@ -1,4 +1,7 @@
-from calibre.web.feeds.news import BasicNewsRecipe, classes
+#!/usr/bin/env python
+import json
+
+from html5_parser import parse
+
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
 
 
 def absurl(url):
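Note on the import change above: the site now emits CSS-module class names with hashed suffixes (e.g. StreamArticleCard_streamArticleCardHeadline__<hash>), so exact-class lookups via classes() no longer match and the recipe switches to calibre's prefixed_classes() helper, which selects elements by the stable leading part of the class name. The snippet below is a minimal illustration of that prefix-matching idea in plain BeautifulSoup; it is not calibre's implementation, and the sample markup is invented.

# Sketch only: matching a hashed CSS-module class name by its prefix.
import re

from bs4 import BeautifulSoup

html = '<h2 class="StreamArticleCard_streamArticleCardHeadline__a1b2c">Example headline</h2>'
soup = BeautifulSoup(html, 'html.parser')

# Match any element whose class starts with the stable prefix, ignoring the hash.
tag = soup.find(class_=re.compile(r'^StreamArticleCard_streamArticleCardHeadline__'))
print(tag.get_text())  # Example headline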
@@ -6,16 +9,17 @@ def absurl(url):
         url = 'https://asia.nikkei.com' + url
     return url
 
 
-class nikkei(BasicNewsRecipe):
-    title = 'Nikkei Asia'
+class Nikkei(BasicNewsRecipe):
+    title = 'Nikkei Asia Magazine'
     __author__ = 'unkn0wn'
     language = 'en'
     no_stylesheets = True
     description = (
-        'Japan, China, India and Southeast Asia news and expert analysis published by Nikkei'
-        ', an award-winning independent provider of quality journalism.'
+        'The voice of the Asian century. Trusted independent journalism '
+        'from Asia, the center of global growth.'
     )
-    masthead_url = 'https://www.global-nikkei.com/22ia/images/logo/Nikkei-Asia-Logo.svg'
+    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/2/2f/Nikkei_Asia_logo.svg'
     remove_attributes = ['style', 'height', 'width']
     ignore_duplicate_articles = {'url'}
     resolve_internal_links = True
@@ -23,46 +27,94 @@ class nikkei(BasicNewsRecipe):
     encoding = 'utf-8'
     use_embedded_content = False
 
-    extra_css = '''
-        .article-header__sub-title { font-style:italic; color:#202020; }
-        .article-header__details, .article__details { font-size:small; font-weight:bold; }
-        .timestamp { color:#5c5c5c; }
-        .article-header__topic { font-size:small; font-weight:bold; color:#5c5c5c; }
-        .article__image, .article__caption { font-size:small; text-align:center; color:#202020; }
-    '''
+    extra_css = """
+        .subhead { font-style:italic; color:#202020; }
+        em, blockquote { color:#202020; }
+        .sec, .byline { font-size:small; font-weight:bold; }
+        .article__image, .article__caption { font-size:small; text-align:center; }
+    """
 
-    keep_only_tags = [
-        classes('article-header__container article')
-    ]
+    recipe_specific_options = {
+        'date': {'short': 'The edition date (YYYY-MM-DD format)', 'long': '2024-09-19'}
+    }
 
-    remove_tags = [
-        dict(name='svg'),
-        classes('article__advert share__container no-print')
-    ]
+    remove_tags = [dict(name='svg')]
 
     def parse_index(self):
-        archives = self.index_to_soup('https://asia.nikkei.com/Print-Edition/Archives')
-        card = archives.find(attrs={'class':'card-article__body'})
-        self.title = 'Nikkei Asia: ' + self.tag_to_string(card.h4).strip()
-        self.description = self.tag_to_string(card.p)
-        self.timefmt = ' [' + self.tag_to_string(card.span.time).strip() + ']'
-        self.log('Downloading ', self.title, self.timefmt, self.description)
-
-        soup = self.index_to_soup(absurl(card.h4.a['href']))
-        self.cover_url = soup.find(**classes('print-edition__cover-image')).img['src']
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            url = 'https://asia.nikkei.com/Print-Edition/Issue-' + d
+        else:
+            archives = self.index_to_soup(
+                'https://asia.nikkei.com/Print-Edition/Archives'
+            )
+            card = archives.find(
+                **prefixed_classes('MagazineIssueCardArchives_magazineIssueCardContent__')
+            )
+            url = absurl(card.a['href'])
+
+        self.timefmt = f' [{url.split("Issue-")[-1]}]'
+        self.title = 'Nikkei Asia'
+        self.log(self.title, self.timefmt)
+        soup = self.index_to_soup(url)
+        self.cover_url = (
+            soup.find(
+                **prefixed_classes('MagazineIssueCard_magazineIssueCardCoverImage__')
+            )['src'].split('?')[0]
+            + '?width=600&source=nar-cms'
+        )
 
         ans = []
 
-        for art in soup.findAll(**classes('card-article__body')):
-            head = art.find(**classes('card-article__headline'))
-            title = self.tag_to_string(head).strip()
-            url = absurl(head.a['href'])
+        grid = soup.find(**prefixed_classes('MagazineArticles_magazineArticlesGrid__'))
+        for a in grid.findAll(
+            **prefixed_classes(
+                'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardHeadline__ '
+                'StreamArticleCard_streamArticleCardHeadline__'
+            )
+        ):
+            title = self.tag_to_string(a)
+            url = absurl(a.a['href'])
             desc = ''
-            if exc := art.find(**classes('card-article__excerpt')):
-                desc = self.tag_to_string(exc).strip()
-            self.log( title, '\n ', desc, '\n ', url )
+            exc = a.findNext(
+                **prefixed_classes(
+                    'MagazineArticlesSpotlightCard_magazineArticlesSpotlightCardSubheadWrapper__ '
+                    'StreamArticleCard_streamArticleCardSubhead__'
+                )
+            )
+            if exc:
+                desc = self.tag_to_string(exc)
+            self.log(title, '\n ', desc, '\n ', url)
             ans.append({'title': title, 'url': url, 'description': desc})
         return [('Articles', ans)]
 
-    def print_version(self, url):
-        return 'https://webcache.googleusercontent.com/search?q=cache:' + url.split('?')[0]
+    def preprocess_raw_html(self, raw, url):
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')[0].text
+        data = json.loads(script)['props']['pageProps']['data']
+        title = f'<h1>{data["headline"]}</h1>'
+        exp = auth = image = sec = ''
+        sec = f'<div class="sec">{data["primaryTag"]["name"]}</div>'
+        if data.get('subhead'):
+            exp = f'<p class="subhead">{data["subhead"]}</p>'
+        if data.get('byline'):
+            auth = f'<p class="byline">{data["byline"]}</p>'
+        if data.get('image'):
+            img = data['image']
+            image = (
+                f'<div><img src="{img["imageUrl"]}"><div class="article__caption">'
+                f'{data.get("fullCaption", "")}</div></div>'
+            )
+        return (
+            '<html><body>' + sec + title
+            + exp + image + auth + data['body']
+            + '</body></html>'
+        )
+
+    def preprocess_html(self, soup):
+        for attr in self.remove_attributes:
+            for x in soup.findAll(attrs={attr: True}):
+                del x[attr]
+        for img in soup.findAll('img', src=True):
+            img['src'] = img['src'].split('?')[0] + '?width=600&source=nar-cms'
+        return soup
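The replacement for print_version() relies on the article pages being a Next.js app that embeds the full article payload as JSON in a <script id="__NEXT_DATA__"> tag, which preprocess_raw_html() parses and re-renders as plain HTML. Below is a standalone sketch of that extraction; the URL is a placeholder and the props.pageProps.data layout is assumed to match what the recipe reads.

# Sketch of the __NEXT_DATA__ extraction used by preprocess_raw_html().
import json
from urllib.request import urlopen

from html5_parser import parse

url = 'https://asia.nikkei.com/Business/some-article'  # hypothetical article URL
raw = urlopen(url).read()
root = parse(raw)
# Next.js stores the page state as JSON inside this script tag.
script = root.xpath('//script[@id="__NEXT_DATA__"]')[0].text
data = json.loads(script)['props']['pageProps']['data']
print(data['headline'])
print(data.get('byline', ''))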